06-correlation.Rmd

---
title: "Correlation analysis"
output: html_document
---

```{r, load-data}
# load image and table
load('processed_data/aggregating.rda')
aggregated_dt <- fread('processed_data/aggregated_dt_filtered.csv.gz', stringsAsFactors=TRUE)
# remove manual_ids, i.e., filter IDs in keep_rows
aggregated_dt <- aggregated_dt[ID %in% keep_rows]
# check if there are still manual_ids in the dt
stopifnot(!any(manual_ids %in% aggregated_dt$ID))
aggregated_dt[, `Event Type`:=factor(`Event Type`, levels=to_analyze)]
```

```{r, generate-cor-dt}
# if cor_dt exists, read it, otherwise produce it:
corr_file <- 'processed_data/correlation.csv.gz'
if (file.exists(corr_file)) {
  all_cor <- fread(corr_file, stringsAsFactors = TRUE)
} else {
  # try multiple correlation methods:
  cor_dt <-
    rbindlist(pbmclapply(setNames(cor_methods, cor_methods), function(cor_method)
      # correlate all the features with PSI, first for Low and High Variability Events separately
      rbind(aggregated_dt[, lapply(.SD, function(y) {
        # only use full pairs
        rmv <- is.na(y) | is.na(PSI)
        res <- NA
        # only compute correlation if there are enough events
        if ((length(rmv) - sum(rmv)) >= minimum_events)
          res <- suppressWarnings(stats::cor(PSI, y, use = 'pairwise.complete.obs', method = cor_method))
        return(as.numeric(res))
      }), by = .(`Event Type`, IHEC, Variability), .SDcols = names(aggregated_dt)[!names(aggregated_dt) %in% c('ID', 'PSI', 'IHEC', 'Event Type', 'gene_id', 'Variability')]],
      # correlate all the features with PSI, now for all events
      aggregated_dt[, lapply(.SD, function(y) {
        rmv <- is.na(y) | is.na(PSI)
        res <- NA
        if ((length(rmv) - sum(rmv)) >= minimum_events)
          res <-
          suppressWarnings(stats::cor(PSI, y, use = 'pairwise.complete.obs', method = cor_method))
        return(as.numeric(res))
      }), by = .(`Event Type`, IHEC), .SDcols = names(aggregated_dt)[!names(aggregated_dt) %in% c('ID', 'PSI', 'IHEC', 'Event Type', 'gene_id', 'Variability')]][, Variability := 'All'])),
      idcol = 'Correlation Coefficient')
  
  # set some information
  cor_dt[, `Correlation Coefficient` := as.factor(`Correlation Coefficient`)]
  cor_dt[, `Partial Correlation` := FALSE]
  
  # repeat but now with partial correlation
  pcor_dt <-
    rbindlist(pbmclapply(setNames(cor_methods, cor_methods), function(cor_method)
      rbind(aggregated_dt[, lapply(.SD, function(y) {
        rmv <- is.na(y) | is.na(PSI) | is.na(gene_expression)
        res <- NA
        if ((length(rmv) - sum(rmv)) >= minimum_events)
          res <- tryCatch(
            suppressWarnings(
              ppcor::pcor.test(PSI[!rmv], y[!rmv], gene_expression[!rmv], method = cor_method)$estimate
            ),
            error = function(e) {
              if (identical(y, gene_expression) &
                  startsWith(e$message, prefix = 'system is computationally singular'))
                return(suppressWarnings(
                  stats::cor(y, PSI, use = 'na.or.complete', method = cor_method)
                ))
              return(e)
            }
          )
        return(as.numeric(res))
      }), by = .(`Event Type`, IHEC, Variability), .SDcols = names(aggregated_dt)[!names(aggregated_dt) %in% c('ID', 'PSI', 'IHEC', 'Event Type', 'gene_id', 'Variability')]],
      aggregated_dt[, lapply(.SD, function(y) {
        rmv <- is.na(y) | is.na(PSI) | is.na(gene_expression)
        res <- NA
        if ((length(rmv) - sum(rmv)) >= minimum_events)
          res <- tryCatch(
            suppressWarnings(
              ppcor::pcor.test(PSI[!rmv], y[!rmv], gene_expression[!rmv], method = cor_method)$estimate
            ),
            error = function(e) {
              if (identical(y, gene_expression) &
                  startsWith(e$message, prefix = 'system is computationally singular'))
                return(suppressWarnings(
                  stats::cor(y, PSI, use = 'na.or.complete', method = cor_method)
                ))
              return(e)
            }
          )
        return(as.numeric(res))
      }), by = .(`Event Type`, IHEC), .SDcols = names(aggregated_dt)[!names(aggregated_dt) %in% c('ID', 'PSI', 'IHEC', 'Event Type', 'gene_id', 'Variability')]][, Variability := 'All'])),
      idcol = 'Correlation Coefficient')
  # set some information
  pcor_dt[, `Correlation Coefficient` := as.factor(`Correlation Coefficient`)]
  pcor_dt[, `Partial Correlation` := TRUE]
  
  # bind correlation with partial correlation
  all_cor <- rbind(cor_dt, pcor_dt)
  # set variability as factor
  all_cor[, Variability := factor(Variability,
                                  levels = c('Low Variability', 'All', 'High Variability'))]
  # write table
  fwrite(all_cor, corr_file)
}
```

```{r, nicer-names}
# this chunk just changes some names in the dt so the plots look nicer 
melt_cor_dt <- melt(all_cor, id.vars = c('IHEC', 'Event Type', 'Variability', 'Partial Correlation', 'Correlation Coefficient'), variable.name = 'feature', value.name = 'Correlation w/ PSI')
melt_cor_dt[, `Event Type`:=factor(`Event Type`, levels=c('SE', 'RI'))]
melt_cor_dt[`Event Type` == 'SE', feature := gsub('other_region', 'Intron', feature, fixed=TRUE)]
melt_cor_dt[`Event Type` == 'RI', feature := gsub('other_region', 'Exon', feature, fixed=TRUE)]
melt_cor_dt[`Event Type` == 'SE', feature := gsub('event_name', 'SE', feature, fixed=TRUE)]
melt_cor_dt[`Event Type` == 'RI', feature := gsub('event_name', 'RI', feature, fixed=TRUE)]
melt_cor_dt[, feature:=sub(';', '\n', feature, fixed = TRUE)]
melt_cor_dt[, feature:=sub(',', '\n', feature, fixed = TRUE)]
melt_cor_dt[, feature:=gsub('_', ' ', feature, fixed = TRUE)]
melt_cor_dt[, feature:=sub('distance ', 'Distance\n', feature, fixed = TRUE)]
melt_cor_dt[, feature:=sub('upstream Intron', 'Intron\nUpstream', feature, fixed = TRUE)]
melt_cor_dt[, feature:=sub('upstream Exon', 'Exon\nUpstream', feature, fixed = TRUE)]
melt_cor_dt[, feature:=sub('downstream Intron', 'Intron\nDownstream', feature, fixed = TRUE)]
melt_cor_dt[, feature:=sub('downstream Exon', 'Exon\nDownstream', feature, fixed = TRUE)]
melt_cor_dt[, feature:=sub('gene expression', 'Gene\nExpression', feature, fixed = TRUE)]
melt_cor_dt[, feature:=sub('width', 'Width', feature, fixed = TRUE)]
aggregate_median_by <- c('feature', 'Event Type', 'Variability', 'Partial Correlation', 'Correlation Coefficient')
melt_cor_dt[, median_cor:=median(`Correlation w/ PSI`, na.rm = TRUE), by = aggregate_median_by]
melt_cor_dt[, mean_cor:=mean(`Correlation w/ PSI`, na.rm = TRUE), by = aggregate_median_by]
melt_cor_dt[, sd_cor:=sd(`Correlation w/ PSI`, na.rm = TRUE), by = aggregate_median_by]
melt_cor_dt[, relevant:=any(abs(median_cor) > .1), by = aggregate_median_by]
melt_cor_dt[, Variability:=sub(' Variability', '', Variability, fixed = TRUE)]
melt_cor_dt[, Variability:=factor(Variability, levels = c('Low', 'All', 'High'))]
```

```{r, top10-overall}
top_n <- 10
# get the top n correlations with PSI from the feature vector
tmp_cor_dt <- melt_cor_dt[`Variability` == 'All' & `Correlation Coefficient` == 'pearson' & `Partial Correlation` == 'TRUE']
tmp_cor_dt[order(-abs(median_cor)), .(feature=unique(feature)[1:top_n]), by=.(`Event Type`)]
# annotate epigenetic vs intrinsic features
tmp_cor_dt[, 'Feature Type' := ifelse(Reduce(`|`, lapply(c(histone_marks, 'DNAm'), function(x) startsWith(feature, x))), 'Epigenetic', 'Intrinsic')]
tmp_cor_dt[feature %in% c('max promoter', 'summed enhancer'), `Feature Type` := 'Epigenetic']
plot_dt <- merge(tmp_cor_dt, tmp_cor_dt[order(-abs(median_cor)), .(feature=unique(feature)[1:top_n]), by=.(`Event Type`)])
# Create a vector of colors for ggplot, defaulting to dark grey
plot_colors <- setNames(rep("dark grey", plot_dt[, uniqueN(feature)]), plot_dt[, unique(feature)])
for(name in names(mark_hex_colors)) {
  inds <- startsWith(names(plot_colors), name)
  plot_colors[inds] <- mark_hex_colors[name]
}
# plot
ggplot(plot_dt, 
       aes(y = `Correlation w/ PSI`, 
           x =  tidytext::reorder_within(feature, -median_cor, `Event Type`),
       color = feature)) + #`Feature Type`)) + 
    geom_hline(yintercept = 0, color = "light grey", linetype = "dashed") + 
    geom_boxplot() + 
    labs(x = NULL, #'Aggregated Signal or Statistic', 
         title = NULL#, paste('Top', top_n, 'Globally Correlated Features')
         ) +
    tidytext::scale_x_reordered() + 
    facet_wrap(. ~ `Event Type`, scales="free", nrow=2) + 
    scale_color_manual(values = plot_colors) +
    theme_bw(base_size=12) + theme(legend.position = 'none', strip.background = element_rect(fill = 'white'))
ggsave(filename = file.path(plot_dir, paste0('top', top_n, '_overall.png')), width = 10, height = 5)
ggsave(filename = file.path(plot_dir, paste0('top', top_n, '_overall.svg')), width = 10, height = 5)
```


```{r, comparison-plots}
# make plots comparing different measures and variability
for (color_by in c('Correlation Coefficient', 'Partial Correlation', 'Variability')) {
  # aggregate the medians by the relevant columns
  find_relevant_by <- aggregate_median_by[aggregate_median_by != color_by]
  # remove column from earlier iteration
  melt_cor_dt[, plot_relevant:=NULL]
  # set which columns are relevant
  melt_cor_dt[, plot_relevant:=any(relevant), by = mget(find_relevant_by)]
  # get the subset of the table that is relevant depending on what to compare
  not_color_by <- TRUE
  if (color_by == 'Partial Correlation') {
    not_color_by <- melt_cor_dt[, `Correlation Coefficient` == 'pearson' & `Variability` == 'All']
  } else if (color_by == 'Correlation Coefficient') {
    not_color_by <- melt_cor_dt[, `Partial Correlation` == 'TRUE'  & `Variability` == 'All']
  } else if (color_by == 'Variability') {
    not_color_by <- melt_cor_dt[, `Partial Correlation` == 'TRUE' & `Correlation Coefficient` == 'pearson']
  }
  # get the number of levels we need to color_by
  color_levels <- melt_cor_dt[not_color_by, uniqueN(get(color_by))]
  tmp_cor_dt <- melt_cor_dt[not_color_by & !is.na(`Correlation w/ PSI`), if(uniqueN(get(color_by)) == color_levels) .SD, by=.(`Event Type`, feature)]
  if (color_by == 'Variability') {
    # for variablity make another plot sorted by the greatest differences
    median_dt <- dcast(tmp_cor_dt[, .(median_cor=unique(median_cor)), by=.(`Event Type`, feature, `Variability`)], `Event Type` + feature ~ `Variability`, value.var = 'median_cor')
    median_dt[, diff_col:=abs(`Low`-`High`)]
    sorted_idx <- median_dt[, order(-diff_col)]
    p <- ggplot(merge(tmp_cor_dt, median_dt[sorted_idx, .(feature=feature[1:top_n], diff_col=diff_col[1:top_n], all_median=All[1:top_n]), by=.(`Event Type`)]), 
       aes(y = `Correlation w/ PSI`, 
           x =  tidytext::reorder_within(feature, -diff_col, `Event Type`), 
           fill = get(color_by))) + 
    theme_bw() + geom_hline(yintercept = 0, color = "darkgrey", linetype = "dashed") + 
    geom_boxplot() + 
    labs(x = 'Aggregated Signal or Statistic') +
    tidytext::scale_x_reordered() + 
    facet_wrap(. ~ `Event Type`, scales="free", nrow=2) + 
    theme(legend.position = 'bottom') +
    scale_fill_manual(values = variability_colors[1:color_levels], name = color_by)
    ggsave(filename = file.path(plot_dir, paste0('max diff ', color_by, '.pdf')), p, width = 10, height = 7) 
    print(p)
  }
  # always make one plot sorted by the median
  p <- ggplot(merge(tmp_cor_dt, tmp_cor_dt[order(-abs(median_cor)), .(feature=unique(feature)[1:top_n]), by=.(`Event Type`)]), 
               aes(y = `Correlation w/ PSI`, 
                   x =  tidytext::reorder_within(feature, -median_cor, `Event Type`), 
                   color = get(color_by))) + 
          theme_bw() + geom_hline(yintercept = 0, color = "darkgrey", linetype = "dashed") + 
      geom_boxplot() + 
      labs(x = 'Aggregated Signal or Statistic') +
      tidytext::scale_x_reordered() + 
      facet_wrap(. ~ `Event Type`, scales="free", nrow=2) +
      theme(legend.position = 'bottom', strip.background = element_rect(fill = 'white')) +
      scale_color_manual(values = variability_colors, name = color_by)
  if (color_levels == 2){
    p <- p + scale_color_manual(values = unname(variability_colors[c(1,3)]), name = color_by)}
  
  if (color_by == "Variability")
  ggsave(filename = file.path(plot_dir, paste0('top', top_n,  color_by, '.pdf')), p, width = 7, height = 4.5)
  print(p)
}
```

```{r, cor-heatmap}
# make heatmap with the correlation per feature
heatmap_dt <- melt_cor_dt[`Partial Correlation` == TRUE & `Correlation Coefficient` == 'pearson' & `Variability` == 'All' & grepl(paste(c(histone_marks, "DNAm"), collapse = "|"), x = feature)]
heatmap_dt[, mark:=tstrsplit(x = feature, split = "\n", fixed = TRUE, keep = 1)]
heatmap_dt[, region:=sub(pattern = ".*?\n", replacement = "", x = feature)]
ggplot(heatmap_dt, aes(x = factor(region, levels = c("Intron\nUpstream", "SE", "Intron\nDownstream", "Exon\nUpstream", "RI", "Exon\nDownstream")), 
                                                                                                                              y = factor(mark, levels = rev(c("H3K9me3", "H3K27me3", "H3K4me3", "H3K4me1", "H3K27ac", "H3K36me3", "DNAm"))), 
                                                                                                                              fill = mean_cor)) + 
    geom_tile() + 
    ggh4x::facet_grid2( ~ `Event Type`, scales = 'free') +
    theme_bw() + 
    theme(strip.background = element_rect(fill = 'white'), legend.position = "bottom") + 
    scale_fill_gradient2(
  low = scales::muted("blue"),
  mid = "white",
  high = scales::muted("red")) + 
  # add text into the tile with the value of the mean and sd_cor in parantheses
  geom_text(aes(label = paste0(round(mean_cor, 2), " (\u00B1", round(sd_cor, 2), ")")), size=3) +
  
    labs(y = 'Mark', x = 'Region', fill = 'Partial Correlation')
ggsave(filename = file.path(plot_dir, "epigenetic_cor.pdf"), width = 7, height = 4.5)
```