-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapple_WGCNA.R
319 lines (271 loc) · 11.5 KB
/
apple_WGCNA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
require(devtools)
library(dplyr)
library(tidyverse)
library(magrittr)
library(WGCNA)
library(ggplot2)
library(tidyverse)
library(cowplot)
library(patchwork)
library(DESeq2)
library(igraph)
# using the cowplot theme for ggplot
theme_set(theme_cowplot())
# Create the data folder if it doesn't exist
if (!dir.exists("data")) {
dir.create("data")
}
# Define the file path to the plots directory
plots_dir <- "plots"
# Create the plots folder if it doesn't exist
if (!dir.exists(plots_dir)) {
dir.create(plots_dir)
}
# Define the file path to the results directory
results_dir <- "results"
# Create the results folder if it doesn't exist
if (!dir.exists(results_dir)) {
dir.create(results_dir)
}
# Define the file path to the data directory
# Replace with the path of the folder the files will be in
data_dir <- file.path("data", "apple")
# Declare the file path to the gene expression matrix file
# inside directory saved as `data_dir`
# Replace with the path to your dataset file
data_file <- file.path(data_dir, "salmon_merge_all.tsv")
data_file
# Declare the file path to the metadata file
# inside the directory saved as `data_dir`
# Replace with the path to your metadata file
metadata_file <- file.path(data_dir, "RNAappleinfo_2.tsv")
metadata_file
# Check if the gene expression matrix file is at the path stored in `data_file`
file.exists(data_file)
# Check if the metadata file is at the file path stored in `metadata_file`
file.exists(metadata_file)
# Read in metadata TSV file
metadata <- readr::read_tsv(metadata_file)
# Read in data TSV file
df <- readr::read_tsv(data_file) %>%
# Here we are going to store the gene IDs as row names so that we can have a numeric matrix to perform calculations on later
tibble::column_to_rownames("Name")
# Make the data in the order of the metadata
df <- df %>%
dplyr::select(metadata$Run)
# Check if this is in the same order
all.equal(colnames(df), metadata$Run)
# The next DESeq2 functions need the values to be converted to integers
df <- round(df) %>%
# The next steps require a data frame and round() returns a matrix
as.data.frame() %>%
# Only keep rows that have total counts above the cutoff
dplyr::filter(rowSums(.) >= 100)
# Create a `DESeqDataSet` object
dds <- DESeqDataSetFromMatrix(
countData = df, # Our prepped data frame with counts
colData = metadata, # Data frame with annotation for our samples
design = ~1 # Here we are not specifying a model
)
# Normalize and transform the data in the `DESeqDataSet` object using the `vst()`
# function from the `DESEq2` R package
dds_norm <- vst(dds)
#check for outliers here because it can affect WGCNA results.
#this example skip this step (there are no obvious outliers).
# Retrieve the normalized data from the `DESeqDataSet`
normalized_counts <- assay(dds_norm) %>%
t() # Transpose this data
sft <- pickSoftThreshold(normalized_counts,
dataIsExpr = TRUE,
corFnc = cor,
networkType = "signed"
)
sft_df <- data.frame(sft$fitIndices) %>%
dplyr::mutate(model_fit = -sign(slope) * SFT.R.sq)
ggplot(sft_df, aes(x = Power, y = model_fit, label = Power)) +
# Plot the points
geom_point() +
# We'll put the Power labels slightly above the data points
geom_text(nudge_y = 0.1) +
# We will plot what WGCNA recommends as an R^2 cutoff
geom_hline(yintercept = 0.80, col = "red") +
# Just in case our values are low, we want to make sure we can still see the 0.80 level
ylim(c(min(sft_df$model_fit), 1.05)) +
# We can add more sensible labels for our axis
xlab("Soft Threshold (power)") +
ylab("Scale Free Topology Model Fit, signed R^2") +
ggtitle("Scale independence") +
# This adds some nicer aesthetics to our plot
theme_classic()
##So using the plot above, going with a power soft-threshold of 7!
###A 4GB standard desktop or a laptop may handle up to 8000-10000 probes, depending on operating system and other running programs.
#Run WGCNA!
cor=WGCNA::cor
bwnet <- blockwiseModules(normalized_counts,
maxBlockSize = 20000, # What size chunks (how many genes) the calculations should be run in
TOMType = "signed", # topological overlap matrix
power = 12, # soft threshold for network construction
numericLabels = TRUE, # Let's use numbers instead of colors for module labels
randomSeed = 1234, # there's some randomness associated with this calculation
# so we should set a seed
)
#Write main WGCNA results object to file
readr::write_rds(bwnet,
file = file.path("results", "apple_wgcna_results_all.RDS")
)
#Explore our WGCNA results
module_eigengenes <- bwnet$MEs
# Print out a preview
head(module_eigengenes)
#Which modules have biggest differences across treatment groups?
all.equal(metadata$Run, rownames(module_eigengenes))
# Create the design matrix from the `tissue` variable
des_mat <- model.matrix(~ metadata$Tissue)
# lmFit() needs a transposed version of the matrix
fit <- limma::lmFit(t(module_eigengenes), design = des_mat)
# Apply empirical Bayes to smooth standard errors
fit <- limma::eBayes(fit)
# Apply multiple testing correction and obtain stats
stats_df <- limma::topTable(fit, number = ncol(module_eigengenes)) %>%
tibble::rownames_to_column("module")
head(stats_df)
#Let’s make plot of module 27
module_df <- module_eigengenes %>%
tibble::rownames_to_column("accession_code") %>%
# Here we are performing an inner join with a subset of metadata
dplyr::inner_join(metadata %>%
dplyr::select(Run, Tissue),
by = c("accession_code" = "Run")
)
#Now we are ready for plotting.
ggplot(
module_df,
aes(
x = Tissue,
y = ME27,
color = Tissue
)
) +
# a boxplot with outlier points hidden (they will be in the sina plot)
geom_boxplot(width = 0.2, outlier.shape = NA) +
# A sina plot to show all of the individual data points
ggforce::geom_sina(maxwidth = 0.3) +
theme_classic()+ theme(axis.text.x=element_text(color = "black", size=8, angle=90))
#What genes are a part of module 27?
gene_module_key <- tibble::enframe(bwnet$colors, name = "gene", value = "module") %>%
# Let's add the `ME` part so its more clear what these numbers are and it matches elsewhere
dplyr::mutate(module = paste0("ME", module))
gene_module_key %>%
dplyr::filter(module == "ME27")
readr::write_tsv(gene_module_key, file = file.path("results", "apple_wgcna_gene_allgenes_to_module.tsv")
)
#Let’s save this gene to module key to a TSV file for future use.
readr::write_tsv(gene_module_key %>%
dplyr::filter(module == "ME27"), file = file.path("results", "apple_M27_wgcna_gene_all_to_module.tsv")
)
#Make a custom heatmap function
make_module_heatmap <- function(module_name,
expression_mat = normalized_counts,
metadata_df = metadata,
gene_module_key_df = gene_module_key,
module_eigengenes_df = module_eigengenes) {
# Create a summary heatmap of a given module.
#
# Args:
# module_name: a character indicating what module should be plotted, e.g. "ME19"
# expression_mat: The full gene expression matrix. Default is `normalized_counts`.
# metadata_df: a data frame with refinebio_accession_code and time_point
# as columns. Default is `metadata`.
# gene_module_key: a data.frame indicating what genes are a part of what modules. Default is `gene_module_key`.
# module_eigengenes: a sample x eigengene data.frame with samples as row names. Default is `module_eigengenes`.
#
# Returns:
# A heatmap of expression matrix for a module's genes, with a barplot of the
# eigengene expression for that module.
# Set up the module eigengene with its refinebio_accession_code
module_eigengene <- module_eigengenes_df %>%
dplyr::select(all_of(module_name)) %>%
tibble::rownames_to_column("Run")
# Set up column annotation from metadata
col_annot_df <- metadata_df %>%
# Only select the treatment and sample ID columns
dplyr::select(Run, Tissue) %>%
# Add on the eigengene expression by joining with sample IDs
dplyr::inner_join(module_eigengene, by = "Run") %>%
# Arrange by patient and time point
dplyr::arrange(Tissue) %>%
# Store sample
tibble::column_to_rownames("Run")
# Create the ComplexHeatmap column annotation object
col_annot <- ComplexHeatmap::HeatmapAnnotation(
# Supply treatment labels
Tissue = col_annot_df$Tissue,
# Add annotation barplot
module_eigengene = ComplexHeatmap::anno_barplot(dplyr::select(col_annot_df, module_name))
# Pick colors for each experimental group in time_point
#col = list(Tissue = c("anthers" = "#f1a340", "apple_fruit" = "#998ec3","filaments" = "#408ef1","ovaries" = "#f14b40","petals" = "#40f13a","pollen" = "#e7f140","receptacles" = "#564ec3","sepals" = "#f15b40","apple_roots_infect"="#f15","apple_roots_mock"="#f99","bud_stage"="#408","flower" ="#500","fruit_skin"="#998", "leaf"="#100ec2", "leaf_N"="#990ec2", "leaf_phospho"="#99ec22","root"="#881","root_N"="#212","root_phospho"="#111ec2","stigmas"="#900" ))
)
# Get a vector of the Ensembl gene IDs that correspond to this module
module_genes <- gene_module_key_df %>%
dplyr::filter(module == module_name) %>%
dplyr::pull(gene)
# Set up the gene expression data frame
mod_mat <- expression_mat %>%
t() %>%
as.data.frame() %>%
# Only keep genes from this module
dplyr::filter(rownames(.) %in% module_genes) %>%
# Order the samples to match col_annot_df
dplyr::select(rownames(col_annot_df)) %>%
# Data needs to be a matrix
as.matrix()
# Normalize the gene expression values
mod_mat <- mod_mat %>%
# Scale can work on matrices, but it does it by column so we will need to
# transpose first
t() %>%
scale() %>%
# And now we need to transpose back
t()
# Create a color function based on standardized scale
color_func <- circlize::colorRamp2(
c(-2, 0, 2),
c("#67a9cf", "#f7f7f7", "#ef8a62")
)
# Plot on a heatmap
heatmap <- ComplexHeatmap::Heatmap(mod_mat,
name = module_name,
# Supply color function
col = color_func,
# Supply column annotation
bottom_annotation = col_annot,
# We don't want to cluster samples
cluster_columns = FALSE,
# We don't need to show sample or gene labels
show_row_names = FALSE,
show_column_names = FALSE
)
# Return heatmap
return(heatmap)
}
#Make module heatmaps
mod_27_heatmap <- make_module_heatmap(module_name = "ME27")
# Print out the plot
mod_27_heatmap
print(mod_27_heatmap)
#We can save this plot to PNG.
pdf(file.path("results", "apple_module_27_heatmap.pdf"))
mod_27_heatmap
dev.off()
LISTOFGENES <- (gene_module_key[gene_module_key$module == "ME27",1])
normalized_counts[,colnames(normalized_counts) %in% LISTOFGENES]
head(LISTOFGENES)
head(colnames(normalized_counts))
normalized_counts[,colnames(normalized_counts) %in% LISTOFGENES$gene]
SUBNET <- normalized_counts[,colnames(normalized_counts) %in% LISTOFGENES$gene]
CR <- cor((SUBNET))
head(CR)
hist(CR)
DF <- as.data.frame(as.table(CR))
M27_Correlations <- DF[DF$Freq > 0.7 | DF$Freq < -0.7,]
write.table(M27_Correlations,file="FM27corrleations",row.names=F)