-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfigures.Rmd
403 lines (330 loc) · 12.1 KB
/
figures.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
---
title: "Figures"
author: "Jesse Connell"
date: "2018/05/15"
output:
html_document:
css: "figures.css"
toc: yes
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, fig.width = 9, fig.height = 7, dev="svg")
```
The code here will use the analysis output files to create some of the figures
for the paper, for those figures derived from our software. Figures created
manually are noted in the comments. The final published figures include
post-processing for color-coding, fonts, etc., but should show the same data as
given here. Separate files for figures and table are saved as well where
possible.
```{r load, message=FALSE}
## Load data and set up some helper functions
devtools::load_all("chiimp", quiet = TRUE)
library(msa)
dp_root <- "."
dp_results <- file.path(dp_root, "results")
# Round 24, the Miseq-vs-Sanger comparison
gm24 <- readRDS(file.path(dp_results, "gombe-round24/results.rds"))
# The second blinded test, first replicate, including a version without
# sequence-based filtering
gmblind <- readRDS(file.path(dp_results,
"gombe-blinded-test-2/results.rds"))
gmblindsimple <- readRDS(file.path(dp_results,
"gombe-blinded-test-2-simple/results.rds"))
gmblindfull <- readRDS(file.path(dp_results,
"gombe-blinded-test-2-full/results.rds"))
LOCI <- c("A", "B", "C", "D", "1", "2", "3", "4")
names(LOCI) <- LOCI
g_tbl_ce <- read.csv("metadata/gombe_ce_table.csv",
colClasses = "character",
check.names = F,
na.strings = "")
g_ce <- do.call(rbind, lapply(LOCI,
function(loc) {
data.frame(Locus = loc,
Name = g_tbl_ce$Name,
Allele1Seq = g_tbl_ce[, paste(loc, 1, sep="_")],
Allele2Seq = g_tbl_ce[, paste(loc, 2, sep="_")],
stringsAsFactors = FALSE)
}))
align_rep1_simple <- align_alleles(gmblindsimple$summary)
align_rep1 <- align_alleles(gmblind$summary)
# Set up a PDF for a single image with some default settings.
figure_pdf <- function(fignum, width = 11, height = 8.5, ...) {
pdf(file.path(dp_results, paste0("figure", fignum, ".pdf")),
title = paste("CHIIMP - Figure", fignum),
width = width,
height = height,
...)
}
# Save data in CSV format.
save_table <- function(data, fp_out) {
write.csv(data, fp_out, row.names=FALSE)
}
# Not currently used.
pandoc_pdf <- function(data, fp_out, title) {
fp <- tempfile()
cat(data, file = fp)
system2("pandoc", args = c("-V", "geometry:landscape",
"-T", paste("CHIIMP -", title),
"-s", fp,
"-o", fp_out))
}
# Wrapper around chiimp::plot_alignment to add some custom axes and labels
plot_aln_for_paper <- function(aln, ...) {
plt <- plot_alignment(
aln,
labels = NULL,
ylab = NULL,
display = c(yAxis = FALSE),
...)
title(ylab = "Sequence Count", line = 2)
axis(2, tick = FALSE, padj = 1, cex.axis = 0.75,
at = seq_along(plt$seq),
labels = sapply(strsplit(names(plt$seqs), "_"), "[", 2))
}
```
# Main
## Figure 1
```{r Fig1}
figure_1 <- function() {
# Panel a
histogram(gm24$data[["2-4861-C"]],
main = "Sanger-vs-MiSeq 24 - Histogram for Sample 4861 Locus C",
sample.summary = gm24$summary["2-4861-C", ],
locus.name = "C",
cutoff_fraction = 0.05,
xlim = c(125, 225))
#Panel b
invisible(plot_aln_for_paper(
gm24$alignments[["1"]],
main = "a) Sanger-vs-MiSeq 24 - Alignment for Locus 1"))
#Panel c
invisible(plot_aln_for_paper(
gm24$alignments[["C"]],
main = "b) Sanger-vs-MiSeq 24 - Alignment for Locus C"))
}
figure_pdf(1)
figure_1()
invisible(dev.off())
figure_1()
```
## Figure 2
```{r Fig2}
## homplasy example and inheritance.
# Panel a: alignment of the 234-bp alleles for Locus 3.
figure_2 <- function() {
# This version is color-coded differently but shows the same sequence content.
alleles <- subset(gm24$allele.names[grep("^234-.", gm24$allele.names$Name), ],
Locus == "3")
seqs <- as.character(alleles$Seq)
names(seqs) <- as.character(alleles$Name)
alignment_segment <- msa(seqs, type = "dna", order = "input")
# For some reason msaPrettyPrint seems to ignore a path given to the file
# argument. So we'll change directories back and forth to make it work.
.wd <- getwd()
setwd(dp_results)
msaPrettyPrint(alignment_segment,
c(82, 162),
showNumbering = c("none"),
showLogo = "none",
output = "pdf",
file = "figure2.pdf",
paperWidth = 10,
paperHeight = 3,
askForOverwrite = FALSE)
setwd(.wd)
}
figure_2()
# Panel b
# Manually-created family tree using the allele names identified.
```
![](results/figure2.pdf)
## Figure 3
```{r Fig3, results='asis', fig.width=10, fig.height=4}
figure_3 <- function() {
samps <- c("4821", "4807", "4566")
# Panel a - c
# example idents tables for gm24 samples 4821, 4807, 4566
# In this example we allow a higher maximum distance so we can show an example
# for sample 4566: a number of individuals have similar distance scores but
# they're all "far away" from the sample.
ks <- lapply(samps, function(samp) {
gm24_chunk <- gm24
gm24_chunk$summary <- subset(gm24$summary, Sample == samp)
dist_mat <- gm24$dist_mat_known[samp, , drop = FALSE]
closest <- find_closest_matches(dist_mat, range = 2, maximum = 8)
tbl <- report_idents(gm24_chunk, closest, allele.names = gm24$allele.names)
tbl$Name[1] <- paste("Sample", samp)
kable_idents(tbl, closest)
})
# Panel d
# restricting the distance matrix heatmap to entries of interest as above
yrange <- c("Bahati", "Mgani")
idx <- match(yrange, colnames(gm24$dist_mat_known))
chunk <- gm24$dist_mat_known[samps, idx[1]:idx[2]]
f <- function () plot_dist_mat(chunk,
dist.display_thresh = gm24$config$report.dist_max,
main = "Sanger vs. MiSeq 24 - Distance Matrix Between Samples and Known Genotypes",
cellwidth = 9,
cellheight = 12)
figure_pdf("3d")
f()
dev.off()
for (k in ks)
cat(k, "\n")
f()
}
figure_3()
```
## Figure 4
```{r Fig4, fig.width=10, fig.height=12, dev="png"}
## Alignments for GME/Gombe/Both for loci B and D
# known genotypes for GME
g_gme <- load_genotypes("metadata/known_genotypes_gme.csv")
g_gme$Locus <- factor(g_gme$Locus)
# known genotypes for Gombe
g <- gm24$genotypes.known[, c("Name", "Locus", "Allele1Seq", "Allele2Seq")]
g$Locus <- factor(g$Locus)
# Combination of both groups
g_combo <- rbind(g, g_gme)
alignments <- align_alleles(g)
alignments_gme <- align_alleles(g_gme)
alignments_combo <- align_alleles(g_combo)
figure_4 <- function() {
plot_aln_for_paper(alignments_gme[["B"]], main = "GME Locus B")
plot_aln_for_paper(alignments[["B"]], main = "Gombe Locus B")
plot_aln_for_paper(alignments_combo[["B"]], main = "Combined Locus B")
plot_aln_for_paper(alignments_gme[["D"]], main = "GME Locus D")
plot_aln_for_paper(alignments[["D"]], main = "Gombe Locus D")
plot_aln_for_paper(alignments_combo[["D"]], main = "Combined Locus D")
invisible()
}
figure_pdf(4, width = 12, height=12)
layout(matrix(c(rep(1:3, each=3, times=2),
rep(4:6, each=3, times=2),
rep(0, 9)), ncol=5))
figure_4()
invisible(dev.off())
layout(matrix(1:6, ncol = 2))
figure_4()
```
## Figure 5
*Figure 5 is a set of manually-created flowcharts.*
## Table 1
*Table 1 is a manually-created comparison of capillary electrophoresis and MiSeq based genotyping results.*
## Table 2
*Table 2 is a manually-created table of erroneous allele calls by capillary electrophoresis and MiSeq genotyping methods.*
## Table 3
```{r Table3, results="asis"}
# Table 3: Allele and heterozygosity information between CE (capillary
# electrophoresis) and MiSeq genotypes. Info for cryptic alleles was handled
# manually so is not shown here.
# Expected heterozygote frequency (for any combo "Aa") is just 100% minus all
# the possible homozygous situations. And the expected frequency of AA will be
# the square of the frequency for A since we're assuming an independent mixing
# of possible alleles. Since each homozygous pair is a completely separate
# outcome we can sum those together to get the total for any homozygous
# situation and then do the subtraction.
# values: frequency of each allele
heterozygosity_exp <- function(values) {
cts <- table(unlist(values))
freq <- cts / sum(cts)
1 - sum(freq^2)
}
# expected heterozygosity fraction per locus
hz_exp_loci <- function(g) {
sapply(split(g, g$Locus), function(chunk) {
a <- chunk[, c("Allele1Seq", "Allele2Seq")]
a$Allele2Seq[is.na(a$Allele2Seq)] <- a$Allele1Seq[is.na(a$Allele2Seq)]
heterozygosity_exp(a)
})
}
# number of distinct alleles per locus
num_alleles <- function(g) {
sapply(split(g, g$Locus), function(chunk) {
length(table(unlist(chunk[, c("Allele1Seq", "Allele2Seq")])))
})
}
g <- gm24$genotypes.known
g$Locus <- factor(g$Locus, levels = LOCI)
g_ce$Locus <- factor(g_ce$Locus, levels = levels(g$Locus))
num_a <- num_alleles(g)
num_a_ce <- num_alleles(g_ce)
# Exclude incomplete genotypes in CE list from both CE and MiSeq, for
# comparability
missings <- apply(g_tbl_ce, 1, function(x) sum(is.na(x)))
names(missings) <- g_tbl_ce$Name
g_ce <- subset(g_ce, Name %in% names(missings[missings==0]))
g <- subset(g, Name %in% names(missings[missings==0]))
hz_exp <- hz_exp_loci(g)
hz_exp_ce <- hz_exp_loci(g_ce)
table_3 <- function() {
tbl3 <- data.frame(Locus = names(num_a),
NumAllelesCE = num_a_ce,
NumAllelesMiSeq = num_a,
HCE = hz_exp_ce,
HMiSeq = hz_exp,
check.names = FALSE,
stringsAsFactors = FALSE)
tbl3 <- rbind(tbl3, list(Locus = "Total/Mean",
NumAllelesCE = sum(num_a_ce),
NumAllelesMiSeq = sum(num_a),
HCE = mean(hz_exp_ce),
HMiSeq = mean(hz_exp)))
tbl3
}
tbl <- table_3()
tbl[, -1] <- round(tbl[, -1], digits = 2)
save_table(tbl, file.path(dp_results, "table3.csv"))
colnames(tbl) <- c("Locus", "CE", "MiSeq", "CE", "MiSeq")
k <- knitr::kable(tbl,
row.names = F,
format = "html")
k <- kableExtra::kable_styling(k, full_width = FALSE)
kableExtra::add_header_above(k,
c(" " = 1,
"Number of Alleles" = 2,
"Heterozygosity" = 2))
```
`r sum(missings>0)` individuals out of `r nrow(g_tbl_ce)` excluded due to
missing CE data: `r names(missings[missings>0])`
## Table 4
*Table 4 shows the "cryptic" alleles identified.*
## Table 5
*Table 5 is a comparsion of singleplex and multiplex amplification.*
# Supplemental
## Figure S1
Figure S1 is an example report. See `results/gombe-round24/report.html` and the
images in `results/gombe-round24/histograms`.
## Table S1
*Table S1 is a summary of the loci used.*
## Table S2
```{r TableS2}
table_S2 <- function() {
g <- gm24$genotypes.known
g <- g[order(match(g$Locus, LOCI), na.last = NA), ]
tbl.g <- summarize_genotypes_known(g)
r <- report_genotypes(tbl.g, gm24$allele.names)
idx <- match(r$Sample, gm24$genotypes.known$Name)
# Append extra columns
r$`Sample code` <- as.integer(gm24$genotypes.known$GMCode[idx])
r$`Chimp ID` <- gm24$genotypes.known$Name[idx]
# Order columns to put extra info first,
# remove Sample column
headings <- c("Sample code", "Chimp ID")
cols <- match(headings, colnames(r))
cols <- c(cols,
which(! colnames(r) %in% headings))
r <- r[, cols]
r <- r[, -match("Sample", colnames(r))]
# Order rows by GM code
r <- r[order(r$`Sample code`), ]
r
}
save_table(table_S2(), file.path(dp_results, "tableS2.csv"))
data <- table_S2()
k <- knitr::kable(data, row.names = FALSE, format = "html")
kableExtra::kable_styling(k)
```
## Table S3
*Table S3 is a comparsion of genotyping results between singleplex and multiplex approaches.*