figures.Rmd

---
title: "Figures"
author: "Jesse Connell"
date: "2018/05/15"
output:
  html_document:
    css: "figures.css"
    toc: yes
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, fig.width = 9, fig.height = 7, dev="svg")
```

The code here will use the analysis output files to create some of the figures
for the paper, for those figures derived from our software.  Figures created
manually are noted in the comments.  The final published figures include
post-processing for color-coding, fonts, etc., but should show the same data as
given here.  Separate files for figures and table are saved as well where
possible.

```{r load, message=FALSE}
## Load data and set up some helper functions
devtools::load_all("chiimp", quiet = TRUE)
library(msa)

dp_root <- "."
dp_results <- file.path(dp_root, "results")
# Round 24, the Miseq-vs-Sanger comparison
gm24 <- readRDS(file.path(dp_results, "gombe-round24/results.rds"))
# The second blinded test, first replicate, including a version without
# sequence-based filtering
gmblind <- readRDS(file.path(dp_results,
                      "gombe-blinded-test-2/results.rds"))
gmblindsimple <- readRDS(file.path(dp_results,
                      "gombe-blinded-test-2-simple/results.rds"))
gmblindfull <- readRDS(file.path(dp_results,
                      "gombe-blinded-test-2-full/results.rds"))

LOCI <- c("A", "B", "C", "D", "1", "2", "3", "4")
names(LOCI) <- LOCI

g_tbl_ce <- read.csv("metadata/gombe_ce_table.csv",
                     colClasses = "character",
                     check.names = F,
                     na.strings = "")
g_ce <- do.call(rbind, lapply(LOCI,
                              function(loc) {
  data.frame(Locus = loc,
             Name = g_tbl_ce$Name,
             Allele1Seq = g_tbl_ce[, paste(loc, 1, sep="_")],
             Allele2Seq = g_tbl_ce[, paste(loc, 2, sep="_")],
             stringsAsFactors = FALSE)
}))

align_rep1_simple <- align_alleles(gmblindsimple$summary)
align_rep1        <- align_alleles(gmblind$summary)

# Set up a PDF for a single image with some default settings.
figure_pdf <- function(fignum, width = 11, height = 8.5, ...) {
  pdf(file.path(dp_results, paste0("figure", fignum, ".pdf")),
      title = paste("CHIIMP - Figure", fignum),
      width = width,
      height = height,
      ...)
}

# Save data in CSV format.
save_table <- function(data, fp_out) {
	write.csv(data, fp_out, row.names=FALSE)
}

# Not currently used.
pandoc_pdf <- function(data, fp_out, title) {
  fp <- tempfile()
  cat(data, file = fp)
  system2("pandoc", args = c("-V", "geometry:landscape",
                           "-T", paste("CHIIMP -", title),
                           "-s", fp,
                           "-o", fp_out))
}

# Wrapper around chiimp::plot_alignment to add some custom axes and labels
plot_aln_for_paper <- function(aln, ...) {
  plt <- plot_alignment(
    aln,
    labels = NULL,
    ylab = NULL,
    display = c(yAxis = FALSE),
    ...)
  title(ylab = "Sequence Count", line = 2)
  axis(2, tick = FALSE, padj = 1, cex.axis = 0.75,
       at = seq_along(plt$seq),
       labels = sapply(strsplit(names(plt$seqs), "_"), "[", 2))
}
```

# Main

## Figure 1

```{r Fig1}

figure_1 <- function() {
  # Panel a
  histogram(gm24$data[["2-4861-C"]],
            main = "Sanger-vs-MiSeq 24 - Histogram for Sample 4861 Locus C",
            sample.summary = gm24$summary["2-4861-C", ],
            locus.name = "C",
            cutoff_fraction = 0.05,
            xlim = c(125, 225))
  #Panel b
  invisible(plot_aln_for_paper(
    gm24$alignments[["1"]],
    main = "a) Sanger-vs-MiSeq 24 - Alignment for Locus 1"))
  #Panel c
  invisible(plot_aln_for_paper(
    gm24$alignments[["C"]],
    main = "b) Sanger-vs-MiSeq 24 - Alignment for Locus C"))
}

figure_pdf(1)
figure_1()
invisible(dev.off())
figure_1()
```

## Figure 2

```{r Fig2}
## homplasy example and inheritance.

# Panel a: alignment of the 234-bp alleles for Locus 3.
figure_2 <- function() {
  # This version is color-coded differently but shows the same sequence content.
  alleles <- subset(gm24$allele.names[grep("^234-.", gm24$allele.names$Name), ],
                    Locus == "3")
  seqs <- as.character(alleles$Seq)
  names(seqs) <- as.character(alleles$Name)
  alignment_segment <- msa(seqs, type = "dna", order = "input")
  # For some reason msaPrettyPrint seems to ignore a path given to the file
  # argument.  So we'll change directories back and forth to make it work.
  .wd <- getwd()
  setwd(dp_results)
  msaPrettyPrint(alignment_segment,
                 c(82, 162), 
                 showNumbering = c("none"),
                 showLogo = "none",
                 output = "pdf",
                 file = "figure2.pdf",
                 paperWidth = 10,
                 paperHeight = 3,
                 askForOverwrite = FALSE)
  setwd(.wd)
}

figure_2()
# Panel b
# Manually-created family tree using the allele names identified.
```

![](results/figure2.pdf)

## Figure 3

```{r Fig3, results='asis', fig.width=10, fig.height=4}

figure_3 <- function() {
  samps <- c("4821", "4807", "4566")

  # Panel a - c
  # example idents tables for gm24 samples 4821, 4807, 4566
  
  # In this example we allow a higher maximum distance so we can show an example 
  # for sample 4566: a number of individuals have similar distance scores but
  # they're all "far away" from the sample.
  ks <- lapply(samps, function(samp) {
    gm24_chunk <- gm24
    gm24_chunk$summary <- subset(gm24$summary, Sample == samp)
    dist_mat <- gm24$dist_mat_known[samp, , drop = FALSE]
    closest <- find_closest_matches(dist_mat, range = 2, maximum = 8)
    tbl <- report_idents(gm24_chunk, closest, allele.names = gm24$allele.names)
    tbl$Name[1] <- paste("Sample", samp)
    kable_idents(tbl, closest)
  })
  
  # Panel d
  # restricting the distance matrix heatmap to entries of interest as above
  yrange <- c("Bahati", "Mgani")
  idx <- match(yrange, colnames(gm24$dist_mat_known))
  chunk <- gm24$dist_mat_known[samps, idx[1]:idx[2]]
  
  f <- function () plot_dist_mat(chunk,
                     dist.display_thresh = gm24$config$report.dist_max,
                     main = "Sanger vs. MiSeq 24 - Distance Matrix Between Samples and Known Genotypes",
                     cellwidth = 9,
                     cellheight = 12)
  
  figure_pdf("3d")
  f()
  dev.off()
  
  for (k in ks)
    cat(k, "\n")
  f()
}

figure_3()

```

## Figure 4

```{r Fig4, fig.width=10, fig.height=12, dev="png"}
## Alignments for GME/Gombe/Both for loci B and D

# known genotypes for GME
g_gme <- load_genotypes("metadata/known_genotypes_gme.csv")
g_gme$Locus <- factor(g_gme$Locus)

# known genotypes for Gombe
g <- gm24$genotypes.known[, c("Name", "Locus", "Allele1Seq", "Allele2Seq")]
g$Locus <- factor(g$Locus)

# Combination of both groups
g_combo <- rbind(g, g_gme)

alignments <- align_alleles(g)
alignments_gme <- align_alleles(g_gme)
alignments_combo <- align_alleles(g_combo)

figure_4 <- function() {
  plot_aln_for_paper(alignments_gme[["B"]],   main = "GME Locus B")
  plot_aln_for_paper(alignments[["B"]],       main = "Gombe Locus B")
  plot_aln_for_paper(alignments_combo[["B"]], main = "Combined Locus B")
  plot_aln_for_paper(alignments_gme[["D"]],   main = "GME Locus D")
  plot_aln_for_paper(alignments[["D"]],       main = "Gombe Locus D")
  plot_aln_for_paper(alignments_combo[["D"]], main = "Combined Locus D")
  invisible()
}

figure_pdf(4, width = 12, height=12)
layout(matrix(c(rep(1:3, each=3, times=2),
                rep(4:6, each=3, times=2),
                rep(0, 9)), ncol=5))
figure_4()
invisible(dev.off())

layout(matrix(1:6, ncol = 2))
figure_4()
```

## Figure 5

*Figure 5 is a set of manually-created flowcharts.*

## Table 1

*Table 1 is a manually-created comparison of capillary electrophoresis and MiSeq based genotyping results.*

## Table 2

*Table 2 is a manually-created table of erroneous allele calls by capillary electrophoresis and MiSeq genotyping methods.*

## Table 3

```{r Table3, results="asis"}
# Table 3: Allele and heterozygosity information between CE (capillary 
# electrophoresis) and MiSeq genotypes.  Info for cryptic alleles was handled
# manually so is not shown here.

# Expected heterozygote frequency (for any combo "Aa") is just 100% minus all
# the possible homozygous situations.  And the expected frequency of AA will be
# the square of the frequency for A since we're assuming an independent mixing
# of possible alleles.  Since each homozygous pair is a completely separate
# outcome we can sum those together to get the total for any homozygous
# situation and then do the subtraction.
# values: frequency of each allele
heterozygosity_exp <- function(values) {
  cts <- table(unlist(values))
  freq <- cts / sum(cts)
  1 - sum(freq^2)
}

# expected heterozygosity fraction per locus
hz_exp_loci <- function(g) {
  sapply(split(g, g$Locus), function(chunk) {
    a <- chunk[, c("Allele1Seq", "Allele2Seq")]
    a$Allele2Seq[is.na(a$Allele2Seq)] <- a$Allele1Seq[is.na(a$Allele2Seq)]
    heterozygosity_exp(a)
  })
}

# number of distinct alleles per locus
num_alleles <- function(g) {
  sapply(split(g, g$Locus), function(chunk) {
    length(table(unlist(chunk[, c("Allele1Seq", "Allele2Seq")])))
  })
}

g <- gm24$genotypes.known
g$Locus <- factor(g$Locus, levels = LOCI)
g_ce$Locus <- factor(g_ce$Locus, levels = levels(g$Locus))

num_a <- num_alleles(g)
num_a_ce <- num_alleles(g_ce)

# Exclude incomplete genotypes in CE list from both CE and MiSeq, for
# comparability
missings <- apply(g_tbl_ce, 1, function(x) sum(is.na(x)))
names(missings) <- g_tbl_ce$Name
g_ce <- subset(g_ce, Name %in% names(missings[missings==0]))
g <- subset(g, Name %in% names(missings[missings==0]))

hz_exp <- hz_exp_loci(g)
hz_exp_ce <- hz_exp_loci(g_ce)

table_3 <- function() {
  tbl3 <- data.frame(Locus = names(num_a),
                   NumAllelesCE = num_a_ce,
                   NumAllelesMiSeq = num_a,
                   HCE = hz_exp_ce,
                   HMiSeq = hz_exp,
                   check.names = FALSE,
                   stringsAsFactors = FALSE)
  tbl3 <- rbind(tbl3, list(Locus = "Total/Mean",
                         NumAllelesCE = sum(num_a_ce),
                         NumAllelesMiSeq = sum(num_a),
                         HCE = mean(hz_exp_ce),
                         HMiSeq = mean(hz_exp)))
  tbl3
}

tbl <- table_3()
tbl[, -1] <- round(tbl[, -1], digits = 2)
save_table(tbl, file.path(dp_results, "table3.csv"))
colnames(tbl) <- c("Locus", "CE", "MiSeq", "CE", "MiSeq")
k <- knitr::kable(tbl,
                  row.names = F,
                  format = "html")
k <- kableExtra::kable_styling(k, full_width = FALSE)
kableExtra::add_header_above(k,
                             c(" " = 1,
                               "Number of Alleles" = 2,
                               "Heterozygosity" = 2))
```

`r sum(missings>0)` individuals out of `r nrow(g_tbl_ce)` excluded due to
missing CE data: `r names(missings[missings>0])`

## Table 4

*Table 4 shows the "cryptic" alleles identified.*

## Table 5

*Table 5 is a comparsion of singleplex and multiplex amplification.*

# Supplemental

## Figure S1

Figure S1 is an example report.  See `results/gombe-round24/report.html` and the
images in `results/gombe-round24/histograms`.

## Table S1

*Table S1 is a summary of the loci used.*

## Table S2

```{r TableS2}
table_S2 <- function() {
  g <- gm24$genotypes.known
  g <- g[order(match(g$Locus, LOCI), na.last = NA), ]
  tbl.g <- summarize_genotypes_known(g)
  r <- report_genotypes(tbl.g, gm24$allele.names)
  idx <- match(r$Sample, gm24$genotypes.known$Name)
  # Append extra columns
  r$`Sample code` <- as.integer(gm24$genotypes.known$GMCode[idx])
  r$`Chimp ID` <- gm24$genotypes.known$Name[idx]
  # Order columns to put extra info first,
  # remove Sample column
  headings <- c("Sample code", "Chimp ID")
  cols <- match(headings, colnames(r))
  cols <- c(cols,
            which(! colnames(r) %in% headings))
  r <- r[, cols]
  r <- r[, -match("Sample", colnames(r))]
  # Order rows by GM code
  r <- r[order(r$`Sample code`), ]
  r
}

save_table(table_S2(), file.path(dp_results, "tableS2.csv"))
data <- table_S2()
k <- knitr::kable(data, row.names = FALSE, format = "html")
kableExtra::kable_styling(k)
```

## Table S3

*Table S3 is a comparsion of genotyping results between singleplex and multiplex approaches.*