Skip to content

Latest commit

 

History

History
125 lines (102 loc) · 3.31 KB

abyss2_bionano_arcs.hg004.bx.as100.nm5.bam.mi.bx.molecule.summary.md

File metadata and controls

125 lines (102 loc) · 3.31 KB

Summary statistics of a Chromium library

Shaun Jackman

Calculate Lx and Nx statistics, like L50 and N50

weighted_median_lx <- function(x, g = sum(as.numeric(x)), p = 0.5)
{
	tibble(x = x) %>% arrange(desc(x)) %>% mutate(i = seq_along(x), fraction = cumsum(as.numeric(x)) / g) %>% filter(fraction >= p) %$% i[[1]]
}

weighted_median_nx <- function(x, g = sum(as.numeric(x)), p = 0.5)
{
	tibble(x = x) %>% arrange(desc(x)) %$% x[[weighted_median_lx(x, g, p)]]
}

Read data

molecules_orig <- read_tsv(molecules_filename,
	col_types = cols(
	  Rname = col_character(),
	  Start = col_integer(),
	  End = col_integer(),
	  Size = col_integer(),
	  BX = col_character(),
	  MI = col_integer(),
	  Reads = col_integer(),
	  Mapq_median = col_integer(),
	  AS_median = col_integer(),
	  NM_median = col_integer()))

Filter

reads_threshold <- 4
as_median_threshold <- 100
nm_median_threshold <- 5
size_threshold <- 500

molecules <- molecules_orig %>%
	mutate(
		LogReadDensity = log10(Reads / ifelse(Size == 0, NA, Size)),
		Plastid = Rname == "KT634228") %>%
	filter(!is.na(BX),
		!Plastid,
		Reads >= reads_threshold,
		AS_median >= as_median_threshold,
		NM_median < nm_median_threshold,
		Size >= size_threshold)

Count molecules and reads per barcode

barcodes <- molecules %>%
	group_by(BX) %>%
	summarize(Molecules = n(), Reads = sum(Reads), Size = sum(Size)) %>%
	arrange(desc(Reads)) %>%
	ungroup()

GEM Performance

summary_table_barcodes <- tibble(
	"GEMs Detected" = nrow(barcodes),
	"N50 Linked-Reads per GEM" = weighted_median_nx(barcodes$Reads),
	"Median DNA per GEM" = median(barcodes$Size),
	"Mean DNA per GEM" = mean(barcodes$Size),
	"N50 DNA per GEM" = weighted_median_nx(barcodes$Size)
	) %>% gather("Metric", "Value")
summary_table_barcodes
Metric Value
GEMs Detected 1713802.0
N50 Linked-Reads per GEM 699.0
Median DNA per GEM 214716.0
Mean DNA per GEM 263369.9
N50 DNA per GEM 395612.0

Input DNA

summary_table_molecules <- tibble(
	"Molecules Detected" = nrow(molecules),
	"N50 Linked-Reads per Molecule" = weighted_median_nx(molecules$Reads),
	"Median Molecule Length" = median(molecules$Size),
	"Mean Molecule Length" = mean(molecules$Size),
	"N50 Molecule Length" = weighted_median_nx(molecules$Size)
	) %>% gather("Metric", "Value")
summary_table_molecules
Metric Value
Molecules Detected 11518885.00
N50 Linked-Reads per Molecule 138.00
Median Molecule Length 23525.00
Mean Molecule Length 39184.68
N50 Molecule Length 75138.00

Total DNA Mass

ggplot(molecules) +
	aes(x = Size, weight = Size) +
	geom_histogram(binwidth = 1000, boundary = 0) +
	scale_x_continuous(name = "Molecule size", labels = unit_format(unit = "kbp", scale = 1e-3)) +
	scale_y_continuous(name = "Total DNA mass (Mbp)", labels = unit_format(unit = "Mbp", scale = 1e-6))

Write summary table to a TSV file

rbind(summary_table_barcodes, summary_table_molecules) %>% write_tsv(output_tsv_filename)