Skip to content

Commit 070d30a

Browse files
authored
Merge pull request #14 from nevilledusaj/master
Add GEX Comparison Script
2 parents e9a9629 + 7b07ed4 commit 070d30a

File tree

3 files changed

+307
-1
lines changed

3 files changed

+307
-1
lines changed

Genotyping_GEX_Comparison.Rmd

Lines changed: 306 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,306 @@
1+
---
2+
title: "IronThrone Gene Expression Library Comparison"
3+
output: html_notebook
4+
---
5+
6+
This script uses BC/UMI pairs from the gene expression library to further refine genotyping calls from IronThrone. We begin by assigning file paths and character strings as options for the various files and parameters we will need.
7+
- `bc_loc` - File path for text file containing filtered barcodes from Cellranger output, can be found in `/outs/filtered_feature_bc_matrix`
8+
- `h5_file` - File path for gene expression `molecule_info.h5` file from Cellranger output
9+
- `got_df_loc` - File path for IronThrone output `.txt` file
10+
- `target_gene` - Character string for targeted gene of interest as it is named in 10X data
11+
- `output_dir` - Folder path for desired location for outputs from this script
12+
13+
```{r}
14+
bc_loc <- "~/GitHub/IronThrone_RMD_Files/barcodes.tsv"
15+
h5_file <- "~/GitHub/IronThrone_RMD_Files/molecule_info.h5"
16+
got_df_loc <- "~/GitHub/IronThrone_RMD_Files/myGoT.summTable.concat.umi_collapsed.txt"
17+
target_gene <- "DNMT3A"
18+
output_dir <- "~/Output"
19+
```
20+
21+
Next, we define a couple of functions that will be useful for converting UMI sequences to binary representation and vice versa. To save storage space, Cellranger converts UMI sequences into 2-bit representation, as described [here](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/molecule_info). These functions help us convert between the two representations as needed. By default, we assume 12bp UMI sequences, but that can be modified in the `umi_bin_to_seq` parameters.
22+
```{r}
23+
# Decode Cellranger's 2-bit-per-base UMI integers into nucleotide strings.
#
# umi_decimal: numeric/integer vector of encoded UMIs (one value per UMI).
# umi_len:     number of bases per UMI (default 12).
#
# Each base occupies one base-4 digit (0 = A, 1 = C, 2 = G, 3 = T), most
# significant digit first, so short values are left-padded with "A".
# Returns a character vector the same length as `umi_decimal`; entries are NA
# when the value is NA, negative, or too large to fit in `umi_len` bases.
# Vectorised, so a whole column can be converted in a single call.
umi_bin_to_seq <- function(umi_decimal, umi_len = 12) {
  bases <- c("A", "C", "G", "T")
  value <- as.numeric(umi_decimal)

  # Only values in [0, 4^umi_len) can be written with umi_len bases.
  valid <- !is.na(value) & value >= 0 & value < 4^umi_len & umi_len >= 1
  umi <- rep(NA_character_, length(value))

  if (any(valid)) {
    # Place values for each base position, most significant first.
    place <- 4^rev(seq_len(umi_len) - 1)
    digits <- outer(value[valid], place, function(v, p) (v %/% p) %% 4)
    letters_mat <- matrix(bases[digits + 1], nrow = sum(valid))
    umi[valid] <- apply(letters_mat, 1, paste0, collapse = "")
  }

  umi
}
37+
38+
39+
# Encode UMI sequences as the integers Cellranger stores (2 bits per base).
#
# umi_seq: character vector of UMI sequences made of A/C/G/T.
#
# A/C/G/T are read as the base-4 digits 0/1/2/3, which is the same number as
# the 00/01/10/11 bit string read in base 2. Returns an integer vector the same
# length as `umi_seq`; entries are NA for sequences containing any other
# character (e.g. "N"), for empty/NA input, and for values that do not fit in
# R's 32-bit integer. Each element is encoded on its own, so a whole column can
# be converted in a single call.
umi_seq_to_bin_decimal <- function(umi_seq) {
  umi_seq <- as.character(umi_seq)
  base4 <- chartr("ACGT", "0123", umi_seq)
  umi_decimal <- strtoi(base4, base = 4L)
  # Digits already present in the input would otherwise be parsed as bases.
  umi_decimal[!grepl("^[ACGT]+$", umi_seq)] <- NA_integer_
  umi_decimal
}
46+
```
47+
48+
We also load in additional requisite packages that will be used throughout this analysis.
49+
```{r, message=FALSE}
50+
library(parallel)
51+
library(tidyverse)
52+
library(rhdf5)
53+
library(stringdist)
54+
```
55+
56+
57+
First, we load barcodes that Cellranger has identified.
58+
``` {r}
59+
seurat_bcs <- scan(file = bc_loc, what = "character", quiet = TRUE)
60+
seurat_bcs <- gsub("-.*", "", seurat_bcs)
61+
```
62+
63+
64+
Next, we create a list containing the molecule information from cellranger as a database to which we can compare IronThrone results.
65+
```{r}
66+
gex_molecules <- list()
67+
gex_molecules[["barcode_idx"]] <- h5read(h5_file, name = "barcode_idx") + 1
68+
gex_molecules[["barcodes"]] <- h5read(h5_file, name = "barcodes")
69+
gex_molecules[["umi_counts"]] <- h5read(h5_file, name = "count")
70+
gex_molecules[["feature_idx"]] <- h5read(h5_file, name = "feature_idx") + 1
71+
gex_molecules[["feature_name"]] <- h5read(h5_file, name = "features/name")
72+
gex_molecules[["feature_id"]] <- h5read(h5_file, name = "features/id")
73+
gex_molecules[["umi"]] <- h5read(h5_file, name = "umi")
74+
```
75+
76+
We create a metadata data frame of cell barcodes found in gene expression data that will be useful for downstream analysis.
77+
```{r}
78+
md <- data.frame(BC = seurat_bcs)
79+
```
80+
81+
We then read in IronThrone results as a data frame, removing any rows where cell barcodes were ultimately filtered for not having sufficient supporting reads.
82+
```{r}
83+
got_df <- read.delim(got_df_loc, stringsAsFactors = FALSE)
84+
got_df <- got_df[got_df$UMI != "",]
85+
```
86+
87+
We can begin with the most naïve genotyping, using the complete IronThrone output without any comparison back to gene expression data. Here, any number of MUT UMIs results in a MUT call, and 0 MUT UMIs with at least one WT UMI is called as WT. These results will be stored in the column `unfilt.Genotype`.
88+
```{r}
89+
# Naive genotyping straight from the IronThrone table, with no GEX filtering.
# Rows are looked up with match() so every value lands on the md row carrying
# the same barcode. merge() re-sorts its result by BC, so copying its columns
# into md by position is only correct when md is already sorted the same way.
got_row <- match(md$BC, got_df$BC)

# got_df reordered to md's row order; barcodes absent from the IronThrone
# output become all-NA rows.
temp_metadata <- got_df[got_row, , drop = FALSE]
temp_metadata$BC <- md$BC

md$unfilt.WT.calls <- as.numeric(temp_metadata$WT.calls)
md$unfilt.MUT.calls <- as.numeric(temp_metadata$MUT.calls)
md$unfilt.Total.calls <- md$unfilt.WT.calls + md$unfilt.MUT.calls

# No IronThrone row -> "No Data"; any MUT UMI -> "MUT"; otherwise any WT UMI
# -> "WT"; neither WT nor MUT calls -> the string "NA".
md$unfilt.Genotype <- ifelse(is.na(md$unfilt.WT.calls), "No Data",
  ifelse(md$unfilt.MUT.calls > 0, "MUT",
    ifelse(md$unfilt.WT.calls >= 1, "WT", "NA")))
98+
```
99+
100+
To begin our comparisons to the GEX library, we split our IronThrone results into a per-UMI data frame.
101+
```{r}
102+
split_got_df <- data.frame(matrix(nrow = length(unlist(strsplit(got_df[,"UMI"],";"))), ncol = 0))
103+
for (i in colnames(got_df)){
104+
per_umi <- length(grep(";", got_df[,i])) > 0
105+
if (per_umi){
106+
split_got_df[,i] <- unlist(strsplit(got_df[,i],";"))
107+
} else {
108+
split_got_df[,i] <- rep(got_df[,i], times = got_df$WT.calls + got_df$MUT.calls + got_df$amb.calls)
109+
}
110+
}
111+
split_got_df$num.WT.in.dups <- as.numeric(split_got_df$num.WT.in.dups)
112+
split_got_df$num.MUT.in.dups <- as.numeric(split_got_df$num.MUT.in.dups)
113+
split_got_df$num.amb.in.dups <- as.numeric(split_got_df$num.amb.in.dups)
114+
split_got_df$WT.calls <- as.numeric(split_got_df$WT.calls)
115+
split_got_df$MUT.calls <- as.numeric(split_got_df$MUT.calls)
116+
split_got_df$amb.calls <- as.numeric(split_got_df$amb.calls)
117+
118+
split_got_df$BC_UMI <- paste0(split_got_df$BC, "_", split_got_df$UMI)
119+
split_got_df$total_dups <- split_got_df$num.WT.in.dups + split_got_df$num.MUT.in.dups + split_got_df$num.amb.in.dups
120+
split_got_df$total_dups_wt_mut <- split_got_df$num.WT.in.dups + split_got_df$num.MUT.in.dups
121+
split_got_df$UMI_bin <- unlist(mclapply(split_got_df$UMI, umi_seq_to_bin_decimal, mc.cores = detectCores()))
122+
split_got_df$BC_UMI_bin <- paste0(split_got_df$BC, "_", split_got_df$UMI_bin)
123+
```
124+
125+
Next, we create a new data frame of all the BC/UMI combinations found in our GEX library that correspond to our target gene of interest.
126+
```{r}
127+
# Collect every BC/UMI pair the GEX library assigns to the target gene.
target_gene_idx <- which(gex_molecules$feature_name == target_gene)
if (length(target_gene_idx) == 0) {
  # Without this check a misspelt gene silently produces an empty table.
  stop("target_gene '", target_gene, "' was not found among the GEX feature names",
       call. = FALSE)
}
target_gene_id <- gex_molecules$feature_id[target_gene_idx]
# %in% rather than == so that a gene name matching several features does not
# recycle target_gene_idx against the per-molecule feature index.
target_gene_entries <- which(gex_molecules$feature_idx %in% target_gene_idx)
df_10x <- data.frame("BC" = gex_molecules$barcodes[gex_molecules$barcode_idx[target_gene_entries]])

# Decode the stored 2-bit UMIs back into sequences.
umi_filt <- gex_molecules$umi[target_gene_entries]
df_10x$UMI <- unlist(mclapply(umi_filt, mc.cores = detectCores(), FUN = function(x) {
  umi_bin_to_seq(x, umi_len = 12)
}))
df_10x$counts <- gex_molecules$umi_counts[target_gene_entries]
df_10x$BC_UMI <- paste0(df_10x$BC, "_", df_10x$UMI)
138+
```
139+
140+
Using this information, we can see which BC/UMI pairs exactly or approximately (within 2 edits) match BC/UMI pairs for our target gene of interest in 10X GEX data.
141+
```{r}
142+
# Exact match: the BC_UMI string appears verbatim among the target-gene GEX
# molecules.
split_got_df$Exact_Match <- split_got_df$BC_UMI %in% df_10x$BC_UMI

# Approximate match: Levenshtein distance of at most 2 to any target-gene GEX
# BC_UMI. ain() is vectorised over its first argument, so it is called once on
# the whole column instead of once per element through mclapply(), which cannot
# use more than one core on Windows.
split_got_df$Approx_Match <- ain(split_got_df$BC_UMI, df_10x$BC_UMI, method = "lv", maxDist = 2)
147+
```
148+
149+
Now, we create a data frame of all BC/UMI pairs in our 10X GEX data, and filter it down to those BC/UMI pairs we find in our genotyping results.
150+
```{r}
151+
target_bc_idx <- which(gex_molecules$barcodes %in% got_df$BC)
152+
molecules <- gex_molecules$barcode_idx %in% target_bc_idx
153+
df_all_gene <- data.frame("BC_IDX" = gex_molecules$barcode_idx[molecules])
154+
df_all_gene$BC <- gex_molecules$barcodes[df_all_gene$BC_IDX]
155+
df_all_gene$UMI_bin <- gex_molecules$umi[molecules]
156+
df_all_gene$BC_UMI_bin <- paste0(df_all_gene$BC, "_", df_all_gene$UMI_bin)
157+
df_all_gene$gene_idx <- gex_molecules$feature_idx[molecules]
158+
df_all_gene$gene <- gex_molecules$feature_name[df_all_gene$gene_idx]
159+
df_all_gene$count <- gex_molecules$umi_counts[molecules]
160+
to_keep <- df_all_gene$BC_UMI_bin %in% (split_got_df %>% pull(BC_UMI_bin))
161+
162+
df_all_gene_to_keep <- df_all_gene[to_keep,]
163+
df_all_gene_to_keep$UMI <- unlist(mclapply(df_all_gene_to_keep$UMI_bin, mc.cores = detectCores(), FUN = function(x){
164+
umi_bin_to_seq(x, umi_len = 12)
165+
}))
166+
df_all_gene_to_keep$BC_UMI <- paste0(df_all_gene_to_keep$BC, "_", df_all_gene_to_keep$UMI)
167+
168+
df_all_gene_collapse <- df_all_gene_to_keep
169+
170+
#For more granular information, this step here parses the rare cases in which a single BC/UMI pair has been assigned to multiple genes in the molecule info file.
171+
for (k in unique(df_all_gene_collapse$BC_UMI)){
172+
target_rows <- which(df_all_gene_collapse$BC_UMI == k)
173+
if (length(target_rows) > 1){
174+
sub_df <- df_all_gene_collapse[target_rows,]
175+
if (target_gene %in% sub_df$gene){
176+
if(length(grep("_CITE", sub_df$gene)) > 0){
177+
sub_df$gene <- paste0("Multiple_", target_gene, "_CITE")
178+
} else{
179+
sub_df$gene <- paste0("Multiple_", target_gene)
180+
}
181+
} else {
182+
sub_df$gene <- "Multiple"
183+
}
184+
min_row <- min(target_rows)
185+
target_rows <- target_rows[target_rows != min_row]
186+
df_all_gene_collapse[min_row,] <- sub_df[1,]
187+
df_all_gene_collapse <- df_all_gene_collapse[-target_rows,]
188+
}
189+
}
190+
```
191+
192+
With all of this information combined, we can now assign BC/UMI pairs in IronThrone results to one of four categories.
193+
1. `Exact` - Exact match to a BC/UMI pair in 10X GEX data for the target gene of interest.
194+
2. `Approx` - Approximate match to a BC/UMI pair in 10X GEX data for the target gene of interest.
195+
3. `No Gene` - Does not match any BC/UMI pair in 10X GEX data.
196+
4. `Other Gene` - Found in 10X GEX data, but corresponding to a gene that is not the target gene of interest.
197+
```{r}
198+
split_got_df_gene <- (merge(split_got_df, df_all_gene_collapse[,c("BC_UMI", "gene")], by = "BC_UMI", all.x = TRUE, all.y = FALSE, sort = FALSE))
199+
200+
split_got_df_gene$In_GEX <- !is.na(split_got_df_gene$gene)
201+
202+
split_got_df_gene$Gene_Group <- ifelse(split_got_df_gene$Exact_Match,
203+
"Exact",
204+
ifelse(split_got_df_gene$Approx_Match,
205+
"Approx",
206+
ifelse(split_got_df_gene$In_GEX,
207+
"Other Gene",
208+
"No Gene")))
209+
```
210+
211+
We can now define a function to collapse our per-UMI data frame back into a per-barcode data frame after we filter out UMIs we no longer want to include.
212+
```{r}
213+
# Collapse the per-UMI rows of one barcode back into a single per-barcode row.
#
# BC:       the cell barcode to collapse.
# split_df: per-UMI data frame with at least the columns `BC` and
#           `call.in.dups` (values "WT", "MUT" or "AMB").
#
# Returns a 1-row character matrix with one column per column of `split_df`
# (plus any of WT.calls / MUT.calls / amb.calls not already present). Per-UMI
# values are joined with ";", `BC` is the barcode itself, and the three
# *.calls columns hold the number of UMIs with each call.
concatenate_got <- function(BC, split_df) {
  bc_rows <- split_df[which(split_df[["BC"]] == BC), , drop = FALSE]

  # Collapse column by column. apply() on a data frame goes through
  # as.matrix(), which format()s numeric columns to a common width and so
  # injects padding spaces into the joined values (e.g. " 1;10").
  collapsed <- vapply(
    bc_rows,
    function(column) paste0(as.character(column), collapse = ";"),
    character(1)
  )

  calls <- bc_rows[["call.in.dups"]]
  collapsed["BC"] <- BC
  collapsed["WT.calls"] <- sum(calls == "WT")
  collapsed["MUT.calls"] <- sum(calls == "MUT")
  collapsed["amb.calls"] <- sum(calls == "AMB")

  matrix(collapsed, nrow = 1, dimnames = list(NULL, names(collapsed)))
}
224+
```
225+
226+
First, we will return genotyping results by filtering out those UMIs that correspond to non-target genes in the 10X GEX data. These results will be stored in the column `filt.Genotype`.
227+
```{r}
228+
# Genotype after discarding UMIs that the GEX library assigns to another gene.
split_got_df_approx_no_gene <- split_got_df_gene %>% filter(Gene_Group != "Other Gene")
unique_bc_approx_no_gene <- unique(split_got_df_approx_no_gene$BC)

# Collapse the surviving UMIs back to one row per barcode. do.call(rbind, ...)
# binds all rows in one step instead of growing the result pairwise.
concat_got_df_approx_no_gene <- as.data.frame(
  do.call(rbind, mclapply(
    unique_bc_approx_no_gene,
    FUN = function(x) concatenate_got(BC = x, split_df = split_got_df_approx_no_gene),
    mc.cores = detectCores()
  )),
  stringsAsFactors = FALSE
)

# concatenate_got() returns character data; convert the counts before
# comparing so that the thresholds below are numeric, not string, comparisons.
concat_got_df_approx_no_gene$WT.calls <- as.numeric(concat_got_df_approx_no_gene$WT.calls)
concat_got_df_approx_no_gene$MUT.calls <- as.numeric(concat_got_df_approx_no_gene$MUT.calls)
concat_got_df_approx_no_gene$Genotype <- ifelse(is.na(concat_got_df_approx_no_gene$WT.calls), "No Data",
  ifelse(concat_got_df_approx_no_gene$MUT.calls > 0, "MUT",
    ifelse(concat_got_df_approx_no_gene$WT.calls >= 1, "WT", "NA")))

# Look results up by barcode so each value lands on the matching md row.
# merge() re-sorts by BC, so copying its columns into md by position is only
# correct when md is already sorted the same way.
filt_row <- match(md$BC, concat_got_df_approx_no_gene$BC)
temp_metadata <- concat_got_df_approx_no_gene[filt_row, c("BC", "Genotype", "WT.calls", "MUT.calls")]
temp_metadata$BC <- md$BC
rownames(temp_metadata) <- temp_metadata$BC
md$filt.Genotype <- temp_metadata$Genotype
md$WT.calls.filt <- as.numeric(temp_metadata$WT.calls)
md$MUT.calls.filt <- as.numeric(temp_metadata$MUT.calls)
md$Total.calls.filt <- md$WT.calls.filt + md$MUT.calls.filt
242+
```
243+
244+
Next, to further increase the accuracy of our genotyping results, we can apply a threshold to the number of supporting reads required for a UMI that matches no genes in the 10X GEX data (`No Gene`) to be included in our results. There are a couple ways of doing this.
245+
1. We can use the distribution of supporting reads for UMIs with matches to non-target genes (`Other Gene`) as a basis for determining which UMIs with no gene match (`No Gene`) should be discarded. As a default starting parameter for this approach, we use the 80th percentile of supporting reads for the `Other Gene` group.
246+
```{r}
247+
quant_thresh <- 0.8
248+
other_gene_counts <- split_got_df_gene %>% filter(Gene_Group == "Other Gene") %>% pull(total_dups_wt_mut)
249+
threshold <- quantile(other_gene_counts, probs = quant_thresh)
250+
```
251+
252+
2. We have noticed that the distribution of supporting reads for the `No Gene` UMI group tends to be bimodal, and have hypothesized that the upper peak of the distribution may include more true genotyping UMIs. We can thus find the local minimum between these peaks and use that as our threshold cutoff.
253+
```{r}
254+
no_gene_counts <- split_got_df_gene %>% filter(Gene_Group == "No Gene") %>% pull(total_dups_wt_mut)
# log10() of a zero or missing count is not finite and makes density() fail,
# so only positive counts enter the density estimate.
log_counts <- log10(no_gene_counts[!is.na(no_gene_counts) & no_gene_counts > 0])
d <- density(log_counts)

# Find the local minimum of the density between log10 counts 0 and 3 (1 to
# 1000 reads). approxfun() returns NA outside the range of d$x, which makes
# optimize() fail, so the search window is clipped to the range the density
# estimate actually covers.
search_window <- c(max(0, min(d$x)), min(3, max(d$x)))
threshold <- 10^(optimize(approxfun(d$x, d$y), interval = search_window)$minimum)
258+
```
259+
260+
To visualize how different thresholds will impact exclusion of `No Gene` UMIs, we can plot supporting read counts across all 4 of our UMI groups.
261+
```{r, message = FALSE}
262+
thresh_plot <- ggplot(split_got_df_gene, aes(y = log10(total_dups), x = Gene_Group, fill = Gene_Group))+ geom_violin(position = position_dodge(0.9), trim = FALSE) +
263+
geom_boxplot(width=0.1, position = position_dodge(0.9), alpha = 0.5) +
264+
geom_hline(yintercept = log10(threshold)) +
265+
theme_bw() +
266+
labs(y = "log10(Supporting Read Counts per UMI)", x = "Amplicon Match to GEX Library")
267+
thresh_plot
268+
269+
if(file.exists(paste0(output_dir, "/threshold_plot.pdf"))){
270+
print("Warning: File threshold_plot.pdf already exists in output directory")
271+
} else{
272+
ggsave(filename = paste0(output_dir, "/threshold_plot.pdf"), plot = thresh_plot, device = "pdf")
273+
}
274+
```
275+
276+
We can now use this threshold value to exclude `No Gene` UMIs with fewer supporting read counts, and create one final set of genotyping calls. These results will be stored in the column `thresh.filt.Genotype`.
277+
```{r}
278+
# Decide which UMIs survive: target-gene matches are always kept, UMIs assigned
# to another gene are always dropped, and UMIs with no GEX match are kept only
# when their WT + MUT supporting reads exceed the threshold.
split_got_df_gene$Keep <- ifelse(split_got_df_gene$Gene_Group %in% c("Exact", "Approx"), TRUE,
  ifelse(split_got_df_gene$Gene_Group == "Other Gene", FALSE,
    ifelse(split_got_df_gene$total_dups_wt_mut > threshold, TRUE, FALSE)))
split_got_df_gene_thresh <- split_got_df_gene

split_got_df_approx_no_gene_thresh <- split_got_df_gene_thresh %>% filter(Keep)
unique_bc_approx_no_gene_thresh <- unique(split_got_df_approx_no_gene_thresh$BC)

# Collapse the surviving UMIs back to one row per barcode. do.call(rbind, ...)
# binds all rows in one step instead of growing the result pairwise.
concat_got_df_approx_no_gene_thresh <- as.data.frame(
  do.call(rbind, mclapply(
    unique_bc_approx_no_gene_thresh,
    FUN = function(x) concatenate_got(BC = x, split_df = split_got_df_approx_no_gene_thresh),
    mc.cores = detectCores()
  )),
  stringsAsFactors = FALSE
)

# concatenate_got() returns character data; convert the counts before
# comparing so that the thresholds below are numeric, not string, comparisons.
concat_got_df_approx_no_gene_thresh$WT.calls <- as.numeric(concat_got_df_approx_no_gene_thresh$WT.calls)
concat_got_df_approx_no_gene_thresh$MUT.calls <- as.numeric(concat_got_df_approx_no_gene_thresh$MUT.calls)
concat_got_df_approx_no_gene_thresh$Genotype <- ifelse(is.na(concat_got_df_approx_no_gene_thresh$WT.calls), "No Data",
  ifelse(concat_got_df_approx_no_gene_thresh$MUT.calls > 0, "MUT",
    ifelse(concat_got_df_approx_no_gene_thresh$WT.calls >= 1, "WT", "NA")))

# Look results up by barcode so each value lands on the matching md row.
# merge() re-sorts by BC, so copying its columns into md by position (and
# labelling its rows with md$BC) is only correct when md is already sorted the
# same way.
thresh_row <- match(md$BC, concat_got_df_approx_no_gene_thresh$BC)
temp_metadata <- concat_got_df_approx_no_gene_thresh[thresh_row, c("BC", "Genotype", "WT.calls", "MUT.calls")]
temp_metadata$BC <- md$BC
rownames(temp_metadata) <- temp_metadata$BC
md$thresh.filt.Genotype <- temp_metadata$Genotype
md$WT.calls.thresh.filt <- as.numeric(temp_metadata$WT.calls)
md$MUT.calls.thresh.filt <- as.numeric(temp_metadata$MUT.calls)
md$Total.calls.thresh.filt <- md$WT.calls.thresh.filt + md$MUT.calls.thresh.filt
297+
```
298+
299+
Finally, we save our metadata file to our output folder for later integration with a single-cell object and downstream analysis.
300+
```{r}
301+
if(file.exists(paste0(output_dir, "/metadata.Rdata"))){
302+
print("Warning: File metadata.Rdata already exists in output directory")
303+
} else{
304+
save(md, file = paste0(output_dir, "/metadata.Rdata"))
305+
}
306+
```

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
## Notes/Bugs to be Ironed out in Future Versions
2929
- Amplicon fastqs should be concatenated, paired-end, into 1 `R1` and 1 `R2` file
3030
- fastq read lengths should be identical for all reads within a given R1 or R2 file (R1 and R2 read lengths do not need to be equivalent)
31-
- Barcode file can be a complete whitelist provided by 10X (see below) or a custom `.txt` file of barcodes from a corresponding 10X run
31+
- Barcode file can be a complete whitelist provided by 10X (see below) or a custom `.txt` file of barcodes from a corresponding 10X run. Note: If using the unzipped `barcodes.tsv` file from CellRanger's `filtered_feature_bc_matrix` folder, barcodes will often be appended with `-1`, which will result in an error. You can run `sed 's/..$//' < barcodes.tsv > barcodes_trim.tsv` to trim these suffixes.
3232
- `.config` file entries should be hard-tab-separated
3333

3434
# <a name="started"></a>IronThrone-GoT

README renamed to v1_README

File renamed without changes.

0 commit comments

Comments
 (0)