Skip to content

Latest commit

 

History

History
652 lines (489 loc) · 18.1 KB

README.md

File metadata and controls

652 lines (489 loc) · 18.1 KB

CSeQTLworkflow

Introduction and Outline

This repository contains template code for running the workflow steps used in our manuscript Cell Type-Specific Expression Quantitative Trait Loci.

Reference Data

GTEx Brain and Blood

Since GTEx samples are available on the NHGRI AnVIL cloud, they were processed using a combination of Docker and WDL with details provided in this repository.

CommonMind Consortium

Click to expand!

Code to obtain CMC inputs

# Install package
# Install the synapser client from Sage's R archive (RAN), with a CRAN
# mirror as a fallback for dependencies.
install.packages("synapser",
	repos = c("https://sage-bionetworks.github.io/ran","http://cran.fhcrc.org"))

library(synapser)

# Login to Synapse
# NOTE(review): replace the "." placeholders with real Synapse
# credentials before running.
username = "." # your username
password = "." # your password
synLogin(username,password)

down_dir = "." # user-specified directory to download files

# Function used to download files
# synGet() fetches the file with the given Synapse ID into the
# 'downloadLocation' directory.
entity = "syn4600989" # an example SYN ID unique to a file
synGet(entity = entity,downloadLocation = down_dir)

# List of SYN_IDs, use synGet() to download
# NOTE(review): "syn4600989" appears twice below (sampleIDkey metadata
# and QC fam file) -- one of the two is likely a copy/paste error; verify
# the correct IDs against the CMC Synapse project before downloading.
"syn4600989" 	# CMC Human sampleIDkey metadata
"syn4600985" 	# QC'd genotype bed file data
"syn4600987" 	# QC'd bim file
"syn4600989" 	# QC fam file
"syn4935690" 	# README file
"syn5600756" 	# list of outlier samples
"syn18080588"	# SampleID key
"syn3354385"	# clinical data
"syn18358379"	# RNAseq meta/QC data

# List of BAM SYN_IDs
# Query the Synapse table of CMC files for human gene expression data,
# then keep only aligned ("accepted") BAM entries and cache the result.
tableId = "syn11638850"
# BUG FIX: smart_sprintf() is not a defined function; base sprintf()
# is used to build the SQL query string.
results = synTableQuery(sprintf("select * from %s 
	where species='Human' and dataType='geneExpression'", tableId))
# results$filepath
aa = as.data.frame(results)
aa = aa[grep("bam",aa$name),]
aa = aa[grep("accepted",aa$name),] # excluding unmapped bam files
dim(aa)
saveRDS(aa,"aligned_bam_files.rds")

# Download BAM files one by one
# For each specimen, create a subdirectory under BAM_dir and download
# all of that specimen's BAM files from Synapse.
BAM_dir = "./BAM"; dir.create(BAM_dir)
for(samp in sort(unique(aa$specimenID))){
	samp_dir = file.path(BAM_dir,samp)
	# Avoid a warning when re-running after a partial download
	if(!dir.exists(samp_dir)) dir.create(samp_dir)
	samp_syn_ids = aa$id[which(aa$specimenID == samp)]
	for(samp_syn_id in samp_syn_ids){
		# BUG FIX: synGet() has no 'down_dir' argument; the target
		# directory is passed as 'downloadLocation' (as used above).
		synGet(entity = samp_syn_id,downloadLocation = samp_dir)
	}
	cat("\n")
}

# SNP6 annotation CSV file
# Download the Affymetrix GenomeWideSNP_6 (na35) annotation archive.
snp6_url = paste("http://www.affymetrix.com/Auth/analysis",
	"downloads/na35/genotyping",
	"GenomeWideSNP_6.na35.annot.csv.zip",sep = "/")
system(paste("wget",snp6_url))

Blueprint

Click to expand!
  • Download metadata and BAMs with Pyega3

  • cred_file.json contains user login information

  • To obtain metadata,

     # for example -- EGA dataset accessions; set 'dataset' to the one to fetch
     dataset=EGAD00001002663 # genotypes
     # dataset=EGAD00001002671 # purified cell type 1
     # dataset=EGAD00001002674 # purified cell type 2
     # dataset=EGAD00001002675 # purified cell type 3
    
     # Download metadata code: list the files available in $dataset
     pyega3 -cf cred_file.json files $dataset
  • To obtain BAM files one by one,

     # num threads/cores used for the parallel download
     nt=1
    
     # Example file id per BAM (one EGA file accession)
     id=EGAF00001330176
    
     # Download code: fetch the BAM for this file accession
     pyega3 -cf cred_file.json -c $nt fetch $id

BAM workflow

Click to expand!
  • BamToFastq

    Click to expand!

    For paired-end reads

     # Convert a paired-end BAM back to FASTQ with Picard SamToFastq.
     # INCLUDE_NON_PF_READS keeps reads failing the platform filter;
     # SILENT validation tolerates minor BAM spec violations.
     java -Xmx4g -jar picard.jar SamToFastq \
     	INPUT=input.bam FASTQ=output_1.fastq.gz \
     	SECOND_END_FASTQ=output_2.fastq.gz \
     	UNPAIRED_FASTQ=output_unpair.fastq.gz \
     	INCLUDE_NON_PF_READS=true VALIDATION_STRINGENCY=SILENT

    For single-end reads

     # Convert a single-end BAM back to FASTQ with Picard SamToFastq;
     # keeps non-PF reads and tolerates minor BAM validation issues.
     java -Xmx4g -jar picard.jar SamToFastq \
     	INPUT=input.bam FASTQ=output.fastq \
     	INCLUDE_NON_PF_READS=true \
     	VALIDATION_STRINGENCY=SILENT
  • Build STAR index

    Click to expand!
     # Build a STAR genome index from the reference FASTA and GENCODE GTF.
     # --sjdbOverhang 99 is typically read length - 1 (per the STAR
     # manual), i.e. 100 bp reads here -- confirm for your data.
     fasta_fn=Homo_sapiens_assembly38_noALT_noHLA_noDecoy_ERCC.fasta
     gtf_fn=gencode.v26.GRCh38.ERCC.genes.gtf
     
     STAR --runMode genomeGenerate \
     	--genomeDir ./star_index \
     	--genomeFastaFiles $fasta_fn \
     	--sjdbGTFfile $gtf_fn \
     	--sjdbOverhang 99 --runThreadN 1
  • Run STAR for read alignment

    Click to expand!

    For paired-end reads

     # Two-pass STAR alignment of paired-end reads against ./star_index.
     # Outputs an unsorted BAM, a transcriptome BAM and per-gene counts
     # (--quantMode), keeps unmapped reads and chimeric alignments within
     # the BAM, and attaches a read group (ID:rg1 SM:sm1).
     # NOTE(review): the filter/splice-junction settings below should be
     # confirmed against the STAR manual before changing.
     STAR --runMode alignReads \
     	--runThreadN 1 --genomeDir ./star_index --twopassMode Basic \
     	--outFilterMultimapNmax 20 --alignSJoverhangMin 8 \
     	--alignSJDBoverhangMin 1 --outFilterMismatchNmax 999 \
     	--outFilterMismatchNoverLmax 0.1 --alignIntronMin 20 \
     	--alignIntronMax 1000000 --alignMatesGapMax 1000000 \
     	--outFilterType BySJout --outFilterScoreMinOverLread 0.33 \
     	--outFilterMatchNminOverLread 0.33 --limitSjdbInsertNsj 1200000 \
     	--readFilesIn output_1.fastq.gz output_2.fastq.gz \
     	--readFilesCommand zcat --outFileNamePrefix output_hg38 \
     	--outSAMstrandField intronMotif --outFilterIntronMotifs None \
     	--alignSoftClipAtReferenceEnds Yes --quantMode TranscriptomeSAM \
     	GeneCounts --outSAMtype BAM Unsorted --outSAMunmapped Within \
     	--genomeLoad NoSharedMemory --chimSegmentMin 15 \
     	--chimJunctionOverhangMin 15 --chimOutType WithinBAM SoftClip \
     	--chimMainSegmentMultNmax 1 --outSAMattributes NH HI AS nM NM ch \
     	--outSAMattrRGline ID:rg1 SM:sm1

    For single-end reads

     # Two-pass STAR alignment of single-end reads; identical settings to
     # the paired-end command except a single FASTQ in --readFilesIn.
     STAR --runMode alignReads \
     	--runThreadN 1 --genomeDir ./star_index --twopassMode Basic \
     	--outFilterMultimapNmax 20 --alignSJoverhangMin 8 \
     	--alignSJDBoverhangMin 1 --outFilterMismatchNmax 999 \
     	--outFilterMismatchNoverLmax 0.1 --alignIntronMin 20 \
     	--alignIntronMax 1000000 --alignMatesGapMax 1000000 \
     	--outFilterType BySJout --outFilterScoreMinOverLread 0.33 \
     	--outFilterMatchNminOverLread 0.33 --limitSjdbInsertNsj 1200000 \
     	--readFilesIn output.fastq.gz --readFilesCommand zcat \
     	--outFileNamePrefix output_hg38 --outSAMstrandField intronMotif \
     	--outFilterIntronMotifs None --alignSoftClipAtReferenceEnds Yes \
     	--quantMode TranscriptomeSAM GeneCounts --outSAMtype BAM Unsorted \
     	--outSAMunmapped Within --genomeLoad NoSharedMemory \
     	--chimSegmentMin 15 --chimJunctionOverhangMin 15 \
     	--chimOutType WithinBAM SoftClip --chimMainSegmentMultNmax 1 \
     	--outSAMattributes NH HI AS nM NM ch \
     	--outSAMattrRGline ID:rg1 SM:sm1
  • MarkDuplicates

    Click to expand!
     # Sort reads by coordinate
     # (-@ 0 means no additional helper threads beyond the main one)
     samtools sort -@ 0 -o output_hg38.sortedByCoordinate.bam \
     	output_hg38Aligned.out.bam
     
     # Make bam index (required by tools that random-access the BAM)
     samtools index -b -@ 1 output_hg38.sortedByCoordinate.bam
     
     # MarkDuplicates
     # Flags (does not remove) duplicate reads; duplication metrics are
     # written to out.marked_dup_metrics.txt.
     java -Xmx4g -jar picard.jar MarkDuplicates \
     	INPUT=output_hg38.sortedByCoordinate.bam \
     	OUTPUT=output_hg38.sortedByCoordinate.md.bam \
     	M=out.marked_dup_metrics.txt ASSUME_SORT_ORDER=coordinate

    Process post-markduplicate sorted bam

     # Count num reads in the post-MarkDuplicates bam (sanity check)
     samtools view -c -@ 0 output_hg38.sortedByCoordinate.md.bam
     
     # Re-index bam (the .md.bam is a new file and needs its own index)
     samtools index -b -@ 1 output_hg38.sortedByCoordinate.md.bam
  • Get TReC and ASReC

    Download asSeq source package

    Click to expand!
     # Fetch the asSeq source tarball from the Sun-lab GitHub repository.
     asseq_url=https://github.com/Sun-lab/asSeq/raw/master/asSeq_0.99.501.tar.gz
     
     wget $asseq_url

    Install R package and run asSeq to get unique reads and filter

    Click to expand!
     # Install the downloaded asSeq source tarball (not on CRAN)
     install.packages(pkgs = "asSeq_0.99.501.tar.gz",
     	type = "source",repos = NULL)
     
     PE = TRUE 
     	# set TRUE for paired-end samples
     	# set FALSE for single-end
     
     # Keep only primary, mapped, non-duplicate alignments passing QC;
     # for paired-end data additionally require properly paired reads.
     flag1 = Rsamtools::scanBamFlag(isUnmappedQuery = FALSE,
     	isSecondaryAlignment = FALSE,isDuplicate = FALSE,
     	isNotPassingQualityControls = FALSE,
     	isSupplementaryAlignment = FALSE,isProperPair = PE)
     
     # mapqFilter = 255 keeps only uniquely mapped reads
     # (STAR assigns MAPQ 255 to unique alignments -- confirm if a
     # different aligner produced the BAM)
     param1 = Rsamtools::ScanBamParam(flag = flag1,
     	what = "seq",mapqFilter = 255)
     
     bam_file = "output_hg38.sortedByCoordinate.md.bam"
     bam_filt_fn = "output.filtered.asSeq.bam"
     # Write the filtered alignments to bam_filt_fn for downstream counting
     Rsamtools::filterBam(file = bam_file,
     	destination = bam_filt_fn,
     	param = param1)

    Create exon image file

    Click to expand!
     # Build per-gene exon ranges (the "exon image") from the GENCODE GTF
     # and cache them as an RDS file for reuse in the counting steps.
     gtf_fn = "gencode.v26.GRCh38.ERCC.genes.gtf.gz"
     exdb = GenomicFeatures::makeTxDbFromGFF(file = gtf_fn,
     	format = "gtf")
     exons_list_per_gene = GenomicFeatures::exonsBy(exdb,
     	by = "gene")
     
     gtf_rds_fn = "exon_by_genes_gencode_v26.rds"
     saveRDS(exons_list_per_gene,gtf_rds_fn)

    Get total read count (TReC)

    Click to expand!
     # Total read count (TReC) per gene: count filtered reads overlapping
     # each gene's exons. singleEnd/fragments are driven by the PE flag set
     # earlier; reads are streamed in chunks of 'yieldSize' records.
     genes = readRDS(gtf_rds_fn)
     bamfile = Rsamtools::BamFileList(bam_filt_fn,
     	yieldSize = 1000000)
     se = GenomicAlignments::summarizeOverlaps(features = genes,
     	reads = bamfile,mode = "Union",singleEnd = !PE,
     	ignore.strand = TRUE,fragments = PE)
     ct = as.data.frame(SummarizedExperiment::assay(se))

    Filter reads by Qname

    Click to expand!
     # Sort by read name (Qname) so mate pairs are adjacent; the
     # name-sorted BAM is the input to asSeq::extractAsReads below.
     samtools sort -n -o output.filtered.asSeq.sortQ.bam \
     	output.filtered.asSeq.bam

    Extract allele-specific reads, outputs hap1.bam, hap2.bam, hapN.bam

    Click to expand!
     # Tab-delimited file of heterozygous SNPs for this sample
     het_snp_fn = "<tab delimited filename of heterozygous SNPs for sample>"
     	# Columns: chr, position, hap1 allele, hap2 allele
     	# no header
     
     bam_filt_sortQ_fn = "output.filtered.asSeq.sortQ"
     # Split reads by haplotype at the het SNPs; writes <prefix>_hap1.bam,
     # <prefix>_hap2.bam, and <prefix>_hapN.bam (unassignable reads).
     # NOTE(review): min.avgQ/min.snpQ look like base-quality cutoffs --
     # confirm exact semantics in the asSeq documentation.
     asSeq::extractAsReads(input = sprintf("%s.bam",bam_filt_sortQ_fn),
     	snpList = het_snp_fn,min.avgQ = 20,min.snpQ = 20)

    Count allele-specific read counts (ASReC)

    Click to expand!
     # Allele-specific read counts (ASReC): count reads assigned to each
     # haplotype over the same gene model used for TReC. The three calls
     # differed only in the input BAM, so a small helper removes the
     # duplication; se1/se2/seN keep their original names for the
     # downstream "Save read counts" step.
     count_hap_reads = function(hap_bam){
     	# Count reads in one haplotype BAM over the gene exon model
     	GenomicAlignments::summarizeOverlaps(features = genes,
     		reads = hap_bam,mode = "Union",
     		singleEnd = !PE,ignore.strand = TRUE,fragments = PE)
     }
     se1 = count_hap_reads(sprintf("%s_hap1.bam",bam_filt_sortQ_fn))
     se2 = count_hap_reads(sprintf("%s_hap2.bam",bam_filt_sortQ_fn))
     seN = count_hap_reads(sprintf("%s_hapN.bam",bam_filt_sortQ_fn))

    Save read counts

    Click to expand!
     # Convert each haplotype assay to a data.frame of per-gene counts
     hap_counts = lapply(list(se1,se2,seN),
     	function(se_obj) as.data.frame(SummarizedExperiment::assay(se_obj)))
     ct1 = hap_counts[[1]]
     ct2 = hap_counts[[2]]
     ctN = hap_counts[[3]]
     # Column order: trec, hap1, hap2, hapN
     cts = cbind(ct,ct1,ct2,ctN)
     dim(cts); cts[1:2,]
     # Write the combined count table as tab-delimited text
     out_fn = "output.trecase.txt"
     write.table(cts,file = out_fn,quote = FALSE,
     	sep = "\t", eol = "\n")

Deconvolution

Click to expand!
  • Signature expression derived from single cell RNAseq

  • Comments:

    • Transcripts per million (TPM): each gene count is normalized by exonic gene length, then divided by the sample's total length-normalized count, and finally multiplied by 1 million
    • Cell sizes: For brain, summation of gene count normalized by exonic gene length. For blood, obtained from EPIC and ICeDT papers.
  • Deconvolution Inputs

    • Bulk RNA-seq TPM (bulk_tpm),
    • scRNA-seq TPM (sig_tpm),
    • cell sizes
  • Template code

    Input variables and objects. Make sure the genes are ordered consistently between sig_tpm and bulk_tpm

     work_dir = "." # working directory
     setwd(work_dir)
     
     # Genes must be ordered consistently between sig_tpm and bulk_tpm
     sig_tpm 	# TPM signature expression matrix
     bulk_tpm 	# TPM bulk expression matrix
     

    CIBERSORT: [Paper, Software]

     # Helper: write a matrix in the tab-delimited layout CIBERSORT
     # expects (first column "rowname" holds the gene identifiers).
     write_ciber_input = function(mat,fn){
     	write.table(cbind(rowname = rownames(mat),mat),
     		file = fn,sep = "\t",quote = FALSE,row.names = FALSE)
     }
     
     sig_fn = file.path(work_dir,"signature.txt")
     mix_fn = file.path(work_dir,"mixture.txt")
     write_ciber_input(sig_tpm,sig_fn)
     write_ciber_input(bulk_tpm,mix_fn)
     
     source("CIBERSORT.R") # obtained from CIBERSORT website
     results = CIBERSORT(sig_matrix = sig_fn,mixture_file = mix_fn,
     	perm = 0,QN = FALSE,absolute = FALSE,abs_method = 'sig.score',
     	filename = "DECON")
     # Remove the temporary input files and CIBERSORT's own output file
     unlink(x = c(sig_fn,mix_fn))
     unlink(sprintf("CIBERSORT-Results_%s.txt","DECON"))
     
     # Number of cell types in the signature matrix
     num_cts = ncol(sig_tpm)
     
     # Proportion of transcripts per cell type per sample
     pp_bar_ciber = results[,seq(num_cts)]
     
     # Convert transcript proportions to cell type proportions: divide by
     # cell size, then renormalize each sample to sum to one.
     # NOTE(review): cell_sizes must be defined beforehand (see the
     # "cell sizes" deconvolution input above).
     pp_hat_ciber = t(apply(pp_bar_ciber,1,function(trans_prop){
     	cell_prop = trans_prop / cell_sizes
     	cell_prop / sum(cell_prop)
     }))
     

    ICeDT: [Paper, Software]

     # ICeDT deconvolution of bulk TPM (Y) using the signature matrix (Z);
     # no tumor purity (all zeros) and default reference variance.
     num_bulk = ncol(bulk_tpm)
     icedt_args = list(Y = bulk_tpm,Z = sig_tpm,
     	tumorPurity = rep(0,num_bulk),refVar = NULL,
     	rhoInit = NULL,maxIter_prop = 4e3,
     	maxIter_PP = 4e3,rhoConverge = 1e-2)
     fit = do.call(ICeDT::ICeDT,icedt_args)
     
     # Proportion of transcripts per cell type per sample
     # (transpose rho and drop its first column)
     pp_bar_icedt = t(fit$rho)[,-1]
     
     # Proportion of cell types per sample: divide by cell size, then
     # renormalize each sample's proportions to sum to one.
     pp_hat_icedt = t(apply(pp_bar_icedt,1,function(trans_prop){
     	cell_prop = trans_prop / cell_sizes
     	cell_prop / sum(cell_prop)
     }))
     

eQTL mapping

Click to expand!
  • BULK mode: If running bulk analyses, set RHO to a matrix with N rows and 1 column and set XX_trecPC to residual TReC PCs calculated without accounting for cell types.

  • Cell type-specific (CTS) mode: If running cell type-specific analyses, set RHO to estimated cell type proportions and set XX_trecPC to proportion-adjusted residual TReC PCs.

  • Example codes

    Click to expand!

    Inputs

     # Install the helper and CSeQTL packages from GitHub
     devtools::install_github("pllittle/smarter")
     devtools::install_github("pllittle/CSeQTL")
     
     # Sample-specific variables
     RHO 			# cell type proportions matrix
     XX_base 	# baseline covariates, continuous variables centered
     XX_genoPC 	# genotype PCs, centered
     XX_trecPC 	# residual TReC PCs, centered
     # Full design matrix with an explicit intercept column
     XX = cbind(Int = 1,XX_base,XX_genoPC,XX_trecPC)
     
     # Gene and sample variables
     TREC 	# TReC vector
     SNP	# phased genotype vector
     hap2	# 2nd haplotype counts
     ASREC 	# total haplotype counts = hap1 + hap2
     PHASE 	# Indicator vector of whether or not to use haplotype counts
     
     # Tuning arguments
     trim 		# TRUE for trimmed analysis, FALSE for untrimmed
     thres_TRIM 	# if trim = TRUE, the Cook's distance cutoff to trim sample TReCs
     ncores 	# number of threads/cores to parallelize the loop, improve runtime
     
     # ASREC-related cutoffs to satisfy to use allele-specific read counts
     numAS 		# cutoff to determine if a sample has sufficient read counts
     numASn 	# cutoff of samples having ASREC >= numAS
     numAS_het 	# minimum number for sum(ASREC >= numAS & SNP %in% c(1,2))
     cistrans 	# p-value cutoff on cis/trans testing to determine which p-value 
     		# (TReC-only or TReC+ASReC) to report

    eQTL mapping for one gene

     # eQTL mapping for one gene: fits TReC-only and TReC+ASReC models
     # using the design matrix, counts, and cutoffs defined above.
     gs_out = CSeQTL_GS(XX = XX,TREC = TREC,SNP = SNP,hap2 = hap2,
     	ASREC = ASREC,PHASE = PHASE,RHO = RHO,trim = trim,
     	thres_TRIM = 20,numAS = 5,numASn = 5,numAS_het = 5,
     	cistrans = 0.01,ncores = ncores,show = TRUE)
     

    Hypothesis testing output. Matrices where rows are genomic loci and columns are cell types for CTS mode or a single column for BULK mode.

     # TReC-only likelihood ratio test statistics with 1 DF
     gs_out$LRT_trec
     
     # TReC+ASReC likelihood ratio test statistics with 1 DF
     gs_out$LRT_trecase
     
     # Cis/Trans likelihood ratio test statistics with 1 DF
     # (compared against the 'cistrans' p-value cutoff to decide whether
     # the TReC-only or TReC+ASReC p-value is reported)
     gs_out$LRT_cistrans
     

Enrichment

Click to expand!
  • Functional enrichment with Torus; example code is provided below, based on this markdown of Torus examples and documentation.

     # Torus functional enrichment: eQTL summary statistics (-d), SNP map
     # (-smap), gene map (-gmap), SNP annotations (-annot); -est writes
     # enrichment estimates to output_enrich (see Torus documentation).
     torus -d eqtls.gz \
     	-smap snpMap.gz \
     	-gmap geneMap.gz \
     	-annot annot.gz \
     	-est > output_enrich
  • GWAS Enrichment with Jackknife-based inference

     library(CSeQTL)
     
     work_dir = "." # specify a work directory
     
     # Download GWAS catalog
     # (':::' accesses an unexported CSeQTL helper function)
     gwas = CSeQTL:::get_GWAS_catalog(work_dir = work_dir)

    Inputs

    • DATA: R data.frame containing headers 'Chr', 'POS', and columns leading with 'EE_' such as 'EE_Astrocyte', 'EE_Excitatory' corresponding to eQTL binary indicators 0 or 1. Rows correspond to gene/SNP pairings
    • which_gwas: Either 'gwas_catalog' or some specific grouped phenotype
    • nBLOCKS: Specify the block size for jackknife estimation and inference

    Code

     # Run GWAS enrichment
     # Run GWAS enrichment
     # Results are collected in a list and combined once with
     # do.call(rbind, ...) instead of growing a matrix with rbind()
     # inside the loop (quadratic copying).
     
     ## Across all phenotypes
     enrich_catalog = CSeQTL:::run_gwasEnrich_analysis(DATA = DATA,
     	work_dir = work_dir,which_gwas = "gwas_catalog",
     	nBLOCKS = 200,verbose = TRUE)
     
     ## Per grouped phenotype
     all_phenos = unique(gwas$myPHENO)
     all_phenos = all_phenos[all_phenos != "gwas_catalog"]
     enrich_list = lapply(all_phenos,function(pheno){
     	CSeQTL:::run_gwasEnrich_analysis(DATA = DATA,
     		work_dir = work_dir,which_gwas = pheno,
     		nBLOCKS = 200,verbose = TRUE)
     })
     
     # Combined enrichment results: catalog-wide first, then per phenotype
     enrich_final = do.call(rbind,c(list(enrich_catalog),enrich_list))