Skip to content

Commit

Permalink
adapting WGS QC pipeline for the AMPAD joint VCF - this requires crea…
Browse files Browse the repository at this point in the history
…ting exceptions for empty chunks, upping memory requirements, dealing with lifting over later than originally intended
  • Loading branch information
jackhump committed Jun 29, 2020
1 parent 8538a69 commit 92fd63b
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 73 deletions.
150 changes: 80 additions & 70 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ import numpy as np
import itertools as iter
import math
import csv

import subprocess
#Declare alleles variable to establish a later used wildcard
alleles = ['Biallelic']
#alleles = ['Biallelic','Triallelic'] #old alleles variable used when also considering
Expand Down Expand Up @@ -258,7 +258,7 @@ rule Filter0_separateBiallelics:
chunked = tempFolder + '{chr}_{chunk}_filtered.vcf.gz'
#chunked = tempFolder + '{chr}_{chunk}_chunked.vcf.gz'
output:
biallelic_Filter0 = tempFolder + '{chr}' + '_{chunk}_Biallelic.recode.vcf.gz',
biallelic_vcf = tempFolder + '{chr}' + '_{chunk}_Biallelic.recode.vcf.gz',
stats = statsFolder + '{chr}' + '_{chunk}_separateBiallelic.stats.txt'
#group: "per_chunk_filter"
shell:
Expand All @@ -275,7 +275,7 @@ rule Filter0_separateBiallelics:
# input:
# vcfgz = inFolder + '{chr}.vcf.gz'
# output:
# triallelic_Filter0 = outFolder + 'chrAll_Triallelic_QCFinished.recode.vcf',
# triallelic_vcf = outFolder + 'chrAll_Triallelic_QCFinished.recode.vcf',
# stats = statsFolder + '{chr}' + '_Triallelic_Filter0_stats.txt'
# params:
# unsplit_file = tempFolder + '{chr}' + '_Triallelic_unsplit.recode.vcf.gz'
Expand All @@ -293,24 +293,25 @@ rule Filter0_separateBiallelics:
#7) Keep sites that "PASS" by GATK
rule Filter1_GATK_PASS:
input:
Filter0 = tempFolder + '{chr}_{chunk}_Biallelic.recode.vcf.gz'
vcf = tempFolder + '{chr}_{chunk}_Biallelic.recode.vcf.gz'
output:
Filter1 = tempFolder + '{chr}_{chunk}_Filter1.recode.vcf.gz',
vcf = tempFolder + '{chr}_{chunk}_Filter1.recode.vcf.gz',
stats = statsFolder + '{chr}_{chunk}_Filter1_stats.txt'
#group: "per_chunk_filter"
shell:
"ml vcftools/0.1.15;"
"ml bcftools/1.9;"
"vcftools --gzvcf {input.Filter0} --remove-filtered-all --stdout --recode --recode-INFO-all | bgzip -c > {output.Filter1};"
#"bcftools view --threads 5 -f PASS {input.Filter0} > {output.Filter1};" #may speed up QC pipeline slightly. Haven't tested
"bcftools stats {output.Filter1} > {output.stats};"
"nLine=$(bcftools view -H {input.vcf} | wc -l); if [ $nLine == 0 ]; then cp {input.vcf} {output.vcf}; touch {output.stats}; exit 0; fi;"
"vcftools --gzvcf {input.vcf} --remove-filtered-all --stdout --recode --recode-INFO-all | bgzip -c > {output.vcf};"
#"bcftools view --threads 5 -f PASS {input.vcf} > {output.vcf};" #may speed up QC pipeline slightly. Haven't tested
"bcftools stats {output.vcf} > {output.stats};"

#8) Filter for genotype level depth (DP) (GDP)
rule Filter2_GDP:
input:
Filter1 = tempFolder + '{chr}_{chunk}_Filter1.recode.vcf.gz'
vcf = tempFolder + '{chr}_{chunk}_Filter1.recode.vcf.gz'
output:
Filter2 = tempFolder + '{chr}_{chunk}_Filter2.recode.vcf.gz',
vcf = tempFolder + '{chr}_{chunk}_Filter2.recode.vcf.gz',
stats = statsFolder + '{chr}_{chunk}_Filter2_stats.txt'
params:
GDP_thresh = GDP
Expand All @@ -319,149 +320,159 @@ rule Filter2_GDP:
"ml vcflib/v1.0.0-rc0; ml bcftools/1.9;"

#tabix -f must be created in this step and not step before because of how snakemake temporally creates files. Because tabix does not have an "output", it would be created before the output from the previous rule if placed within the rule.
"tabix -f -p vcf {input.Filter1};"
"vcffilter -g \"DP > {params.GDP_thresh}\" {input.Filter1} | bgzip -c > {output.Filter2};"
"bcftools stats {output.Filter2} > {output.stats};"
"tabix -f -p vcf {input.vcf};"
"nLine=$(bcftools view -H {input.vcf} | wc -l); if [ $nLine == 0 ]; then cp {input.vcf} {output.vcf}; touch {output.stats}; exit 0; fi;"
"vcffilter -g \"DP > {params.GDP_thresh}\" {input.vcf} | bgzip -c > {output.vcf};"
"bcftools stats {output.vcf} > {output.stats};"


#9) Filter for Genome Quality (GQ)
rule Filter3_GQ:
input:
Filter2 = tempFolder + '{chr}_{chunk}_Filter2.recode.vcf.gz'
vcf = tempFolder + '{chr}_{chunk}_Filter2.recode.vcf.gz'
output:
Filter3 = tempFolder + '{chr}_{chunk}_Filter3.recode.vcf.gz',
vcf = tempFolder + '{chr}_{chunk}_Filter3.recode.vcf.gz',
stats = statsFolder + '{chr}_{chunk}_Filter3_stats.txt'
params:
GQ_thresh = GQ
#group: "per_chunk_filter"
shell:
"ml vcflib/v1.0.0-rc0; ml bcftools/1.9;"
"tabix -f -p vcf {input.Filter2};"
" if [ $(bcftools view -H {input.Filter2} | wc -l) == 0 ]; then cp {input.Filter2} {output.Filter3}; exit 0; fi"
"vcffilter -g \"GQ > {params.GQ_thresh}\" {input.Filter2} | bgzip -c > {output.Filter3};"
"bcftools stats {output.Filter3} > {output.stats};"
"tabix -f -p vcf {input.vcf};"
"nLine=$(bcftools view -H {input.vcf} | wc -l); if [ $nLine == 0 ]; then cp {input.vcf} {output.vcf}; touch {output.stats}; exit 0; fi;"
"vcffilter -g \"GQ > {params.GQ_thresh}\" {input.vcf} | bgzip -c > {output.vcf};"
"bcftools stats {output.vcf} > {output.stats};"

#10) Filter for SNP missingness
rule Filter4_SNP_Missingess:
input:
Filter3 = tempFolder + '{chr}_{chunk}_Filter3.recode.vcf.gz'
vcf = tempFolder + '{chr}_{chunk}_Filter3.recode.vcf.gz'
output:
Filter4 = tempFolder + '{chr}_{chunk}_Filter4.recode.vcf.gz',
vcf = tempFolder + '{chr}_{chunk}_Filter4.recode.vcf.gz',
stats = statsFolder + '{chr}_{chunk}_Filter4_stats.txt'
params:
MISS_THRESH_SNP = MISS_THRESH_SNP
#group: "per_chunk_filter"
shell:
"ml vcftools/0.1.15; ml bcftools/1.9;"
"vcftools --gzvcf {input.Filter3} --max-missing {params.MISS_THRESH_SNP} --stdout --recode --recode-INFO-all | bgzip -c > {output.Filter4};"
"bcftools stats {output.Filter4} > {output.stats};"
"nLine=$(bcftools view -H {input.vcf} | wc -l); if [ $nLine == 0 ]; then cp {input.vcf} {output.vcf}; touch {output.stats}; exit 0; fi;"
"vcftools --gzvcf {input.vcf} --max-missing {params.MISS_THRESH_SNP} --stdout --recode --recode-INFO-all | bgzip -c > {output.vcf};"
"bcftools stats {output.vcf} > {output.stats};"

#11) Filter for Overall Read Depth (DP) (ODP)
rule Filter5_ODP:
input:
Filter4 = tempFolder + '{chr}_{chunk}_Filter4.recode.vcf.gz'
vcf = tempFolder + '{chr}_{chunk}_Filter4.recode.vcf.gz'
output:
Filter5 = tempFolder + '{chr}_{chunk}_Filter5.recode.vcf.gz',
vcf = tempFolder + '{chr}_{chunk}_Filter5.recode.vcf.gz',
stats = statsFolder + '{chr}_{chunk}_Filter5_stats.txt'
params:
ODP_thresh = ODP
#group: "per_chunk_filter"
shell:
"ml vcflib/v1.0.0-rc0; ml bcftools/1.9;"
"tabix -f -p vcf {input.Filter4};"
"vcffilter -f \"DP > {params.ODP_thresh}\" {input.Filter4} | bgzip -c > {output.Filter5};"
"bcftools stats {output.Filter5} > {output.stats};"
"tabix -f -p vcf {input.vcf};"
"nLine=$(bcftools view -H {input.vcf} | wc -l); if [ $nLine == 0 ]; then cp {input.vcf} {output.vcf}; touch {output.stats}; exit 0; fi;"
"vcffilter -f \"DP > {params.ODP_thresh}\" {input.vcf} | bgzip -c > {output.vcf};"
"bcftools stats {output.vcf} > {output.stats};"

#12) Filter for Mapping Quality (MQ)
rule Filter6_MQ:
input:
Filter5 = tempFolder + '{chr}_{chunk}_Filter5.recode.vcf.gz'
vcf = tempFolder + '{chr}_{chunk}_Filter5.recode.vcf.gz'
output:
Filter6 = tempFolder + '{chr}_{chunk}_Filter6.recode.vcf.gz',
vcf = tempFolder + '{chr}_{chunk}_Filter6.recode.vcf.gz',
stats = statsFolder + '{chr}_{chunk}_Filter6_stats.txt'
params:
MQ_MAX_THRESH = MQ_MAX,
MQ_MIN_THRESH = MQ_MIN
#group: "per_chunk_filter"
shell:
"ml vcflib/v1.0.0-rc0; ml bcftools/1.9;"
"tabix -f -p vcf {input.Filter5};"

#"vcffilter -f \"MQ > {params.MQ_MIN_THRESH} & MQ < {params.MQ_MAX_THRESH}\" {input.Filter5} | bgzip -c > {output.Filter6};"
"bcftools filter -i \"MQ > {params.MQ_MIN_THRESH} && MQ < {params.MQ_MAX_THRESH}\" {input.Filter5} | bgzip -c > {output.Filter6};"
"bcftools stats {output.Filter6} > {output.stats};"
"tabix -f -p vcf {input.vcf};"
"nLine=$(bcftools view -H {input.vcf} | wc -l); if [ $nLine == 0 ]; then cp {input.vcf} {output.vcf}; touch {output.stats}; exit 0; fi;"
#"vcffilter -f \"MQ > {params.MQ_MIN_THRESH} & MQ < {params.MQ_MAX_THRESH}\" {input.vcf} | bgzip -c > {output.vcf};"
"bcftools filter -i \"MQ > {params.MQ_MIN_THRESH} && MQ < {params.MQ_MAX_THRESH}\" {input.vcf} | bgzip -c > {output.vcf};"
"bcftools stats {output.vcf} > {output.stats};"

#13) Separate out Indel files and SNP files in Biallelic files.
rule Biallelic_Separate_Indels_and_SNPs:
input:
Filter6 = tempFolder + '{chr}_{chunk}_Filter6.recode.vcf.gz'
vcf = tempFolder + '{chr}_{chunk}_Filter6.recode.vcf.gz'
output:
Filter6_SNPs = tempFolder + '{chr}_{chunk}_Filter6_SNPs.recode.vcf.gz',
Filter6_SNPs_stats = statsFolder + '{chr}_{chunk}_Filter6_SNPs_stats.txt',
Filter6_Indels = tempFolder + '{chr}_{chunk}_Filter6_Indels.recode.vcf.gz',
Filter6_Indels_stats = statsFolder + '{chr}_{chunk}_Filter6_Indels_stats.txt'
vcf_SNPs = tempFolder + '{chr}_{chunk}_Filter6_SNPs.recode.vcf.gz',
vcf_SNPs_stats = statsFolder + '{chr}_{chunk}_Filter6_SNPs_stats.txt',
vcf_Indels = tempFolder + '{chr}_{chunk}_Filter6_Indels.recode.vcf.gz',
vcf_Indels_stats = statsFolder + '{chr}_{chunk}_Filter6_Indels_stats.txt'
#group: "per_chunk_filter"
shell:
"ml vcftools/0.1.15; ml bcftools/1.9;"
"nLine=$(bcftools view -H {input.vcf} | wc -l); if [ $nLine == 0 ]; then cp {input.vcf} {output.vcf_SNPs}; cp {input.vcf} {output.vcf_Indels}; touch {output.vcf_SNPs_stats}; touch {output.vcf_Indels_stats}; exit 0; fi;"
"vcftools --gzvcf {input.vcf} --remove-indels --stdout --recode --recode-INFO-all | bgzip -c > {output.vcf_SNPs};"
"vcftools --gzvcf {input.vcf} --keep-only-indels --stdout --recode --recode-INFO-all | bgzip -c > {output.vcf_Indels};"

"vcftools --gzvcf {input.Filter6} --remove-indels --stdout --recode --recode-INFO-all | bgzip -c > {output.Filter6_SNPs};"
"vcftools --gzvcf {input.Filter6} --keep-only-indels --stdout --recode --recode-INFO-all | bgzip -c > {output.Filter6_Indels};"

"bcftools stats {output.Filter6_SNPs} > {output.Filter6_SNPs_stats};"
"bcftools stats {output.Filter6_Indels} > {output.Filter6_Indels_stats};"
"bcftools stats {output.vcf_SNPs} > {output.vcf_SNPs_stats};"
"bcftools stats {output.vcf_Indels} > {output.vcf_Indels_stats};"


#14) Filter Biallelic SNPs for VQSLOD
rule Biallelic_SNPs_Filter7_VQSLOD:
input:
Filter6_SNPs = tempFolder + '{chr}_{chunk}_Filter6_SNPs.recode.vcf.gz'
vcf = tempFolder + '{chr}_{chunk}_Filter6_SNPs.recode.vcf.gz'
output:
Filter7_SNPs = tempFolder + '{chr}_{chunk}_Filter7_SNPs.recode.vcf.gz',
vcf = tempFolder + '{chr}_{chunk}_Filter7_SNPs.recode.vcf.gz',
stats = statsFolder + '{chr}_{chunk}_Filter7_SNPs_stats.txt'
params:
VQSLOD_thresh = VQSLOD
#group: "per_chunk_filter"
shell:
"ml vcflib/v1.0.0-rc0; ml bcftools/1.9;"
"tabix -f -p vcf {input.Filter6_SNPs};"
"vcffilter -f \"VQSLOD > {params.VQSLOD_thresh}\" {input.Filter6_SNPs} | bgzip -c > {output.Filter7_SNPs};"
"bcftools stats {output.Filter7_SNPs} > {output.stats};"
"tabix -f -p vcf {input.vcf};"
"nLine=$(bcftools view -H {input.vcf} | wc -l); if [ $nLine == 0 ]; then cp {input.vcf} {output.vcf}; touch {output.stats}; exit 0; fi;"
"vcffilter -f \"VQSLOD > {params.VQSLOD_thresh}\" {input.vcf} | bgzip -c > {output.vcf};"
"bcftools stats {output.vcf} > {output.stats};"

#15) Combine Biallelic Indels and SNPs
rule Biallelic_Combine_Indels_and_SNPs:
input:
Filter7_SNPs = tempFolder + '{chr}_{chunk}_Filter7_SNPs.recode.vcf.gz',
Filter6_Indels = tempFolder + '{chr}_{chunk}_Filter6_Indels.recode.vcf.gz'
SNPs = tempFolder + '{chr}_{chunk}_Filter7_SNPs.recode.vcf.gz',
Indels = tempFolder + '{chr}_{chunk}_Filter6_Indels.recode.vcf.gz'
output:
Filter7 = tempFolder + '{chr}_{chunk}_CombineSNPsIndels.recode.vcf.gz',
vcf = tempFolder + '{chr}_{chunk}_CombineSNPsIndels.recode.vcf.gz',
stats = statsFolder + '{chr}_{chunk}_CombineSNPsIndels_stats.txt'
#group: "per_chunk_filter"
shell:
"ml bcftools/1.9;"
"bcftools concat {input.Filter6_Indels} {input.Filter7_SNPs} | bcftools sort | bgzip -c > {output.Filter7};"
"bcftools stats {output.Filter7} > {output.stats};"
"nLine=$(bcftools view -H {input.SNPs} | wc -l); if [ $nLine == 0 ]; then cp {input.SNPs} {output.vcf}; touch {output.stats}; exit 0; fi;"
"bcftools concat {input.Indels} {input.SNPs} | bcftools sort | bgzip -c > {output.vcf};"
"bcftools stats {output.vcf} > {output.stats};"

#16) Filter via Inbreeding_Coef
rule Filter8_Inbreeding_Coef:
input:
Filter7 = tempFolder + '{chr}_{chunk}_CombineSNPsIndels.recode.vcf.gz'
vcf = tempFolder + '{chr}_{chunk}_CombineSNPsIndels.recode.vcf.gz'
output:
Filter8 = tempFolder + '{chr}_{chunk}_Filter8.recode.vcf.gz',
vcf = tempFolder + '{chr}_{chunk}_Filter8.recode.vcf.gz',
stats = statsFolder + '{chr}_{chunk}_Filter8_stats.txt'
params:
INBREEDING_COEF = INBREEDING_COEF
#group: "per_chunk_filter"
run:
shell("ml vcflib/v1.0.0-rc0; ml bcftools/1.9;tabix -f -p vcf {input.Filter7};")

#Negative values for inbreeding coeficient are expressed as (0 - [value]) due to how vcffilter is built. Hence, an if statement to separate negative from positive input INBREEDING_COEF values
if params.INBREEDING_COEF >= 0:
shell("ml vcflib/v1.0.0-rc0; ml bcftools/1.9;vcffilter -f \"InbreedingCoeff > {params.INBREEDING_COEF}\" {input.Filter7} |bgzip -c > {output.Filter8};")
else:
params.INBREEDING_COEF = -params.INBREEDING_COEF
shell("ml vcflib/v1.0.0-rc0; ml bcftools/1.9;vcffilter -f \"InbreedingCoeff > ( 0 - {params.INBREEDING_COEF} )\" {input.Filter7} |bgzip -c > {output.Filter8};")

shell("ml bcftools/1.9;bcftools stats {output.Filter8} > {output.stats};")
shell("ml vcflib/v1.0.0-rc0; ml bcftools/1.9;tabix -f -p vcf {input.vcf};")
cmd = "ml bcftools/1.9;bcftools view -H " + input.vcf + "| wc -l"
nLine = subprocess.check_output(cmd, shell = True)
nLine = int(nLine.decode("utf-8").strip() )
if nLine == 0:
shell("cp {input.vcf} {output.vcf}; touch {output.stats}")
else:
#Negative values for inbreeding coeficient are expressed as (0 - [value]) due to how vcffilter is built. Hence, an if statement to separate negative from positive input INBREEDING_COEF values
if params.INBREEDING_COEF >= 0:
shell("ml vcflib/v1.0.0-rc0; ml bcftools/1.9;vcffilter -f \"InbreedingCoeff > {params.INBREEDING_COEF}\" {input.vcf} |bgzip -c > {output.vcf};")
else:
params.INBREEDING_COEF = -params.INBREEDING_COEF
shell("ml vcflib/v1.0.0-rc0; ml bcftools/1.9;vcffilter -f \"InbreedingCoeff > ( 0 - {params.INBREEDING_COEF} )\" {input.vcf} |bgzip -c > {output.vcf};")

shell("ml bcftools/1.9;bcftools stats {output.vcf} > {output.stats};")

## STEP 3: AGGREGATE ALL CHUNKS TOGETHER

Expand Down Expand Up @@ -531,7 +542,7 @@ rule Filter9_Sample_Missingness:
bed = tempFolder + 'chrAll_Recombined.bed',
bim = tempFolder + 'chrAll_Recombined.bim',
fam = tempFolder + 'chrAll_Recombined.fam'
#Filter8 = tempFolder + 'all_Filter8_sorted.recode.vcf.gz'
#vcf = tempFolder + 'all_Filter8_sorted.recode.vcf.gz'
output:
bed = tempFolder + 'chrAll_Filter9.bed',
bim = tempFolder + 'chrAll_Filter9.bim',
Expand Down Expand Up @@ -632,5 +643,4 @@ rule collateStats:
script = "scripts/collate_all_stats.R"
shell:
"ml R/3.6.0;"
"Rscript {params.script} {statsFolder} {output}"

"Rscript {params.script} {statsFolder} {output}"
6 changes: 3 additions & 3 deletions cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@ liftOverVCFs:
mem: 3000
time: '360'
Biallelic_Combine_Indels_and_SNPs:
cores: 4
cores: 16
mem: 3750
time: '180'
recombineChunks:
cores: 4
cores: 8
mem: 3750
time: '180'
recombineChromosomes:
cores: 4
cores: 8
mem: 3750
time: '180'
KingRelatedness:
Expand Down
1 change: 1 addition & 0 deletions snakejob
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ bsub=("bsub -K -J $jname:{rule}:{wildcards}"

snakemake -u cluster.yaml --cluster-sync "${bsub[*]}" \
--local-cores 4 --max-jobs-per-second 5 \
--keep-going \
--jobs 1000 \
--restart-times 3 \
-s $snakefile --configfile $config \
Expand Down

0 comments on commit 92fd63b

Please sign in to comment.