adapting WGS QC pipeline for the AMPAD joint VCF - this requires crea…

…ting exceptions for empty chunks, upping memory requirements, dealing with lifting over later than originally intended
jackhump · Jun 29, 2020 · 92fd63b · 92fd63b
1 parent 8538a69
commit 92fd63b
Show file tree

Hide file tree

Showing 3 changed files with 84 additions and 73 deletions.
diff --git a/Snakefile b/Snakefile
@@ -73,7 +73,7 @@ import numpy as np
 import itertools as iter
 import math
 import csv
-
+import subprocess
 #Declare alleles variable to establish a later used wildcard
 alleles = ['Biallelic']
 #alleles = ['Biallelic','Triallelic'] #old alleles variable used when also considering
@@ -258,7 +258,7 @@ rule Filter0_separateBiallelics:
         chunked = tempFolder + '{chr}_{chunk}_filtered.vcf.gz'
         #chunked = tempFolder + '{chr}_{chunk}_chunked.vcf.gz'
     output:
-        biallelic_Filter0 = tempFolder + '{chr}' + '_{chunk}_Biallelic.recode.vcf.gz',
+        biallelic_vcf = tempFolder + '{chr}' + '_{chunk}_Biallelic.recode.vcf.gz',
         stats = statsFolder + '{chr}' + '_{chunk}_separateBiallelic.stats.txt'
     #group: "per_chunk_filter"
     shell:
@@ -275,7 +275,7 @@ rule Filter0_separateBiallelics:
 #     input:
 #         vcfgz = inFolder + '{chr}.vcf.gz'
 #     output:
-#         triallelic_Filter0 = outFolder + 'chrAll_Triallelic_QCFinished.recode.vcf',
+#         triallelic_vcf = outFolder + 'chrAll_Triallelic_QCFinished.recode.vcf',
 #         stats = statsFolder + '{chr}' + '_Triallelic_Filter0_stats.txt'
 #     params:
 #         unsplit_file = tempFolder + '{chr}' + '_Triallelic_unsplit.recode.vcf.gz'
@@ -293,24 +293,25 @@ rule Filter0_separateBiallelics:
 #7) Keep sites that "PASS" by GATK
 rule Filter1_GATK_PASS:
     input:
-        Filter0 = tempFolder + '{chr}_{chunk}_Biallelic.recode.vcf.gz'
+        vcf = tempFolder + '{chr}_{chunk}_Biallelic.recode.vcf.gz'
     output:
-        Filter1 = tempFolder + '{chr}_{chunk}_Filter1.recode.vcf.gz',
+        vcf = tempFolder + '{chr}_{chunk}_Filter1.recode.vcf.gz',
         stats = statsFolder + '{chr}_{chunk}_Filter1_stats.txt'
     #group: "per_chunk_filter"
     shell:
         "ml vcftools/0.1.15;"
         "ml bcftools/1.9;"
-        "vcftools --gzvcf {input.Filter0} --remove-filtered-all --stdout --recode --recode-INFO-all | bgzip -c > {output.Filter1};"
-        #"bcftools view --threads 5 -f PASS {input.Filter0} > {output.Filter1};" #may speed up QC pipeline slightly. Haven't tested
-        "bcftools stats {output.Filter1} > {output.stats};"
+        "nLine=$(bcftools view -H {input.vcf} | wc -l); if [ $nLine == 0 ]; then cp {input.vcf} {output.vcf}; touch {output.stats}; exit 0; fi;"
+        "vcftools --gzvcf {input.vcf} --remove-filtered-all --stdout --recode --recode-INFO-all | bgzip -c > {output.vcf};"
+        #"bcftools view --threads 5 -f PASS {input.vcf} > {output.vcf};" #may speed up QC pipeline slightly. Haven't tested
+        "bcftools stats {output.vcf} > {output.stats};"
 
 #8) Filter for genotype level depth (DP) (GDP)
 rule Filter2_GDP:
     input:
-        Filter1 = tempFolder + '{chr}_{chunk}_Filter1.recode.vcf.gz'
+        vcf = tempFolder + '{chr}_{chunk}_Filter1.recode.vcf.gz'
     output:
-        Filter2 = tempFolder + '{chr}_{chunk}_Filter2.recode.vcf.gz',
+        vcf = tempFolder + '{chr}_{chunk}_Filter2.recode.vcf.gz',
         stats = statsFolder + '{chr}_{chunk}_Filter2_stats.txt'
     params:
         GDP_thresh = GDP
@@ -319,149 +320,159 @@ rule Filter2_GDP:
         "ml vcflib/v1.0.0-rc0; ml bcftools/1.9;"
 
         #tabix -f  must be created in this step and not step before because of how snakemake temporally creates files. Because tabix does not have an "output", it would be created before the output from the previous rule if placed within the rule.
-        "tabix -f  -p vcf {input.Filter1};"
-        "vcffilter -g \"DP > {params.GDP_thresh}\" {input.Filter1} | bgzip -c > {output.Filter2};"
-        "bcftools stats {output.Filter2} > {output.stats};"
+        "tabix -f  -p vcf {input.vcf};"
+        "nLine=$(bcftools view -H {input.vcf} | wc -l); if [ $nLine == 0 ]; then cp {input.vcf} {output.vcf}; touch {output.stats}; exit 0; fi;"
+        "vcffilter -g \"DP > {params.GDP_thresh}\" {input.vcf} | bgzip -c > {output.vcf};"
+        "bcftools stats {output.vcf} > {output.stats};"
 
 
 #9) Filter for Genome Quality (GQ)
 rule Filter3_GQ:
     input:
-        Filter2 = tempFolder + '{chr}_{chunk}_Filter2.recode.vcf.gz'
+        vcf = tempFolder + '{chr}_{chunk}_Filter2.recode.vcf.gz'
     output:
-        Filter3 = tempFolder + '{chr}_{chunk}_Filter3.recode.vcf.gz',
+        vcf = tempFolder + '{chr}_{chunk}_Filter3.recode.vcf.gz',
         stats = statsFolder + '{chr}_{chunk}_Filter3_stats.txt'
     params:
         GQ_thresh = GQ
     #group: "per_chunk_filter"
     shell:
         "ml vcflib/v1.0.0-rc0; ml bcftools/1.9;"
-        "tabix -f  -p vcf {input.Filter2};"
-        " if [ $(bcftools view -H {input.Filter2} | wc -l) == 0 ]; then cp {input.Filter2} {output.Filter3}; exit 0; fi"
-        "vcffilter -g \"GQ > {params.GQ_thresh}\" {input.Filter2} | bgzip -c > {output.Filter3};"
-        "bcftools stats {output.Filter3} > {output.stats};"
+        "tabix -f  -p vcf {input.vcf};"
+        "nLine=$(bcftools view -H {input.vcf} | wc -l); if [ $nLine == 0 ]; then cp {input.vcf} {output.vcf}; touch {output.stats}; exit 0; fi;"
+        "vcffilter -g \"GQ > {params.GQ_thresh}\" {input.vcf} | bgzip -c > {output.vcf};"
+        "bcftools stats {output.vcf} > {output.stats};"
 
 #10) Filter for SNP missingness
 rule Filter4_SNP_Missingess:
     input:
-        Filter3 = tempFolder + '{chr}_{chunk}_Filter3.recode.vcf.gz'
+        vcf = tempFolder + '{chr}_{chunk}_Filter3.recode.vcf.gz'
     output:
-        Filter4 = tempFolder + '{chr}_{chunk}_Filter4.recode.vcf.gz',
+        vcf = tempFolder + '{chr}_{chunk}_Filter4.recode.vcf.gz',
         stats = statsFolder + '{chr}_{chunk}_Filter4_stats.txt'
     params:
         MISS_THRESH_SNP = MISS_THRESH_SNP
     #group: "per_chunk_filter" 
     shell:
         "ml vcftools/0.1.15; ml bcftools/1.9;"
-        "vcftools --gzvcf {input.Filter3} --max-missing {params.MISS_THRESH_SNP} --stdout --recode --recode-INFO-all | bgzip -c > {output.Filter4};"
-        "bcftools stats {output.Filter4} > {output.stats};"
+        "nLine=$(bcftools view -H {input.vcf} | wc -l); if [ $nLine == 0 ]; then cp {input.vcf} {output.vcf}; touch {output.stats}; exit 0; fi;"        
+        "vcftools --gzvcf {input.vcf} --max-missing {params.MISS_THRESH_SNP} --stdout --recode --recode-INFO-all | bgzip -c > {output.vcf};"
+        "bcftools stats {output.vcf} > {output.stats};"
 
 #11) Filter for Overall Read Depth (DP) (ODP)
 rule Filter5_ODP:
     input:
-        Filter4 = tempFolder + '{chr}_{chunk}_Filter4.recode.vcf.gz'
+        vcf = tempFolder + '{chr}_{chunk}_Filter4.recode.vcf.gz'
     output:
-        Filter5 = tempFolder + '{chr}_{chunk}_Filter5.recode.vcf.gz',
+        vcf = tempFolder + '{chr}_{chunk}_Filter5.recode.vcf.gz',
         stats = statsFolder + '{chr}_{chunk}_Filter5_stats.txt'
     params:
         ODP_thresh = ODP
     #group: "per_chunk_filter"
     shell:
         "ml vcflib/v1.0.0-rc0; ml bcftools/1.9;"
-        "tabix -f  -p vcf {input.Filter4};"
-        "vcffilter -f \"DP > {params.ODP_thresh}\" {input.Filter4} | bgzip -c > {output.Filter5};"
-        "bcftools stats {output.Filter5} > {output.stats};"
+        "tabix -f  -p vcf {input.vcf};"
+        "nLine=$(bcftools view -H {input.vcf} | wc -l); if [ $nLine == 0 ]; then cp {input.vcf} {output.vcf}; touch {output.stats}; exit 0; fi;"
+        "vcffilter -f \"DP > {params.ODP_thresh}\" {input.vcf} | bgzip -c > {output.vcf};"
+        "bcftools stats {output.vcf} > {output.stats};"
 
 #12) Filter for Mapping Quality (MQ)
 rule Filter6_MQ:
     input:
-        Filter5 = tempFolder + '{chr}_{chunk}_Filter5.recode.vcf.gz'
+        vcf = tempFolder + '{chr}_{chunk}_Filter5.recode.vcf.gz'
     output:
-        Filter6 = tempFolder + '{chr}_{chunk}_Filter6.recode.vcf.gz',
+        vcf = tempFolder + '{chr}_{chunk}_Filter6.recode.vcf.gz',
         stats = statsFolder + '{chr}_{chunk}_Filter6_stats.txt'
     params:
         MQ_MAX_THRESH = MQ_MAX,
         MQ_MIN_THRESH = MQ_MIN
     #group: "per_chunk_filter"
     shell:
         "ml vcflib/v1.0.0-rc0; ml bcftools/1.9;"
-        "tabix -f  -p vcf {input.Filter5};"
-
-        #"vcffilter -f \"MQ > {params.MQ_MIN_THRESH} & MQ < {params.MQ_MAX_THRESH}\" {input.Filter5} | bgzip -c > {output.Filter6};"
-        "bcftools filter -i \"MQ > {params.MQ_MIN_THRESH} && MQ < {params.MQ_MAX_THRESH}\" {input.Filter5} | bgzip -c > {output.Filter6};"
-        "bcftools stats {output.Filter6} > {output.stats};"
+        "tabix -f  -p vcf {input.vcf};"
+        "nLine=$(bcftools view -H {input.vcf} | wc -l); if [ $nLine == 0 ]; then cp {input.vcf} {output.vcf}; touch {output.stats}; exit 0; fi;" 
+        #"vcffilter -f \"MQ > {params.MQ_MIN_THRESH} & MQ < {params.MQ_MAX_THRESH}\" {input.vcf} | bgzip -c > {output.vcf};"
+        "bcftools filter -i \"MQ > {params.MQ_MIN_THRESH} && MQ < {params.MQ_MAX_THRESH}\" {input.vcf} | bgzip -c > {output.vcf};"
+        "bcftools stats {output.vcf} > {output.stats};"
 
 #13) Separate out Indel files and SNP files in Biallelic files.
 rule Biallelic_Separate_Indels_and_SNPs:
     input:
-        Filter6 = tempFolder + '{chr}_{chunk}_Filter6.recode.vcf.gz'
+        vcf = tempFolder + '{chr}_{chunk}_Filter6.recode.vcf.gz'
     output:
-        Filter6_SNPs = tempFolder + '{chr}_{chunk}_Filter6_SNPs.recode.vcf.gz',
-        Filter6_SNPs_stats = statsFolder + '{chr}_{chunk}_Filter6_SNPs_stats.txt',
-        Filter6_Indels = tempFolder + '{chr}_{chunk}_Filter6_Indels.recode.vcf.gz',
-        Filter6_Indels_stats = statsFolder + '{chr}_{chunk}_Filter6_Indels_stats.txt'
+        vcf_SNPs = tempFolder + '{chr}_{chunk}_Filter6_SNPs.recode.vcf.gz',
+        vcf_SNPs_stats = statsFolder + '{chr}_{chunk}_Filter6_SNPs_stats.txt',
+        vcf_Indels = tempFolder + '{chr}_{chunk}_Filter6_Indels.recode.vcf.gz',
+        vcf_Indels_stats = statsFolder + '{chr}_{chunk}_Filter6_Indels_stats.txt'
     #group: "per_chunk_filter"
     shell:
         "ml vcftools/0.1.15; ml bcftools/1.9;"
+        "nLine=$(bcftools view -H {input.vcf} | wc -l); if [ $nLine == 0 ]; then cp {input.vcf} {output.vcf_SNPs}; cp {input.vcf} {output.vcf_Indels}; touch {output.vcf_SNPs_stats}; touch {output.vcf_Indels_stats}; exit 0; fi;"
+        "vcftools --gzvcf {input.vcf} --remove-indels --stdout --recode --recode-INFO-all | bgzip -c > {output.vcf_SNPs};"
+        "vcftools --gzvcf {input.vcf} --keep-only-indels --stdout --recode --recode-INFO-all | bgzip -c > {output.vcf_Indels};"
 
-        "vcftools --gzvcf {input.Filter6} --remove-indels --stdout --recode --recode-INFO-all | bgzip -c > {output.Filter6_SNPs};"
-        "vcftools --gzvcf {input.Filter6} --keep-only-indels --stdout --recode --recode-INFO-all | bgzip -c > {output.Filter6_Indels};"
-
-        "bcftools stats {output.Filter6_SNPs} > {output.Filter6_SNPs_stats};"
-        "bcftools stats {output.Filter6_Indels} > {output.Filter6_Indels_stats};"
+        "bcftools stats {output.vcf_SNPs} > {output.vcf_SNPs_stats};"
+        "bcftools stats {output.vcf_Indels} > {output.vcf_Indels_stats};"
 
 
 #14) Filter Biallelic SNPs for VQSLOD
 rule Biallelic_SNPs_Filter7_VQSLOD:
     input:
-        Filter6_SNPs = tempFolder + '{chr}_{chunk}_Filter6_SNPs.recode.vcf.gz'
+        vcf = tempFolder + '{chr}_{chunk}_Filter6_SNPs.recode.vcf.gz'
     output:
-        Filter7_SNPs = tempFolder + '{chr}_{chunk}_Filter7_SNPs.recode.vcf.gz',
+        vcf = tempFolder + '{chr}_{chunk}_Filter7_SNPs.recode.vcf.gz',
         stats = statsFolder + '{chr}_{chunk}_Filter7_SNPs_stats.txt'
     params:
         VQSLOD_thresh = VQSLOD
     #group: "per_chunk_filter"
     shell:
         "ml vcflib/v1.0.0-rc0; ml bcftools/1.9;"
-        "tabix -f -p vcf {input.Filter6_SNPs};"
-        "vcffilter -f \"VQSLOD > {params.VQSLOD_thresh}\" {input.Filter6_SNPs} | bgzip -c > {output.Filter7_SNPs};"
-        "bcftools stats {output.Filter7_SNPs} > {output.stats};"
+        "tabix -f -p vcf {input.vcf};"
+        "nLine=$(bcftools view -H {input.vcf} | wc -l); if [ $nLine == 0 ]; then cp {input.vcf} {output.vcf}; touch {output.stats}; exit 0; fi;"
+        "vcffilter -f \"VQSLOD > {params.VQSLOD_thresh}\" {input.vcf} | bgzip -c > {output.vcf};"
+        "bcftools stats {output.vcf} > {output.stats};"
 
 #15) Combine Biallelic Indels and SNPs
 rule Biallelic_Combine_Indels_and_SNPs:
     input:
-        Filter7_SNPs = tempFolder + '{chr}_{chunk}_Filter7_SNPs.recode.vcf.gz',
-        Filter6_Indels = tempFolder + '{chr}_{chunk}_Filter6_Indels.recode.vcf.gz'
+        SNPs = tempFolder + '{chr}_{chunk}_Filter7_SNPs.recode.vcf.gz',
+        Indels = tempFolder + '{chr}_{chunk}_Filter6_Indels.recode.vcf.gz'
     output:
-        Filter7 = tempFolder + '{chr}_{chunk}_CombineSNPsIndels.recode.vcf.gz',
+        vcf = tempFolder + '{chr}_{chunk}_CombineSNPsIndels.recode.vcf.gz',
         stats = statsFolder + '{chr}_{chunk}_CombineSNPsIndels_stats.txt'
     #group: "per_chunk_filter"
     shell:
         "ml bcftools/1.9;"
-        "bcftools concat {input.Filter6_Indels} {input.Filter7_SNPs} | bcftools sort | bgzip -c > {output.Filter7};"
-        "bcftools stats {output.Filter7} > {output.stats};"
+        "nLine=$(bcftools view -H {input.SNPs} | wc -l); if [ $nLine == 0 ]; then cp {input.SNPs} {output.vcf}; touch {output.stats}; exit 0; fi;"
+        "bcftools concat {input.Indels} {input.SNPs} | bcftools sort | bgzip -c > {output.vcf};"
+        "bcftools stats {output.vcf} > {output.stats};"
 
 #16) Filter via Inbreeding_Coef
 rule Filter8_Inbreeding_Coef:
     input:
-        Filter7 = tempFolder + '{chr}_{chunk}_CombineSNPsIndels.recode.vcf.gz'
+        vcf = tempFolder + '{chr}_{chunk}_CombineSNPsIndels.recode.vcf.gz'
     output:
-        Filter8 = tempFolder + '{chr}_{chunk}_Filter8.recode.vcf.gz',
+        vcf = tempFolder + '{chr}_{chunk}_Filter8.recode.vcf.gz',
         stats = statsFolder + '{chr}_{chunk}_Filter8_stats.txt'
     params:
         INBREEDING_COEF = INBREEDING_COEF
     #group: "per_chunk_filter"
     run:
-        shell("ml vcflib/v1.0.0-rc0; ml bcftools/1.9;tabix -f -p vcf {input.Filter7};")
-
-        #Negative values for inbreeding coeficient are expressed as (0 - [value]) due to how vcffilter is built. Hence, an if statement to separate negative from positive input INBREEDING_COEF values
-        if params.INBREEDING_COEF >= 0:
-            shell("ml vcflib/v1.0.0-rc0; ml bcftools/1.9;vcffilter -f \"InbreedingCoeff > {params.INBREEDING_COEF}\" {input.Filter7} |bgzip -c > {output.Filter8};")
-        else:
-            params.INBREEDING_COEF = -params.INBREEDING_COEF
-            shell("ml vcflib/v1.0.0-rc0; ml bcftools/1.9;vcffilter -f \"InbreedingCoeff > ( 0 - {params.INBREEDING_COEF} )\" {input.Filter7} |bgzip -c > {output.Filter8};")
-
-        shell("ml bcftools/1.9;bcftools stats {output.Filter8} > {output.stats};")
+        shell("ml vcflib/v1.0.0-rc0; ml bcftools/1.9;tabix -f -p vcf {input.vcf};")
+        cmd = "ml bcftools/1.9;bcftools view -H " + input.vcf + "| wc -l"
+        nLine = subprocess.check_output(cmd, shell = True)
+        nLine = int(nLine.decode("utf-8").strip() )
+        if nLine == 0:
+            shell("cp {input.vcf} {output.vcf}; touch {output.stats}")
+        else:     
+            #Negative values for inbreeding coeficient are expressed as (0 - [value]) due to how vcffilter is built. Hence, an if statement to separate negative from positive input INBREEDING_COEF values
+            if params.INBREEDING_COEF >= 0:
+                shell("ml vcflib/v1.0.0-rc0; ml bcftools/1.9;vcffilter -f \"InbreedingCoeff > {params.INBREEDING_COEF}\" {input.vcf} |bgzip -c > {output.vcf};")
+            else:
+                params.INBREEDING_COEF = -params.INBREEDING_COEF
+                shell("ml vcflib/v1.0.0-rc0; ml bcftools/1.9;vcffilter -f \"InbreedingCoeff > ( 0 - {params.INBREEDING_COEF} )\" {input.vcf} |bgzip -c > {output.vcf};")
+
+            shell("ml bcftools/1.9;bcftools stats {output.vcf} > {output.stats};")
 
 ## STEP 3: AGGREGATE ALL CHUNKS TOGETHER
 
@@ -531,7 +542,7 @@ rule Filter9_Sample_Missingness:
         bed = tempFolder + 'chrAll_Recombined.bed',
         bim = tempFolder + 'chrAll_Recombined.bim',
         fam = tempFolder + 'chrAll_Recombined.fam'
-        #Filter8 = tempFolder + 'all_Filter8_sorted.recode.vcf.gz'
+        #vcf = tempFolder + 'all_Filter8_sorted.recode.vcf.gz'
     output:
         bed = tempFolder + 'chrAll_Filter9.bed',
         bim = tempFolder + 'chrAll_Filter9.bim',
@@ -632,5 +643,4 @@ rule collateStats:
         script = "scripts/collate_all_stats.R"
     shell:
         "ml R/3.6.0;"
-        "Rscript {params.script} {statsFolder} {output}"
-
+        "Rscript {params.script} {statsFolder} {output}" 
diff --git a/cluster.yaml b/cluster.yaml
@@ -16,15 +16,15 @@ liftOverVCFs:
     mem: 3000
     time: '360'
 Biallelic_Combine_Indels_and_SNPs:
-    cores: 4
+    cores: 16
     mem: 3750
     time: '180'
 recombineChunks:
-    cores: 4
+    cores: 8
     mem: 3750
     time: '180'
 recombineChromosomes:
-    cores: 4
+    cores: 8
     mem: 3750
     time: '180'
 KingRelatedness:

diff --git a/snakejob b/snakejob
@@ -68,6 +68,7 @@ bsub=("bsub -K -J $jname:{rule}:{wildcards}"
 
 snakemake -u cluster.yaml --cluster-sync "${bsub[*]}" \
   --local-cores 4 --max-jobs-per-second 5 \
+  --keep-going \
   --jobs 1000 \
   --restart-times 3 \
   -s $snakefile --configfile $config \