Skip to content

Commit 0d947ec

Browse files
committed
Jasmine benchmarks seems worse. Back to SURVIVOR for now.
1 parent b40ef9b commit 0d947ec

File tree

5 files changed

+51
-20
lines changed

5 files changed

+51
-20
lines changed
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
OUT_FOLDER: "results_study_case_1_nopanel"
2+
SAMPLE_KEY: "example/aj_trio/study_case_1/sampleKey.txt"
3+
4+
TOOLS:
5+
- "manta"
6+
- "smoove"
7+
- "delly"
8+
9+
# Reference genome files
10+
REFERENCE_FASTA: "data/ref/human_g1k_v37.fasta"
11+
REF_BUILD: "37"
12+
13+
# Gencode GTF for SV annotation
14+
GENCODE_GTF: "data/annotation/gtf/gencode.v38lift37.annotation.nochr.gtf.gz"

workflow/Snakefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import glob
33
import yaml
44
from snakemake.io import *
55

6-
version="0.3"
6+
version="0.4"
77

88
###################################################################################################
99
## Check inputs

workflow/envs/jasmine.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
channels:
22
- bioconda
33
dependencies:
4-
- jasminesv=1.1.0
4+
- jasminesv=1.1.2
55
- bcftools=1.9
66
- samtools=1.9

workflow/rules/benchmark.smk

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
###################################################################################################
22
## Benchmarking against GiaB
3-
###################################################################################################
43
# Deactivated in the pipeline.
4+
###################################################################################################
5+
# Default
6+
GIAB_VCF = "data/bench/HG002_SVs_Tier1_v0.6.vcf.gz"
7+
GIAB_BED = "data/bench/HG002_SVs_Tier1_v0.6.22XY.bed"
8+
9+
# Benchmark raw SV callers
510
rule benchmark_raw:
611
input:
712
vcf = OUT_FOLDER + "/sv_discovery/{tool}/{sample}/{sample}.{tool}.vcf.gz"
@@ -10,8 +15,8 @@ rule benchmark_raw:
1015
conda:
1116
SNAKEDIR + "envs/truvari.yaml"
1217
params:
13-
base = "data/bench/HG002_SVs_Tier1_v0.6.vcf.gz",
14-
bed = "data/bench/HG002_SVs_Tier1_v0.6.22XY.bed",
18+
base = GIAB_VCF,
19+
bed = GIAB_BED,
1520
ref = REFERENCE_FASTA,
1621
outdir = OUT_FOLDER + "/bench/{sample}/sv_discovery/{tool}/"
1722
shell:
@@ -27,14 +32,15 @@ rule benchmark_raw:
2732
-r 2000 \
2833
--giabreport; "
2934

35+
# Benchmark each SV caller after genotyping (e.g. Manta + GraphTyper, or Delly + GraphTyper)
3036
rule benchmark_raw_gt:
3137
input:
3238
vcf = OUT_FOLDER + "/sv_discovery/{tool}/{sample}/{sample}.{tool}.gt.vcf.gz"
3339
output:
3440
OUT_FOLDER + "/bench/{sample}/sv_discovery/{tool}.gt/giab_report.txt"
3541
params:
36-
base = "data/bench/HG002_SVs_Tier1_v0.6.vcf.gz",
37-
bed = "data/bench/HG002_SVs_Tier1_v0.6.22XY.bed",
42+
base = GIAB_VCF,
43+
bed = GIAB_BED,
3844
ref = REFERENCE_FASTA,
3945
outdir = OUT_FOLDER + "/bench/{sample}/sv_discovery/{tool}.gt/"
4046
conda:
@@ -54,14 +60,15 @@ rule benchmark_raw_gt:
5460
-r 2000 \
5561
--giabreport; "
5662

63+
# Benchmark each SV caller after joint-genotying. This may also include SV reference panels, if specified.
5764
rule benchmark_gt:
5865
input:
5966
vcf = OUT_FOLDER + "/sv_genotyping/{sample}/{sample}.vcf.gz"
6067
output:
6168
OUT_FOLDER + "/bench/{sample}/sv_genotyping/{tool}/giab_report.txt"
6269
params:
63-
base = "data/bench/HG002_SVs_Tier1_v0.6.vcf.gz",
64-
bed = "data/bench/HG002_SVs_Tier1_v0.6.22XY.bed",
70+
base = GIAB_VCF,
71+
bed = GIAB_BED,
6572
ref = REFERENCE_FASTA,
6673
outdir = OUT_FOLDER + "/bench/{sample}/sv_genotyping/{tool}/"
6774
conda:
@@ -81,19 +88,23 @@ rule benchmark_gt:
8188
-r 2000 \
8289
--giabreport; "
8390

91+
# Benchmark SVs from the final genotyped and merged call set.
8492
rule benchmark_merged_gt:
8593
input:
8694
vcf = OUT_FOLDER + "/merged_cohort/gt_merged.vcf.gz"
8795
output:
88-
OUT_FOLDER + "/bench/{sample}/merged/giab_report.txt"
96+
vcf = OUT_FOLDER + "/bench/{sample}/merged/vcf.gz",
97+
report = OUT_FOLDER + "/bench/{sample}/merged/giab_report.txt"
8998
params:
90-
base = "data/bench/HG002_SVs_Tier1_v0.6.vcf.gz",
91-
bed = "data/bench/HG002_SVs_Tier1_v0.6.22XY.bed",
99+
base = GIAB_VCF,
100+
bed = GIAB_BED,
92101
ref = REFERENCE_FASTA,
93102
outdir = OUT_FOLDER + "/bench/{sample}/merged/"
94103
conda:
95104
SNAKEDIR + "envs/truvari.yaml"
96105
shell:
106+
"bcftools view -c1 -s {wildcards.sample} -Oz -o {output.vcf} {input.vcf}; "
107+
"tabix -p vcf {output.vcf}; "
97108
"rm -rf {params.outdir}; "
98109
"truvari bench \
99110
-b {params.base} \
@@ -104,3 +115,5 @@ rule benchmark_merged_gt:
104115
-p 0 \
105116
-r 2000 \
106117
--giabreport; "
118+
119+

workflow/rules/genotype.smk

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ rule merge_samples_01:
5454
shell("ls {sv_ref} >> {output.vcf_list}; ")
5555

5656
# Options of using SURVIVOR or Jasmine. Jasmine by default.
57-
useJAMINE=True
57+
useJAMINE=False
5858
if (useJAMINE):
5959
checkpoint merge_samples_02:
6060
input:
@@ -69,6 +69,7 @@ if (useJAMINE):
6969
vcf_list = OUT_FOLDER + "/merged_cohort/vcf.list",
7070
shell:
7171
"jasmine file_list={params.vcf_list} out_file={output.vcf}; "
72+
"jasmine --dup_to_ins --postprocess_only out_file={output.vcf}; "
7273
"bcftools sort -Oz -o {output.vcfgz} {output.vcf}; "
7374
"tabix -p vcf {output.vcfgz}; "
7475
else:
@@ -83,7 +84,7 @@ else:
8384
SNAKEDIR + "envs/survivor.yaml"
8485
params:
8586
vcf_list = OUT_FOLDER + "/merged_cohort/vcf.list",
86-
breakpoint_dist = "500",
87+
breakpoint_dist = "100",
8788
min_num_calls = "1",
8889
use_type = "1",
8990
use_strand = "1",
@@ -146,14 +147,18 @@ rule genotype_02:
146147
output:
147148
vcf = OUT_FOLDER + "/sv_genotyping/{sample}/{sample}.vcf",
148149
vcfgz = OUT_FOLDER + "/sv_genotyping/{sample}/{sample}.vcf.gz",
149-
tbi = OUT_FOLDER + "/sv_genotyping/{sample}/{sample}.vcf.gz.tbi"
150+
tbi = OUT_FOLDER + "/sv_genotyping/{sample}/{sample}.vcf.gz.tbi",
151+
filt_vcfgz = OUT_FOLDER + "/sv_genotyping/{sample}/{sample}.filt.vcf.gz",
152+
filt_tbi = OUT_FOLDER + "/sv_genotyping/{sample}/{sample}.filt.vcf.gz.tbi"
150153
conda:
151154
SNAKEDIR + "envs/graphtyper.yaml"
152155
priority: 0
153156
shell:
154-
"graphtyper vcf_concatenate {input.vcf} | bcftools sort > {output.vcf}; "
157+
"graphtyper vcf_concatenate {input.vcf} | bcftools view --include \"SVMODEL='AGGREGATED'\" | bcftools sort > {output.vcf}; "
155158
"bgzip -c {output.vcf} > {output.vcfgz}; "
156159
"tabix -p vcf {output.vcfgz}; "
160+
"bcftools view -e QUAL==0 -Oz -o {output.filt_vcfgz} {output.vcfgz}; "
161+
"tabix -p vcf {output.filt_vcfgz}; "
157162

158163
rule merge_genotyped_samples:
159164
input:
@@ -168,11 +173,10 @@ rule merge_genotyped_samples:
168173
conda:
169174
SNAKEDIR + "envs/graphtyper.yaml"
170175
shell:
171-
"graphtyper vcf_merge {input.vcf} --sv | \
172-
grep -E -v \"BREAKPOINT|COVERAGE\" | grep -E -v \"VarType=XG|VarType=IG\" > {output.vcf_full}; "
176+
"graphtyper vcf_merge {input.vcf} --sv | grep -E -v \"VarType=XG|VarType=IG\" > {output.vcf_full}; "
173177
"bcftools sort -Oz -o {output.vcfgz_full} {output.vcf_full}; "
174178
"tabix -p vcf {output.vcfgz_full}; "
175-
"bcftools view -f PASS {output.vcfgz_full} > {output.vcf}; "
179+
"bcftools view -e QUAL==0 {output.vcfgz_full} > {output.vcf}; "
176180
"bgzip -c {output.vcf} > {output.vcfgz}; "
177181
"tabix -p vcf {output.vcfgz}; "
178182

@@ -207,6 +211,6 @@ rule genotype_discovery:
207211
--output={params.outdir}; \
208212
done; "
209213
"graphtyper vcf_concatenate --no_sort {params.outdir}/*/*.vcf.gz | \
210-
grep -E -v \"BREAKPOINT|COVERAGE\" | bcftools sort > {output.vcf};"
214+
bcftools view --include \"SVMODEL='AGGREGATED'\" | bcftools view -e QUAL==0 | bcftools sort > {output.vcf};"
211215
"bgzip -c {output.vcf} > {output.vcfgz}; "
212216
"tabix -p vcf {output.vcfgz}; "

0 commit comments

Comments
 (0)