Skip to content

Commit

Permalink
implement variant normalization within pipeline to avoid another call…
Browse files Browse the repository at this point in the history
… to nextflow
  • Loading branch information
priesgo committed Jun 22, 2021
1 parent c6683a5 commit a600091
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 42 deletions.
22 changes: 0 additions & 22 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,25 +114,3 @@ Output:
- Wilm, A., Aw, P. P. K., Bertrand, D., Yeo, G. H. T., Ong, S. H., Wong, C. H., Khor, C. C., Petric, R., Hibberd, M. L., & Nagarajan, N. (2012). LoFreq: A sequence-quality aware, ultra-sensitive variant caller for uncovering cell-population heterogeneity from high-throughput sequencing datasets. Nucleic Acids Research, 40(22), 11189–11201. https://doi.org/10.1093/nar/gks918
- Grubaugh, N. D., Gangavarapu, K., Quick, J., Matteson, N. L., De Jesus, J. G., Main, B. J., Tan, A. L., Paul, L. M., Brackney, D. E., Grewal, S., Gurfield, N., Van Rompay, K. K. A., Isern, S., Michael, S. F., Coffey, L. L., Loman, N. J., & Andersen, K. G. (2019). An amplicon-based sequencing framework for accurately measuring intrahost virus diversity using PrimalSeq and iVar. Genome Biology, 20(1), 8. https://doi.org/10.1186/s13059-018-1618-7
- Shifu Chen, Yanqing Zhou, Yaru Chen, Jia Gu; fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics, Volume 34, Issue 17, 1 September 2018, Pages i884–i890, https://doi.org/10.1093/bioinformatics/bty560


## Known issues

The variant normalization workflow may appear as corrupt when calling to it concurrently.
A workaround to this situation is to clone the tronflow dependencies and let covigator pipeline know of the location of the `main.nf` files.

For instance:
```
cd /covigator/dependencies
git clone --branch v1.4.1 https://github.com/TRON-Bioinformatics/tronflow-bwa.git
git clone --branch v1.5.0 https://github.com/TRON-Bioinformatics/tronflow-bam-preprocessing.git
git clone --branch v1.1.1 https://github.com/TRON-Bioinformatics/tronflow-variant-normalization.git
```

And then use the following parameters:
```
nextflow run tron-bioinformatics/covigator-ngs-pipeline -profile conda \
--tronflow_bwa /covigator/dependencies/tronflow-bwa/main.nf \
--tronflow_bam_preprocessing /covigator/dependencies/tronflow-bam-preprocessing/main.nf \
--tronflow_variant_normalization /covigator/dependencies/tronflow-variant-normalization/main.nf ...
```
11 changes: 5 additions & 6 deletions bin/assembly_variant_caller.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class Variant:

def to_vcf_line(self):
# transform 0-based position to 1-based position
return CHROMOSOME, str(self.position + 1), ".", self.reference, self.alternate, ".", "PASS", "."
return [CHROMOSOME, str(self.position + 1), ".", self.reference, self.alternate, ".", "PASS", "."]


class AssemblyVariantCaller:
Expand Down Expand Up @@ -57,7 +57,7 @@ def _call_mutations(self, alignment: PairwiseAlignment) -> List[Variant]:
if prev_ref_end is not None and prev_ref_end != ref_start:
# deletion
if ref_start - prev_ref_end <= 50: # skips deletions longer than 50 bp
ref = reference[prev_ref_end - 1: ref_start]
ref = str(reference[prev_ref_end - 1: ref_start])
if not any(self._is_ambiguous_base(r) for r in ref): # do not call deletions with Ns
variants.append(Variant(
position=prev_ref_end - 1,
Expand All @@ -67,7 +67,7 @@ def _call_mutations(self, alignment: PairwiseAlignment) -> List[Variant]:
# insertion
if alt_start - prev_alt_end <= 50: # skips insertions longer than 50 bp
ref = reference[prev_ref_end - 1]
alt = alternate[prev_alt_end:alt_start]
alt = str(alternate[prev_alt_end:alt_start])
# do not call insertions with ambiguous bases
if not self._is_ambiguous_base(ref) and not any(self._is_ambiguous_base(a) for a in alt):
variants.append(Variant(
Expand All @@ -92,7 +92,6 @@ def _is_ambiguous_base(self, base):
return base not in "ACGT"



def write_vcf(mutations, output_vcf):
with open(output_vcf, "w") as vcf_out:
header = (
Expand All @@ -103,8 +102,8 @@ def write_vcf(mutations, output_vcf):
)
for row in header:
vcf_out.write(row + "\n")
for row in mutations:
vcf_out.write("\t".join(row.to_vcf_line()) + "\n")
for mutation in mutations:
vcf_out.write("\t".join(mutation.to_vcf_line()) + "\n")


def main():
Expand Down
3 changes: 2 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@ dependencies:
- bioconda::gatk4=4.2.0.0
- bioconda::ivar=1.3.1
- bioconda::fastp=0.20.1
- bioconda::snpeff=5.0
- bioconda::snpeff=5.0
- bioconda::vt=0.57721
27 changes: 15 additions & 12 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -411,20 +411,23 @@ process variantNormalization {
set name, file(vcf) from vcfs_to_normalize

output:
set name, file("${vcf.baseName}.normalized.vcf") into normalized_vcf_files
set name, file("${vcf.baseName}.normalized.vcf") into normalized_vcf_files

script:
"""
# initial sort of the VCF
bcftools sort ${vcf} | \
# checks reference genome, decompose multiallelics, trim and left align indels
bcftools norm --multiallelics -any --check-ref e --fasta-ref ${params.reference} \
--old-rec-tag OLD_CLUMPED - | \
# decompose complex variants
vt decompose_blocksub -a -p - | \
# remove duplicates after normalisation
bcftools norm --rm-dup exact -o ${vcf.baseName}.normalized.vcf -
"""
# --input_files needs to be forced, otherwise it is inherited from profile in tests
nextflow run ${params.tronflow_variant_normalization} \
--input_vcf ${vcf} \
--input_files false \
--output . \
--reference ${reference} \
-profile ${workflow.profile} \
-work-dir ${workflow.workDir}
mv ${vcf.baseName}/${vcf.baseName}.normalized.vcf ${vcf.baseName}.normalized.vcf
"""
}

process variantAnnotation {
Expand Down
1 change: 0 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

params.tronflow_bwa = "tron-bioinformatics/tronflow-bwa -r v1.4.1"
params.tronflow_bam_preprocessing = "tron-bioinformatics/tronflow-bam-preprocessing -r v1.5.0"
params.tronflow_variant_normalization = "tron-bioinformatics/tronflow-variant-normalization -r v1.1.1"

params.reference = "$baseDir/reference/Sars_cov_2.ASM985889v3.dna.toplevel.fa"
params.gff = "$baseDir/reference/Sars_cov_2.ASM985889v3.101.gff3"
Expand Down

0 comments on commit a600091

Please sign in to comment.