diff --git a/.github/workflows/automated_tests.yml b/.github/workflows/automated_tests.yml index a4cedde..9878af9 100644 --- a/.github/workflows/automated_tests.yml +++ b/.github/workflows/automated_tests.yml @@ -15,7 +15,7 @@ jobs: - uses: conda-incubator/setup-miniconda@v2 with: auto-update-conda: true - channels: defaults,conda-forge,bioconda + channels: conda-forge,bioconda,defaults - name: Install dependencies run: | apt-get update && apt-get --assume-yes install wget build-essential procps software-properties-common libgsl0-dev @@ -29,4 +29,5 @@ jobs: key: ${{ runner.os }}-covigator-ngs-pipeline - name: Run tests run: | + export NXF_VER=22.04.5 make \ No newline at end of file diff --git a/README.md b/README.md index f2a16dd..1ded9db 100644 --- a/README.md +++ b/README.md @@ -45,8 +45,8 @@ analysed differently. Also, the output data that we can obtain from each of thes When FASTQ files are provided the pipeline includes the following steps: - **Trimming**. `fastp` is used to trim reads with default values. This step also includes QC filtering. -- **Alignment**. `BWA mem` is used for the alignment of single or paired end samples. -- **BAM preprocessing**. BAM files are prepared and duplicate reads are marked using GATK and Picard tools. +- **Alignment**. `BWA mem 2` is used for the alignment of single or paired end samples. +- **BAM preprocessing**. BAM files are prepared and duplicate reads are marked using GATK and Sambamba tools. - **Primer trimming**. When a BED with primers is provided, these are trimmed from the reads using iVar. This is applicable to the results from all variant callers. - **Coverage analysis**. `samtools coverage` and `samtools depth` are used to compute the horizontal and vertical coverage respectively. @@ -333,6 +333,7 @@ Optional input: * --skip_gatk: skips calling variants with GATK * --skip_bcftools: skips calling variants with BCFTools * --skip_ivar: skips calling variants with iVar + * --skip_pangolin: skips lineage determination with pangolin * --match_score: global alignment match score, only applicable for assemblies (default: 2) * --mismatch_score: global alignment mismatch score, only applicable for assemblies (default: -1) * --open_gap_score: global alignment open gap score, only applicable for assemblies (default: -3) @@ -347,7 +348,6 @@ Output: * Only when FASTQs are provided: * FASTP statistics * Depth and breadth of coverage analysis results - * Picard's deduplication metrics ``` @@ -411,7 +411,7 @@ is a technical artifact that would need to be avoided. ## Bibliography - Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316–319. https://doi.org/10.1038/nbt.3820 -- Li H. and Durbin R. (2009) Fast and accurate short read alignment with Burrows-Wheeler Transform. Bioinformatics, 25:1754-60. [PMID: 19451168] +- Vasimuddin Md, Sanchit Misra, Heng Li, Srinivas Aluru. Efficient Architecture-Aware Acceleration of BWA-MEM for Multicore Systems. IEEE Parallel and Distributed Processing Symposium (IPDPS), 2019. - Adrian Tan, Gonçalo R. Abecasis and Hyun Min Kang. Unified Representation of Genetic Variants. Bioinformatics (2015) 31(13): 2202-2204](http://bioinformatics.oxfordjournals.org/content/31/13/2202) and uses bcftools [Li, H. (2011). A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data. Bioinformatics (Oxford, England), 27(21), 2987–2993. 10.1093/bioinformatics/btr509 - Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. Twelve years of SAMtools and BCFtools. Gigascience. 2021 Feb 16;10(2):giab008. doi: 10.1093/gigascience/giab008. PMID: 33590861; PMCID: PMC7931819. - Van der Auwera GA, Carneiro M, Hartl C, Poplin R, del Angel G, Levy-Moonshine A, Jordan T, Shakir K, Roazen D, Thibault J, Banks E, Garimella K, Altshuler D, Gabriel S, DePristo M. (2013). From FastQ Data to High-Confidence Variant Calls: The Genome Analysis Toolkit Best Practices Pipeline. Curr Protoc Bioinformatics, 43:11.10.1-11.10.33. DOI: 10.1002/0471250953.bi1110s43. @@ -422,3 +422,4 @@ is a technical artifact that would need to be avoided. - Shifu Chen, Yanqing Zhou, Yaru Chen, Jia Gu; fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics, Volume 34, Issue 17, 1 September 2018, Pages i884–i890, https://doi.org/10.1093/bioinformatics/bty560 - Kwon, S. Bin, & Ernst, J. (2021). Single-nucleotide conservation state annotation of the SARS-CoV-2 genome. Communications Biology, 4(1), 1–11. https://doi.org/10.1038/s42003-021-02231-w - Cock, P. J., Antao, T., Chang, J. T., Chapman, B. A., Cox, C. J., Dalke, A., et al. (2009). Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics, 25(11), 1422–1423. +- Artem Tarasov, Albert J. Vilella, Edwin Cuppen, Isaac J. Nijman, Pjotr Prins, Sambamba: fast processing of NGS alignment formats, Bioinformatics, Volume 31, Issue 12, 15 June 2015, Pages 2032–2034, https://doi.org/10.1093/bioinformatics/btv098 diff --git a/main.nf b/main.nf index 50771eb..82d4308 100755 --- a/main.nf +++ b/main.nf @@ -26,6 +26,7 @@ params.skip_lofreq = false params.skip_ivar = false params.skip_bcftools = false params.skip_gatk = false +params.skip_pangolin = false // references params.reference = false @@ -191,7 +192,8 @@ workflow { bam_files = ALIGNMENT_SINGLE_END.out } BAM_PREPROCESSING(bam_files, reference) - preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bam + preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bams + if (primers) { PRIMER_TRIMMING_IVAR(preprocessed_bams, primers) preprocessed_bams = PRIMER_TRIMMING_IVAR.out.trimmed_bam @@ -223,12 +225,16 @@ workflow { } // pangolin from VCF - VCF2FASTA(vcfs_to_normalize, reference) - PANGOLIN_LINEAGE(VCF2FASTA.out) + if (!params.skip_pangolin) { + VCF2FASTA(vcfs_to_normalize, reference) + PANGOLIN_LINEAGE(VCF2FASTA.out) + } } else if (input_fastas) { - // pangolin from fasta - PANGOLIN_LINEAGE(input_fastas) + if (!params.skip_pangolin) { + // pangolin from fasta + PANGOLIN_LINEAGE(input_fastas) + } // assembly variant calling VARIANT_CALLING_ASSEMBLY(input_fastas, reference) diff --git a/modules/02_bwa.nf b/modules/02_bwa.nf index a5d8540..b5714ad 100644 --- a/modules/02_bwa.nf +++ b/modules/02_bwa.nf @@ -7,7 +7,7 @@ process ALIGNMENT_PAIRED_END { memory params.memory tag "${name}" - conda (params.enable_conda ? "bioconda::bwa=0.7.17 bioconda::samtools=1.12" : null) + conda (params.enable_conda ? "bioconda::bwa-mem2=2.2.1 bioconda::samtools=1.12" : null) input: tuple val(name), file(fastq1), file(fastq2) @@ -17,7 +17,7 @@ process ALIGNMENT_PAIRED_END { tuple val(name), file("${name}.bam") """ - bwa mem -t ${task.cpus} ${reference} ${fastq1} ${fastq2} | \ + bwa-mem2 mem -t ${task.cpus} ${reference} ${fastq1} ${fastq2} | \ samtools view -uS - | \ samtools sort - > ${name}.bam """ @@ -28,7 +28,7 @@ process ALIGNMENT_SINGLE_END { memory params.memory tag "${name}" - conda (params.enable_conda ? "bioconda::bwa=0.7.17 bioconda::samtools=1.12" : null) + conda (params.enable_conda ? "bioconda::bwa-mem2=2.2.1 bioconda::samtools=1.12" : null) input: tuple val(name), file(fastq1) @@ -38,7 +38,7 @@ process ALIGNMENT_SINGLE_END { tuple val(name), file("${name}.bam") """ - bwa mem -t ${task.cpus} ${reference} ${fastq1} | \ + bwa-mem2 mem -t ${task.cpus} ${reference} ${fastq1} | \ samtools view -uS - | \ samtools sort - > ${name}.bam """ diff --git a/modules/03_bam_preprocessing.nf b/modules/03_bam_preprocessing.nf index d7155bd..8e18eb4 100644 --- a/modules/03_bam_preprocessing.nf +++ b/modules/03_bam_preprocessing.nf @@ -7,21 +7,19 @@ params.keep_intermediate = false process BAM_PREPROCESSING { cpus params.cpus memory params.memory + tag "${name}" if (params.keep_intermediate) { publishDir "${params.output}", mode: "copy" } - publishDir "${params.output}", mode: "copy", pattern: "${name}.deduplication_metrics.txt" - tag "${name}" - conda (params.enable_conda ? "bioconda::gatk4=4.2.0.0" : null) + conda (params.enable_conda ? "bioconda::gatk4=4.2.0.0 bioconda::sambamba=0.8.2" : null) input: tuple val(name), file(bam) val(reference) output: - tuple val(name), file("${name}.preprocessed.bam"), file("${name}.preprocessed.bai"), emit: preprocessed_bam - file("${name}.deduplication_metrics.txt") + tuple val(name), file("${name}.preprocessed.bam"), file("${name}.preprocessed.bai"), emit: preprocessed_bams """ mkdir tmp @@ -34,29 +32,28 @@ process BAM_PREPROCESSING { --java-options '-Xmx${params.memory} -Djava.io.tmpdir=./tmp' \ --VALIDATION_STRINGENCY SILENT \ --INPUT /dev/stdin \ - --OUTPUT ${bam.baseName}.prepared.bam \ + --OUTPUT ${name}.prepared.bam \ --REFERENCE_SEQUENCE ${reference} \ --RGPU 1 \ --RGID 1 \ --RGSM ${name} \ --RGLB 1 \ - --RGPL ILLUMINA \ - --SORT_ORDER queryname - - gatk MarkDuplicates \ - --java-options '-Xmx${params.memory} -Djava.io.tmpdir=./tmp' \ - --INPUT ${bam.baseName}.prepared.bam \ - --METRICS_FILE ${name}.deduplication_metrics.txt \ - --OUTPUT ${bam.baseName}.dedup.bam \ - --REMOVE_DUPLICATES true - - gatk SortSam \ - --java-options '-Xmx${params.memory} -Djava.io.tmpdir=./tmp' \ - --INPUT ${bam.baseName}.dedup.bam \ - --OUTPUT ${bam.baseName}.preprocessed.bam \ - --SORT_ORDER coordinate - - gatk BuildBamIndex --INPUT ${bam.baseName}.preprocessed.bam + --RGPL ILLUMINA + + # removes duplicates (sorted from the alignment process) + sambamba markdup \ + -r \ + -t ${task.cpus} \ + --tmpdir=./tmp \ + ${name}.prepared.bam ${name}.preprocessed.bam + + # removes intermediate BAM files + rm -f ${name}.prepared.bam + + # indexes the output BAM file + sambamba index \ + -t ${task.cpus} \ + ${name}.preprocessed.bam ${name}.preprocessed.bai """ } diff --git a/modules/06_variant_annotation.nf b/modules/06_variant_annotation.nf index a729db5..41abc6d 100644 --- a/modules/06_variant_annotation.nf +++ b/modules/06_variant_annotation.nf @@ -33,11 +33,13 @@ process VARIANT_ANNOTATION { output: tuple val(name), val(caller), file("${name}.${caller}.annotated.vcf"), emit: annotated_vcfs + script: + memory = "${params.memory}".replaceAll(" ", "").toLowerCase() """ # for some reason the snpEff.config file needs to be in the folder where snpeff runs... cp ${snpeff_config} . - snpEff eff -dataDir ${snpeff_data} \ + snpEff eff -Xmx${memory} -dataDir ${snpeff_data} \ -noStats -no-downstream -no-upstream -no-intergenic -no-intron -onlyProtein -hgvs1LetterAa -noShiftHgvs \ ${snpeff_organism} ${vcf} > ${name}.${caller}.annotated.vcf """ diff --git a/nextflow.config b/nextflow.config index 2f4a986..dfe5185 100644 --- a/nextflow.config +++ b/nextflow.config @@ -62,7 +62,7 @@ process.shell = ['/bin/bash', '-euo', 'pipefail'] cleanup = true conda.createTimeout = '1 h' -VERSION = '0.13.0' +VERSION = '0.14.0' manifest { name = 'TRON-Bioinformatics/covigator-ngs-pipeline' @@ -111,6 +111,7 @@ Optional input: * --skip_gatk: skips calling variants with GATK * --skip_bcftools: skips calling variants with BCFTools * --skip_ivar: skips calling variants with iVar + * --skip_pangolin: skips lineage determination with pangolin * --match_score: global alignment match score, only applicable for assemblies (default: 2) * --mismatch_score: global alignment mismatch score, only applicable for assemblies (default: -1) * --open_gap_score: global alignment open gap score, only applicable for assemblies (default: -3) diff --git a/reference/Sars_cov_2.ASM985889v3.dna.toplevel.fa.0123 b/reference/Sars_cov_2.ASM985889v3.dna.toplevel.fa.0123 new file mode 100644 index 0000000..c9c4735 Binary files /dev/null and b/reference/Sars_cov_2.ASM985889v3.dna.toplevel.fa.0123 differ diff --git a/reference/Sars_cov_2.ASM985889v3.dna.toplevel.fa.bwt.2bit.64 b/reference/Sars_cov_2.ASM985889v3.dna.toplevel.fa.bwt.2bit.64 new file mode 100644 index 0000000..f7cd697 Binary files /dev/null and b/reference/Sars_cov_2.ASM985889v3.dna.toplevel.fa.bwt.2bit.64 differ diff --git a/tests/test_00_test_mode.sh b/tests/test_00_test_mode.sh index 794a961..1bb2a19 100644 --- a/tests/test_00_test_mode.sh +++ b/tests/test_00_test_mode.sh @@ -23,7 +23,6 @@ test -s covigator_fastq_test/test.fastp_stats.json || { echo "Missing VCF output test -s covigator_fastq_test/test.fastp_stats.html || { echo "Missing VCF output file!"; exit 1; } test -s covigator_fastq_test/test.coverage.tsv || { echo "Missing coverage output file!"; exit 1; } test -s covigator_fastq_test/test.depth.tsv || { echo "Missing depth output file!"; exit 1; } -test -s covigator_fastq_test/test.deduplication_metrics.txt || { echo "Missing deduplication metrics file!"; exit 1; } test -s covigator_fastq_test/test.bcftools.pangolin.csv || { echo "Missing pangolin output file!"; exit 1; } test -s covigator_fastq_test/test.gatk.pangolin.csv || { echo "Missing pangolin output file!"; exit 1; } test -s covigator_fastq_test/test.lofreq.pangolin.csv || { echo "Missing pangolin output file!"; exit 1; } diff --git a/tests/test_01.sh b/tests/test_01.sh index c2ee242..979cabb 100644 --- a/tests/test_01.sh +++ b/tests/test_01.sh @@ -21,7 +21,6 @@ test -s $output/test_data.fastp_stats.json || { echo "Missing VCF output file!"; test -s $output/test_data.fastp_stats.html || { echo "Missing VCF output file!"; exit 1; } test -s $output/test_data.coverage.tsv || { echo "Missing coverage output file!"; exit 1; } test -s $output/test_data.depth.tsv || { echo "Missing depth output file!"; exit 1; } -test -s $output/test_data.deduplication_metrics.txt || { echo "Missing deduplication metrics file!"; exit 1; } test -s $output/test_data.lofreq.pangolin.csv || { echo "Missing pangolin output file!"; exit 1; } assert_eq `zcat $output/test_data.lofreq.vcf.gz | grep -v '#' | wc -l` 54 "Wrong number of variants" diff --git a/tests/test_02.sh b/tests/test_02.sh index a8e826f..aa7d52a 100644 --- a/tests/test_02.sh +++ b/tests/test_02.sh @@ -18,7 +18,6 @@ test -s $output/test_data.fastp_stats.json || { echo "Missing VCF output file!"; test -s $output/test_data.fastp_stats.html || { echo "Missing VCF output file!"; exit 1; } test -s $output/test_data.coverage.tsv || { echo "Missing coverage output file!"; exit 1; } test -s $output/test_data.depth.tsv || { echo "Missing depth output file!"; exit 1; } -test -s $output/test_data.deduplication_metrics.txt || { echo "Missing deduplication metrics file!"; exit 1; } test -s $output/test_data.lofreq.pangolin.csv || { echo "Missing pangolin output file!"; exit 1; } # these are the intermediate files kept by --keep_intermediate diff --git a/tests/test_03.sh b/tests/test_03.sh index a5ffa22..5c9e3f5 100644 --- a/tests/test_03.sh +++ b/tests/test_03.sh @@ -20,7 +20,6 @@ test -s $output/test_data.fastp_stats.json || { echo "Missing VCF output file!"; test -s $output/test_data.fastp_stats.html || { echo "Missing VCF output file!"; exit 1; } test -s $output/test_data.coverage.tsv || { echo "Missing coverage output file!"; exit 1; } test -s $output/test_data.depth.tsv || { echo "Missing depth output file!"; exit 1; } -test -s $output/test_data.deduplication_metrics.txt || { echo "Missing deduplication metrics file!"; exit 1; } test -s $output/test_data.lofreq.pangolin.csv || { echo "Missing pangolin output file!"; exit 1; } # these are the intermediate files kept by --keep_intermediate diff --git a/tests/test_06.sh b/tests/test_06.sh index d9d8fe7..39e7cbd 100644 --- a/tests/test_06.sh +++ b/tests/test_06.sh @@ -18,7 +18,6 @@ test -s $output/test_data.fastp_stats.json || { echo "Missing VCF output file!"; test -s $output/test_data.fastp_stats.html || { echo "Missing VCF output file!"; exit 1; } test -s $output/test_data.coverage.tsv || { echo "Missing coverage output file!"; exit 1; } test -s $output/test_data.depth.tsv || { echo "Missing depth output file!"; exit 1; } -test -s $output/test_data.deduplication_metrics.txt || { echo "Missing deduplication metrics file!"; exit 1; } test -s $output/test_data.lofreq.pangolin.csv || { echo "Missing pangolin output file!"; exit 1; } assert_eq `zcat $output/test_data.lofreq.vcf.gz | grep -v '#' | wc -l` 54 "Wrong number of variants" diff --git a/tests/test_07.sh b/tests/test_07.sh index cb78f91..b957658 100644 --- a/tests/test_07.sh +++ b/tests/test_07.sh @@ -18,7 +18,6 @@ test -s $output/test_data.fastp_stats.json || { echo "Missing VCF output file!"; test -s $output/test_data.fastp_stats.html || { echo "Missing VCF output file!"; exit 1; } test -s $output/test_data.coverage.tsv || { echo "Missing coverage output file!"; exit 1; } test -s $output/test_data.depth.tsv || { echo "Missing depth output file!"; exit 1; } -test -s $output/test_data.deduplication_metrics.txt || { echo "Missing deduplication metrics file!"; exit 1; } test -s $output/test_data.lofreq.pangolin.csv || { echo "Missing pangolin output file!"; exit 1; } assert_eq `zcat $output/test_data.lofreq.vcf.gz | grep -v '#' | wc -l` 11 "Wrong number of variants" diff --git a/tests/test_08.sh b/tests/test_08.sh index 7b49b0e..49d31e4 100644 --- a/tests/test_08.sh +++ b/tests/test_08.sh @@ -19,7 +19,6 @@ test -s $output/test_data.fastp_stats.json || { echo "Missing VCF output file!"; test -s $output/test_data.fastp_stats.html || { echo "Missing VCF output file!"; exit 1; } test -s $output/test_data.coverage.tsv || { echo "Missing coverage output file!"; exit 1; } test -s $output/test_data.depth.tsv || { echo "Missing depth output file!"; exit 1; } -test -s $output/test_data.deduplication_metrics.txt || { echo "Missing deduplication metrics file!"; exit 1; } test -s $output/test_data.lofreq.pangolin.csv || { echo "Missing pangolin output file!"; exit 1; } assert_eq `zcat $output/test_data.lofreq.vcf.gz | grep -v '#' | wc -l` 54 "Wrong number of variants" diff --git a/tests/test_10.sh b/tests/test_10.sh index 5a66231..529acfd 100644 --- a/tests/test_10.sh +++ b/tests/test_10.sh @@ -18,7 +18,6 @@ test -s $output/test_data.fastp_stats.json || { echo "Missing VCF output file!"; test -s $output/test_data.fastp_stats.html || { echo "Missing VCF output file!"; exit 1; } test -s $output/test_data.coverage.tsv || { echo "Missing coverage output file!"; exit 1; } test -s $output/test_data.depth.tsv || { echo "Missing depth output file!"; exit 1; } -test -s $output/test_data.deduplication_metrics.txt || { echo "Missing deduplication metrics file!"; exit 1; } test -s $output/test_data.lofreq.pangolin.csv || { echo "Missing pangolin output file!"; exit 1; } assert_eq `zcat $output/test_data.lofreq.vcf.gz | grep -v '#' | wc -l` 11 "Wrong number of variants" diff --git a/tests/test_11.sh b/tests/test_11.sh index 83ddb87..d2387c9 100644 --- a/tests/test_11.sh +++ b/tests/test_11.sh @@ -16,7 +16,6 @@ test -s $output/test_data.fastp_stats.json || { echo "Missing VCF output file!"; test -s $output/test_data.fastp_stats.html || { echo "Missing VCF output file!"; exit 1; } test -s $output/test_data.coverage.tsv || { echo "Missing coverage output file!"; exit 1; } test -s $output/test_data.depth.tsv || { echo "Missing depth output file!"; exit 1; } -test -s $output/test_data.deduplication_metrics.txt || { echo "Missing deduplication metrics file!"; exit 1; } test -s $output/test_data.lofreq.pangolin.csv || { echo "Missing pangolin output file!"; exit 1; } assert_eq `zcat $output/test_data.lofreq.vcf.gz | grep -v '#' | wc -l` 3 "Wrong number of variants"