diff --git a/asm_annot.nf b/asm_annot.nf index 2118828..02ff2c4 100644 --- a/asm_annot.nf +++ b/asm_annot.nf @@ -41,7 +41,7 @@ process run_fastqc { set pair_id, file(reads) from fastqc_reads output: - file "$pair_id" into fastqc_results + file "$pair_id" into fastqc_multiqc """ mkdir ${pair_id} @@ -49,21 +49,6 @@ process run_fastqc { """ } -process run_multiqc { - publishDir "${params.out_dir}/multiqc", mode: "${params.savemode}" - tag {"multiqc"} - label 'one' - - input: - file "fastqc_output/*" from fastqc_results.toSortedList() - - output: - file "multiqc_report.html" into multiqc_report - - """ - multiqc fastqc_output - """ -} // if there are more than two data files, we need to cat them together // because spades becomes complicated with more than two files @@ -79,9 +64,11 @@ process collate_data { output: set pair_id, file("${pair_id}*_concat.fq.gz") into (reads, pilon_reads) + """ - cat ${pair_id}*R1* > ${pair_id}_R1_concat.fq.gz - cat ${pair_id}*R2* > ${pair_id}_R2_concat.fq.gz + shopt -s extglob + cat ${pair_id}*_?(R)1[_.]*.gz > ${pair_id}_R1_concat.fq.gz + cat ${pair_id}*_?(R)2[_.]*.gz > ${pair_id}_R2_concat.fq.gz """ } @@ -90,8 +77,9 @@ process collate_data { * Strip PhiX with bbmap */ process run_strip { - - publishDir "${params.out_dir}/bbduk", mode: "${params.savemode}" + publishDir "${params.out_dir}/bbduk", + saveAs: {filename -> filename.endsWith('.gz') ? null:filename}, + mode: "${params.savemode}" tag { pair_id } input: @@ -100,6 +88,9 @@ process run_strip { output: set pair_id, file("${pair_id}*_concat_stripped.fq.gz") into reads_stripped file "${pair_id}_bbduk_output.log" + file "${pair_id}_stats.txt" + file "${pair_id}_stats.txt" into bbduk_stats_stripped_multiqc + """ bbduk.sh threads=$task.cpus ref=${params.stripgenome} \ @@ -108,7 +99,7 @@ process run_strip { outm=${pair_id}_matched.fq.gz \ out1=${pair_id}_R1_concat_stripped.fq.gz \ out2=${pair_id}_R2_concat_stripped.fq.gz \ - k=31 hdist=1 stats=stats.txt &> ${pair_id}_bbduk_output.log + k=31 hdist=1 stats=${pair_id}_stats.txt &> ${pair_id}_bbduk_output.log """ } @@ -124,49 +115,75 @@ process run_trim { set pair_id, file(reads) from reads_stripped output: - set pair_id, file("${pair_id}*_concat_stripped_trimmed.fq.gz") into reads_trimmed - file "${pair_id}_concat_stripped_trimmed.log" + set pair_id, file("${pair_id}*_concat_stripped_trimmed.fq.gz") into (reads_trimmed, trimmed_fastqc) + file "${pair_id}_stripped_trimmed_*.log" + file "${pair_id}_stripped_trimmed_stderr.log" into bbduk_trimmed_multiqc + """ trimmomatic PE -threads $task.cpus -trimlog ${pair_id}_concat_stripped_trimmed.log ${pair_id}*_concat_stripped.fq.gz \ -baseout ${pair_id}_trimmed.fq.gz ILLUMINACLIP:${params.adapter_dir}/${params.adapters}:${params.illuminaClipOptions} \ - SLIDINGWINDOW:${params.slidingwindow} \ LEADING:${params.leading} TRAILING:${params.trailing} \ - MINLEN:${params.minlen} &> ${pair_id}_run.log + SLIDINGWINDOW:${params.slidingwindow} \ + MINLEN:${params.minlen} \ + 2> ${pair_id}_stripped_trimmed_stderr.log 1> ${pair_id}_stripped_trimmed_stdout.log mv ${pair_id}_trimmed_1P.fq.gz ${pair_id}_R1_concat_stripped_trimmed.fq.gz mv ${pair_id}_trimmed_2P.fq.gz ${pair_id}_R2_concat_stripped_trimmed.fq.gz cat ${pair_id}_trimmed_1U.fq.gz ${pair_id}_trimmed_2U.fq.gz > ${pair_id}_S_concat_stripped_trimmed.fq.gz """ } +process run_fastqc_trimmed { + publishDir "${params.out_dir}/fastqc_bbduk_trimmed", mode: "${params.savemode}" + tag { pair_id } + label 'one' + + input: + set pair_id, file(reads) from trimmed_fastqc + + output: + file "$pair_id" + file "${pair_id}" into fastqc_bbduk_trimmed_multiqc + + """ + mkdir ${pair_id} + fastqc -q ${reads} -o ${pair_id} -t $task.cpus + """ +} + + /* * Build assembly with SPAdes */ process run_spadesasm { - publishDir "${params.out_dir}/spades", mode: "${params.savemode}" - tag { pair_id } - label 'longtime' + publishDir "${params.out_dir}/spades", mode: "${params.savemode}" + tag { pair_id } + label 'longtime' - input: - set pair_id, file(reads) from reads_trimmed + input: + set pair_id, file(reads) from reads_trimmed - output: - set pair_id, file("${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta") \ - into (assembly_results, tobwa_results) - file "${pair_id}_spades_scaffolds.fasta" - file "${pair_id}_spades.log" + output: + set pair_id, file("${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta") \ + into (assembly_results, tobwa_results) + file "${pair_id}_spades_scaffolds.fasta" + file "${pair_id}_spades.log" - """ - spades.py ${params.careful} --cov-cutoff=${params.cov_cutoff} \ - -1 ${pair_id}_R1_concat_stripped_trimmed.fq.gz \ - -2 ${pair_id}_R2_concat_stripped_trimmed.fq.gz \ - -s ${pair_id}_S_concat_stripped_trimmed.fq.gz -t $task.cpus -o ${pair_id}_spades - filter_fasta_length.py -i ${pair_id}_spades/scaffolds.fasta \ - -o ${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta \ - -m ${params.min_contig_len} - cp ${pair_id}_spades/scaffolds.fasta ${pair_id}_spades_scaffolds.fasta - cp ${pair_id}_spades/spades.log ${pair_id}_spades.log + + // For 2022 version, params.careful was removed to do --isolate and --only-assembler + """ + spades.py --cov-cutoff=${params.cov_cutoff} \ + -1 ${pair_id}_R1_concat_stripped_trimmed.fq.gz \ + -2 ${pair_id}_R2_concat_stripped_trimmed.fq.gz \ + -s ${pair_id}_S_concat_stripped_trimmed.fq.gz \ + -t $task.cpus --isolate --only-assembler \ + -o ${pair_id}_spades + filter_fasta_length.py -i ${pair_id}_spades/scaffolds.fasta \ + -o ${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta \ + -m ${params.min_contig_len} + cp ${pair_id}_spades/scaffolds.fasta ${pair_id}_spades_scaffolds.fasta + cp ${pair_id}_spades/spades.log ${pair_id}_spades.log """ } @@ -177,24 +194,25 @@ process run_spadesasm { * Map reads to the spades assembly */ process run_bwamem { - publishDir "${params.out_dir}/bwamem", mode: "${params.savemode}" - tag { pair_id } - label 'longtime' + publishDir "${params.out_dir}/bwamem", mode: "${params.savemode}" + tag { pair_id } + label 'longtime' - input: - set pair_id, file("${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta"), \ - file(reads) from tobwa_results.join(pilon_reads) + input: + set pair_id, file("${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta"), \ + file(reads) from tobwa_results.join(pilon_reads) - output: - set pair_id, file("${pair_id}_mapped_sorted.bam"), \ - file("${pair_id}_mapped_sorted.bam.bai") into bwamem_results + output: + set pair_id, file("${pair_id}_mapped_sorted.bam"), \ + file("${pair_id}_mapped_sorted.bam.bai") into bwamem_results + file "${pair_id}_bwa.log" - """ - bwa index ${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta - bwa mem -t $task.cpus ${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta \ - *.fq.gz | samtools sort -o ${pair_id}_mapped_sorted.bam - - samtools index ${pair_id}_mapped_sorted.bam - """ + """ + bwa index ${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta + bwa mem -t $task.cpus ${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta \ + *.fq.gz 2> ${pair_id}_bwa.log| samtools sort -o ${pair_id}_mapped_sorted.bam - + samtools index ${pair_id}_mapped_sorted.bam + """ } /* @@ -236,12 +254,13 @@ process run_prokka { output: set pair_id, file("${pair_id}.*") into annotation_results + file "${pair_id}.*" into annotation_multiqc """ prokka --compliant --force --usegenus --cpus $task.cpus \ --centre ${params.centre} --prefix ${pair_id} --locustag ${params.locustag} \ --genus ${params.genus} --species ${params.species} \ - --kingdom ${params.kingdom} ${params.prokka_additional} \ + --kingdom ${params.kingdom} --strain ${pair_id}_prokka_info ${params.prokka_additional} \ --outdir . ${pair_id}_pilon_spades.fasta """ } @@ -249,7 +268,7 @@ process run_prokka { /* * Evaluate ALL assemblies with QUAST */ -process quast_eval { +process run_quast { // The output here is a directory in and of itself // thus not creating a new one publishDir "${params.out_dir}/", mode: "${params.savemode}" @@ -258,12 +277,34 @@ process quast_eval { input: file asm_list from asms_for_quast.toSortedList() - //TODO: fix this, is why output is not going anywhere output: file quast_evaluation_all into quast_evaluation_all + file quast_evaluation_all into quast_multiqc """ quast --threads $task.cpus -o quast_evaluation_all \ -g ${params.quast_genes} -R ${params.quast_ref} ${asm_list} """ } + +process run_multiqc_final { + publishDir "${params.out_dir}/multiqc_final", mode: "${params.savemode}" + tag {"multiqc"} + label 'one' + + input: + file "fastqc_output/*" from fastqc_multiqc.collect() + file "bbduk/*" from bbduk_stats_stripped_multiqc.collect() + file "bbduk_trimmed/*" from bbduk_trimmed_multiqc.collect() + file "bbduk_trimmed_fastqc/*" from fastqc_bbduk_trimmed_multiqc.collect() + file "prokka/*" from annotation_multiqc.collect() + file quast_evaluation_all from quast_multiqc + + output: + file("*") + + """ + multiqc --fullnames --config ${params.multiqc_config} . + """ + +} diff --git a/bin/printversions.sh b/bin/printversions.sh new file mode 100755 index 0000000..a9fd9b1 --- /dev/null +++ b/bin/printversions.sh @@ -0,0 +1,36 @@ +## This script attempts to print whatever versions we have of tools +## There are two scenarios, conda, and and not conda + +STR=$1 +output_file=$2 + +SUBSTR='conda' +if [[ "$STR" == *"$SUBSTR"* ]]; then + conda list -n bifrost2022-fastqc >> ${output_file} + conda list -n bifrost2022-multiqc >> ${output_file} + conda list -n bifrost2022-bbtools >> ${output_file} + conda list -n bifrost2022-trimmomatic >> ${output_file} + conda list -n bifrost2022-spades >> ${output_file} + conda list -n bifrost2022-bwa >> ${output_file} + conda list -n bifrost2022-pilon >> ${output_file} + conda list -n bifrost2022-prokka >> ${output_file} + conda list -n bifrost2022-quast >> ${output_file} + conda list -n bifrost2022-ariba >> ${output_file} + + else + fastqc --version >> ${output_file} + multiqc --version >> ${output_file} + echo "bbduk.sh " `bbversion.sh` >> ${output_file} + echo "trimmomatic " `trimmomatic -version` >> ${output_file} + spades.py --version >> ${output_file} + bwa &> tmpfile + cat tmpfile |grep Version | awk '{print "bwa " $0}' >> ${output_file} + rm tmpfile + samtools --version |head -1 >> ${output_file} + pilon --version >> ${output_file} + prokka -v >> ${output_file} + quast -v >> ${output_file} + ariba version |head -1 >> ${output_file} + + +fi \ No newline at end of file diff --git a/conf/asm_annot_template.config b/conf/asm_annot_template.config index 89bcc47..7c28c54 100644 --- a/conf/asm_annot_template.config +++ b/conf/asm_annot_template.config @@ -16,17 +16,19 @@ params.reads = "../testdata/risk_short/*L00{1,2}_R{1,2}_001.fastq.gz" params.setsize = 4 -// Specify the name of the output directory, relative to where the script is being run -params.out_dir = "track_three" - // General configuration variables params.pwd = "$PWD" params.help = false params.savemode = "copy" + +// Directory to where data is stored +params.data_dir = "/cluster/projects/nn9305k/vi_pipeline_data/bifrost_data" +params.multiqc_config = "/cluster/projects/nn9305k/Bifrost22/conf/multiqc_config.yaml" + // BBDuk params, has to be absolute paths -params.stripgenome = "/cluster/projects/nn9305k/genome_references/genomes/PhiX/PhiX.fasta" -params.stripdir = "/cluster/projects/nn9305k/genome_references/bbmap_refs" +params.stripgenome = "${params.data_dir}/genome_references/genomes/PhiX/PhiX.fasta" +params.stripdir = "${params.data_dir}/genome_references/bbmap_refs" // Trimmomatic configuration variables @@ -36,14 +38,13 @@ params.slidingwindow = "4:15" params.leading = 3 params.trailing = 3 params.minlen = 36 -params.adapters = "TruSeq3-PE.fa" -params.adapter_dir = "/cluster/projects/nn9305k/db_flatfiles/trimmomatic_adapters" +params.adapters = "NexteraPE-PE.fa" +params.adapter_dir = "${params.data_dir}/trimmomatic_adapters" // SPAdes configuration variables params.assembly = "spades_asm" -params.careful = "--careful" -params.cov_cutoff = "off" +params.cov_cutoff = "auto" params.min_contig_len = "500" // PROKKA configuration variables @@ -58,6 +59,5 @@ params.centre = "NVI" // QUAST variables -params.genome_directory = "/cluster/projects/nn9305k/genome_references/genomes/" -params.quast_ref = "${params.genome_directory}ecoli/GCF_000005845.2_ASM584v2_genomic.fna" -params.quast_genes = "${params.genome_directory}ecoli/GCF_000005845.2_ASM584v2_genomic.gff" +params.quast_ref = "${params.data_dir}/genome_references/genomes/ecoli/GCF_000005845.2_ASM584v2_genomic.fna" +params.quast_genes = "${params.data_dir}/genome_references/genomes/ecoli/GCF_000005845.2_ASM584v2_genomic.gff" diff --git a/conf/conda.config b/conf/conda.config new file mode 100644 index 0000000..62bef30 --- /dev/null +++ b/conf/conda.config @@ -0,0 +1,78 @@ +/* +* Standard profile file for use with conda. +* This setup will ensure that all command line programs +* are available on the command line as expected. +* condahome should be set in condastandard or condaslurm files +*/ + +process { + + withName: run_fastqc { + conda = "${params.condahome}/bifrost2022-fastqc" + } + + withName: run_fastqc_trimmed { + conda = "${params.condahome}/bifrost2022-fastqc" + } + + withName: run_multiqc { + conda = "${params.condahome}/bifrost2022-multiqc" + } + + withName: run_multiqc_final { + conda = "${params.condahome}/bifrost2022-multiqc" + } + + withName: run_strip { + conda = "${params.condahome}/bifrost2022-bbtools" + } + + withName: run_trim { + conda = "${params.condahome}/bifrost2022-trimmomatic" + } + + withName: run_spadesasm { + conda = "${params.condahome}/bifrost2022-spades" + } + + withName: run_bwamem { + conda = "${params.condahome}/bifrost2022-bwa" + } + + withName: run_pilon { + conda = "${params.condahome}/bifrost2022-pilon" + } + + withName: run_prokka { + conda = "${params.condahome}/bifrost2022-prokka" + } + + withName: run_quast { + conda = "${params.condahome}/bifrost2022-quast" + } + + withName:run_ariba_mlst_pred { + conda = "${params.condahome}/bifrost2022-ariba" + } + + withName:run_ariba_mlst_summarize { + conda = "${params.condahome}/bifrost2022-ariba" + } + + withName:run_ariba_amr_pred { + conda = "${params.condahome}/bifrost2022-ariba" + } + + withName:run_ariba_amr_summarize { + conda = "${params.condahome}/bifrost2022-ariba" + } + + withName:run_ariba_vir_pred { + conda = "${params.condahome}/bifrost2022-ariba" + } + + withName:run_ariba_vir_summarize { + conda = "${params.condahome}/bifrost2022-ariba" + } + +} \ No newline at end of file diff --git a/conf/condaslurm.config b/conf/condaslurm.config index dce1f40..83ed53a 100644 --- a/conf/condaslurm.config +++ b/conf/condaslurm.config @@ -4,6 +4,6 @@ * are available on the command line as expected. */ -process { - conda = '/cluster/projects/nn9305k/src/miniconda/envs/bifrost' -} +params { + condahome = '/cluster/projects/nn9305k/src/miniconda/envs' + } diff --git a/conf/condastandard.config b/conf/condastandard.config index e2d8e5a..26838c3 100644 --- a/conf/condastandard.config +++ b/conf/condastandard.config @@ -5,6 +5,6 @@ * are available on the command line as expected. */ -process { - conda = '/home/karinlag/src/anaconda3/envs/bifrost' -} +params { + condahome = "/home/karinlag/src/miniconda/envs" +} \ No newline at end of file diff --git a/conf/multiqc_config.yaml b/conf/multiqc_config.yaml new file mode 100644 index 0000000..2f15b51 --- /dev/null +++ b/conf/multiqc_config.yaml @@ -0,0 +1,7 @@ +report_comment: > + This MultiQC report has been generated on results from running the + Bifrost 2022 + analysis pipeline. +trimmomatic: + s_name_filenames: true \ No newline at end of file diff --git a/conf/qc_track_template.config b/conf/qc_track_template.config index d0ec741..686b9b9 100644 --- a/conf/qc_track_template.config +++ b/conf/qc_track_template.config @@ -15,9 +15,6 @@ params.reads = "../testdata/risk_short/*L00{1,2}_R{1,2}_001.fastq.gz" params.setsize = 2 -// Specify the name of the output directory, relative to where the script is being run -params.out_dir = "track_one" - // General configuration variables params.pwd = "$PWD" params.help = false diff --git a/conf/slurm.config b/conf/slurm.config index 649ac51..f3ba8a2 100644 --- a/conf/slurm.config +++ b/conf/slurm.config @@ -20,7 +20,7 @@ process { executor = 'slurm' clusterOptions = '--job-name=nxf_test --account=nn9305k --mem-per-cpu=4700M' - queueSize = 24 + queueSize = 40 maxRetries = 3 errorStrategy='retry' diff --git a/conf/specific_genes_template.config b/conf/specific_genes_template.config index 2625e3a..ccc941a 100644 --- a/conf/specific_genes_template.config +++ b/conf/specific_genes_template.config @@ -9,8 +9,6 @@ */ params.reads = "../testdata/fastq_files/*L00{1,2}_R{1,2}_001.fastq.gz" params.setsize = 2 -// Specify the name of the output directory, relative to where the script is being run -params.out_dir = "track_two" // General configuration variables params.pwd = "$PWD" diff --git a/conf/third_party_software.md b/conf/third_party_software.md deleted file mode 100644 index 46793dd..0000000 --- a/conf/third_party_software.md +++ /dev/null @@ -1,46 +0,0 @@ -# Third party software - -The Bifrost pipeline depends on several third party packages. -These have to be made available to the pipeline in some way. -The way that these are made available to the pipeline depends -on which system the pipeline is being run on. - -Please note: not all of the software is used for all tracks. -The track(s) that each software is used in is noted below. - - -## Currently used software - -* FastQC - Track One and Three -* Ariba - Track Two -* Trimmomatic - Track Three -* SPAdes - Track Three -* QUAST - Track Three - - -## Tracks - -There are currently three tracks: - -* Track One: FastQC on input reads -* Track Two: Ariba MLST, virulence and AMR analysis -* Track Three: Trimming with trimmomatic followed by assembly - with SPAdes. Trimming results are evaluated with MultiQC, and - assemblies with QUAST - -## Profiles - -We currently have two profiles set up, standard and slurm. - -### standard.config -This profile is used when running on a normal stand-alone -computer. This assumes that all software is available on -the command line, unless otherwise noted with a full path in -the standard.config file. - -### slurm.config -This profile is used when running on a system that uses the -slurm queue management system. At present, this also depends -heavily on the module system. Any software not in the module -system needs to either be available on the command line, or -should be specified using the full path. diff --git a/qc_track.nf b/fastqc_multi.nf similarity index 100% rename from qc_track.nf rename to fastqc_multi.nf diff --git a/nextflow.config b/nextflow.config index 3f2b0f4..d8a8efa 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,6 +18,8 @@ profiles { condalocal { includeConfig 'conf/standard.config' includeConfig 'conf/condastandard.config' + includeConfig 'conf/conda.config' + } slurm { includeConfig 'conf/slurm.config' @@ -25,6 +27,8 @@ profiles { condaslurm { includeConfig 'conf/slurm.config' includeConfig 'conf/condaslurm.config' + includeConfig 'conf/conda.config' + } } diff --git a/run_track.sh b/run_track.sh index e48349f..660882a 100755 --- a/run_track.sh +++ b/run_track.sh @@ -14,10 +14,16 @@ profile=$3 out_directory=$4 workdir=${5:-$USERWORK/bifrost_work} +#to add to report name +now=$(date +"%Y%m%d_%H%M") + mkdir -p ${out_directory}/config_files git --git-dir ${script_directory}/.git branch -v |grep "\*" | awk '{print $2, $3}' > ${out_directory}/config_files/pipeline_version.log +bash ${script_directory}/bin/printversions.sh ${profile} ${out_directory}/config_files/software_versions.txt cp ${script_directory}/${track_script} ${out_directory}/config_files cp ${template} ${out_directory}/config_files + echo "TEMPORARY WORKING DIRECTORY IS ${workdir}" -nextflow -c ${template} run -resume ${script_directory}/${track_script} -profile ${profile} --out_dir=${out_directory} -work-dir ${workdir} +nextflow -c ${template} run -resume ${script_directory}/${track_script} \ +-profile ${profile} --out_dir=${out_directory} -work-dir ${workdir} -with-report ${now}_run_report.html diff --git a/specific_genes.nf b/specific_genes.nf index eb862f6..7545c77 100644 --- a/specific_genes.nf +++ b/specific_genes.nf @@ -59,9 +59,11 @@ process collate_data { set pair_id, file("${pair_id}*_concat.fq.gz") into \ (read_pairs_mlst, read_pairs_amr, read_pairs_vir) + """ - cat ${pair_id}*R1* > ${pair_id}_R1_concat.fq.gz - cat ${pair_id}*R2* > ${pair_id}_R2_concat.fq.gz + shopt -s extglob + cat ${pair_id}*_?(R)1[_.]*.gz > ${pair_id}_R1_concat.fq.gz + cat ${pair_id}*_?(R)2[_.]*.gz > ${pair_id}_R2_concat.fq.gz """ }