Skip to content

Commit

Permalink
Merge pull request #27 from karinlag/202207-v2
Browse files Browse the repository at this point in the history
202207 v2
  • Loading branch information
karinlag authored Aug 17, 2022
2 parents c59f69f + b3b656c commit 1252383
Show file tree
Hide file tree
Showing 15 changed files with 259 additions and 136 deletions.
167 changes: 104 additions & 63 deletions asm_annot.nf
Original file line number Diff line number Diff line change
Expand Up @@ -41,29 +41,14 @@ process run_fastqc {
set pair_id, file(reads) from fastqc_reads

output:
file "$pair_id" into fastqc_results
file "$pair_id" into fastqc_multiqc

"""
mkdir ${pair_id}
fastqc -q ${reads} -o ${pair_id} -t $task.cpus
"""
}

process run_multiqc {
publishDir "${params.out_dir}/multiqc", mode: "${params.savemode}"
tag {"multiqc"}
label 'one'

input:
file "fastqc_output/*" from fastqc_results.toSortedList()

output:
file "multiqc_report.html" into multiqc_report

"""
multiqc fastqc_output
"""
}

// if there are more than two data files, we need to cat them together
// because spades becomes complicated with more than two files
Expand All @@ -79,9 +64,11 @@ process collate_data {
output:
set pair_id, file("${pair_id}*_concat.fq.gz") into (reads, pilon_reads)


"""
cat ${pair_id}*R1* > ${pair_id}_R1_concat.fq.gz
cat ${pair_id}*R2* > ${pair_id}_R2_concat.fq.gz
shopt -s extglob
cat ${pair_id}*_?(R)1[_.]*.gz > ${pair_id}_R1_concat.fq.gz
cat ${pair_id}*_?(R)2[_.]*.gz > ${pair_id}_R2_concat.fq.gz
"""
}

Expand All @@ -90,8 +77,9 @@ process collate_data {
* Strip PhiX with bbmap
*/
process run_strip {

publishDir "${params.out_dir}/bbduk", mode: "${params.savemode}"
publishDir "${params.out_dir}/bbduk",
saveAs: {filename -> filename.endsWith('.gz') ? null:filename},
mode: "${params.savemode}"
tag { pair_id }

input:
Expand All @@ -100,6 +88,9 @@ process run_strip {
output:
set pair_id, file("${pair_id}*_concat_stripped.fq.gz") into reads_stripped
file "${pair_id}_bbduk_output.log"
file "${pair_id}_stats.txt"
file "${pair_id}_stats.txt" into bbduk_stats_stripped_multiqc


"""
bbduk.sh threads=$task.cpus ref=${params.stripgenome} \
Expand All @@ -108,7 +99,7 @@ process run_strip {
outm=${pair_id}_matched.fq.gz \
out1=${pair_id}_R1_concat_stripped.fq.gz \
out2=${pair_id}_R2_concat_stripped.fq.gz \
k=31 hdist=1 stats=stats.txt &> ${pair_id}_bbduk_output.log
k=31 hdist=1 stats=${pair_id}_stats.txt &> ${pair_id}_bbduk_output.log
"""
}

Expand All @@ -124,49 +115,75 @@ process run_trim {
set pair_id, file(reads) from reads_stripped

output:
set pair_id, file("${pair_id}*_concat_stripped_trimmed.fq.gz") into reads_trimmed
file "${pair_id}_concat_stripped_trimmed.log"
set pair_id, file("${pair_id}*_concat_stripped_trimmed.fq.gz") into (reads_trimmed, trimmed_fastqc)
file "${pair_id}_stripped_trimmed_*.log"
file "${pair_id}_stripped_trimmed_stderr.log" into bbduk_trimmed_multiqc


"""
trimmomatic PE -threads $task.cpus -trimlog ${pair_id}_concat_stripped_trimmed.log ${pair_id}*_concat_stripped.fq.gz \
-baseout ${pair_id}_trimmed.fq.gz ILLUMINACLIP:${params.adapter_dir}/${params.adapters}:${params.illuminaClipOptions} \
SLIDINGWINDOW:${params.slidingwindow} \
LEADING:${params.leading} TRAILING:${params.trailing} \
MINLEN:${params.minlen} &> ${pair_id}_run.log
SLIDINGWINDOW:${params.slidingwindow} \
MINLEN:${params.minlen} \
2> ${pair_id}_stripped_trimmed_stderr.log 1> ${pair_id}_stripped_trimmed_stdout.log
mv ${pair_id}_trimmed_1P.fq.gz ${pair_id}_R1_concat_stripped_trimmed.fq.gz
mv ${pair_id}_trimmed_2P.fq.gz ${pair_id}_R2_concat_stripped_trimmed.fq.gz
cat ${pair_id}_trimmed_1U.fq.gz ${pair_id}_trimmed_2U.fq.gz > ${pair_id}_S_concat_stripped_trimmed.fq.gz
"""
}

process run_fastqc_trimmed {
publishDir "${params.out_dir}/fastqc_bbduk_trimmed", mode: "${params.savemode}"
tag { pair_id }
label 'one'

input:
set pair_id, file(reads) from trimmed_fastqc

output:
file "$pair_id"
file "${pair_id}" into fastqc_bbduk_trimmed_multiqc

"""
mkdir ${pair_id}
fastqc -q ${reads} -o ${pair_id} -t $task.cpus
"""
}



/*
* Build assembly with SPAdes
*/
process run_spadesasm {
publishDir "${params.out_dir}/spades", mode: "${params.savemode}"
tag { pair_id }
label 'longtime'
publishDir "${params.out_dir}/spades", mode: "${params.savemode}"
tag { pair_id }
label 'longtime'

input:
set pair_id, file(reads) from reads_trimmed
input:
set pair_id, file(reads) from reads_trimmed

output:
set pair_id, file("${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta") \
into (assembly_results, tobwa_results)
file "${pair_id}_spades_scaffolds.fasta"
file "${pair_id}_spades.log"
output:
set pair_id, file("${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta") \
into (assembly_results, tobwa_results)
file "${pair_id}_spades_scaffolds.fasta"
file "${pair_id}_spades.log"

"""
spades.py ${params.careful} --cov-cutoff=${params.cov_cutoff} \
-1 ${pair_id}_R1_concat_stripped_trimmed.fq.gz \
-2 ${pair_id}_R2_concat_stripped_trimmed.fq.gz \
-s ${pair_id}_S_concat_stripped_trimmed.fq.gz -t $task.cpus -o ${pair_id}_spades
filter_fasta_length.py -i ${pair_id}_spades/scaffolds.fasta \
-o ${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta \
-m ${params.min_contig_len}
cp ${pair_id}_spades/scaffolds.fasta ${pair_id}_spades_scaffolds.fasta
cp ${pair_id}_spades/spades.log ${pair_id}_spades.log

// For 2022 version, params.careful was removed to do --isolate and --only-assembler
"""
spades.py --cov-cutoff=${params.cov_cutoff} \
-1 ${pair_id}_R1_concat_stripped_trimmed.fq.gz \
-2 ${pair_id}_R2_concat_stripped_trimmed.fq.gz \
-s ${pair_id}_S_concat_stripped_trimmed.fq.gz \
-t $task.cpus --isolate --only-assembler \
-o ${pair_id}_spades
filter_fasta_length.py -i ${pair_id}_spades/scaffolds.fasta \
-o ${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta \
-m ${params.min_contig_len}
cp ${pair_id}_spades/scaffolds.fasta ${pair_id}_spades_scaffolds.fasta
cp ${pair_id}_spades/spades.log ${pair_id}_spades.log
"""
}

Expand All @@ -177,24 +194,25 @@ process run_spadesasm {
* Map reads to the spades assembly
*/
process run_bwamem {
publishDir "${params.out_dir}/bwamem", mode: "${params.savemode}"
tag { pair_id }
label 'longtime'
publishDir "${params.out_dir}/bwamem", mode: "${params.savemode}"
tag { pair_id }
label 'longtime'

input:
set pair_id, file("${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta"), \
file(reads) from tobwa_results.join(pilon_reads)
input:
set pair_id, file("${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta"), \
file(reads) from tobwa_results.join(pilon_reads)

output:
set pair_id, file("${pair_id}_mapped_sorted.bam"), \
file("${pair_id}_mapped_sorted.bam.bai") into bwamem_results
output:
set pair_id, file("${pair_id}_mapped_sorted.bam"), \
file("${pair_id}_mapped_sorted.bam.bai") into bwamem_results
file "${pair_id}_bwa.log"

"""
bwa index ${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta
bwa mem -t $task.cpus ${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta \
*.fq.gz | samtools sort -o ${pair_id}_mapped_sorted.bam -
samtools index ${pair_id}_mapped_sorted.bam
"""
"""
bwa index ${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta
bwa mem -t $task.cpus ${pair_id}_spades_scaffolds_min${params.min_contig_len}.fasta \
*.fq.gz 2> ${pair_id}_bwa.log| samtools sort -o ${pair_id}_mapped_sorted.bam -
samtools index ${pair_id}_mapped_sorted.bam
"""
}

/*
Expand Down Expand Up @@ -236,20 +254,21 @@ process run_prokka {

output:
set pair_id, file("${pair_id}.*") into annotation_results
file "${pair_id}.*" into annotation_multiqc

"""
prokka --compliant --force --usegenus --cpus $task.cpus \
--centre ${params.centre} --prefix ${pair_id} --locustag ${params.locustag} \
--genus ${params.genus} --species ${params.species} \
--kingdom ${params.kingdom} ${params.prokka_additional} \
--kingdom ${params.kingdom} --strain ${pair_id}_prokka_info ${params.prokka_additional} \
--outdir . ${pair_id}_pilon_spades.fasta
"""
}

/*
* Evaluate ALL assemblies with QUAST
*/
process quast_eval {
process run_quast {
// The output here is a directory in and of itself
// thus not creating a new one
publishDir "${params.out_dir}/", mode: "${params.savemode}"
Expand All @@ -258,12 +277,34 @@ process quast_eval {
input:
file asm_list from asms_for_quast.toSortedList()

//TODO: fix this, is why output is not going anywhere
output:
file quast_evaluation_all into quast_evaluation_all
file quast_evaluation_all into quast_multiqc

"""
quast --threads $task.cpus -o quast_evaluation_all \
-g ${params.quast_genes} -R ${params.quast_ref} ${asm_list}
"""
}

process run_multiqc_final {
publishDir "${params.out_dir}/multiqc_final", mode: "${params.savemode}"
tag {"multiqc"}
label 'one'

input:
file "fastqc_output/*" from fastqc_multiqc.collect()
file "bbduk/*" from bbduk_stats_stripped_multiqc.collect()
file "bbduk_trimmed/*" from bbduk_trimmed_multiqc.collect()
file "bbduk_trimmed_fastqc/*" from fastqc_bbduk_trimmed_multiqc.collect()
file "prokka/*" from annotation_multiqc.collect()
file quast_evaluation_all from quast_multiqc

output:
file("*")

"""
multiqc --fullnames --config ${params.multiqc_config} .
"""

}
36 changes: 36 additions & 0 deletions bin/printversions.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
## This script attempts to print whatever versions we have of tools
## There are two scenarios, conda, and and not conda

STR=$1
output_file=$2

SUBSTR='conda'
if [[ "$STR" == *"$SUBSTR"* ]]; then
conda list -n bifrost2022-fastqc >> ${output_file}
conda list -n bifrost2022-multiqc >> ${output_file}
conda list -n bifrost2022-bbtools >> ${output_file}
conda list -n bifrost2022-trimmomatic >> ${output_file}
conda list -n bifrost2022-spades >> ${output_file}
conda list -n bifrost2022-bwa >> ${output_file}
conda list -n bifrost2022-pilon >> ${output_file}
conda list -n bifrost2022-prokka >> ${output_file}
conda list -n bifrost2022-quast >> ${output_file}
conda list -n bifrost2022-ariba >> ${output_file}

else
fastqc --version >> ${output_file}
multiqc --version >> ${output_file}
echo "bbduk.sh " `bbversion.sh` >> ${output_file}
echo "trimmomatic " `trimmomatic -version` >> ${output_file}
spades.py --version >> ${output_file}
bwa &> tmpfile
cat tmpfile |grep Version | awk '{print "bwa " $0}' >> ${output_file}
rm tmpfile
samtools --version |head -1 >> ${output_file}
pilon --version >> ${output_file}
prokka -v >> ${output_file}
quast -v >> ${output_file}
ariba version |head -1 >> ${output_file}


fi
24 changes: 12 additions & 12 deletions conf/asm_annot_template.config
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,19 @@
params.reads = "../testdata/risk_short/*L00{1,2}_R{1,2}_001.fastq.gz"
params.setsize = 4

// Specify the name of the output directory, relative to where the script is being run
params.out_dir = "track_three"

// General configuration variables
params.pwd = "$PWD"
params.help = false
params.savemode = "copy"


// Directory to where data is stored
params.data_dir = "/cluster/projects/nn9305k/vi_pipeline_data/bifrost_data"
params.multiqc_config = "/cluster/projects/nn9305k/Bifrost22/conf/multiqc_config.yaml"

// BBDuk params, has to be absolute paths
params.stripgenome = "/cluster/projects/nn9305k/genome_references/genomes/PhiX/PhiX.fasta"
params.stripdir = "/cluster/projects/nn9305k/genome_references/bbmap_refs"
params.stripgenome = "${params.data_dir}/genome_references/genomes/PhiX/PhiX.fasta"
params.stripdir = "${params.data_dir}/genome_references/bbmap_refs"


// Trimmomatic configuration variables
Expand All @@ -36,14 +38,13 @@ params.slidingwindow = "4:15"
params.leading = 3
params.trailing = 3
params.minlen = 36
params.adapters = "TruSeq3-PE.fa"
params.adapter_dir = "/cluster/projects/nn9305k/db_flatfiles/trimmomatic_adapters"
params.adapters = "NexteraPE-PE.fa"
params.adapter_dir = "${params.data_dir}/trimmomatic_adapters"


// SPAdes configuration variables
params.assembly = "spades_asm"
params.careful = "--careful"
params.cov_cutoff = "off"
params.cov_cutoff = "auto"
params.min_contig_len = "500"

// PROKKA configuration variables
Expand All @@ -58,6 +59,5 @@ params.centre = "NVI"


// QUAST variables
params.genome_directory = "/cluster/projects/nn9305k/genome_references/genomes/"
params.quast_ref = "${params.genome_directory}ecoli/GCF_000005845.2_ASM584v2_genomic.fna"
params.quast_genes = "${params.genome_directory}ecoli/GCF_000005845.2_ASM584v2_genomic.gff"
params.quast_ref = "${params.data_dir}/genome_references/genomes/ecoli/GCF_000005845.2_ASM584v2_genomic.fna"
params.quast_genes = "${params.data_dir}/genome_references/genomes/ecoli/GCF_000005845.2_ASM584v2_genomic.gff"
Loading

0 comments on commit 1252383

Please sign in to comment.