diff --git a/bin/taxref_reformat_sidle.sh b/bin/taxref_reformat_sidle.sh
new file mode 100755
index 00000000..67174b8e
--- /dev/null
+++ b/bin/taxref_reformat_sidle.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+
+# Untar any tar file in the working directory
+tar xzf database.tar.gz
+
+# Greengenes 13_8
+if [ -d "gg_13_8_otus" ]; then
+    mv gg_13_8_otus/rep_set/99_otus.fasta gg_13_8_otus_rep_set_99_otus.seq.fasta
+    mv gg_13_8_otus/rep_set_aligned/99_otus.fasta gg_13_8_otus_rep_set_aligned_99_otus.alnseq.fasta
+    mv gg_13_8_otus/taxonomy/99_otu_taxonomy.txt gg_13_8_otus_taxonomy_99_otu_taxonomy.tax.txt
+# SILVA 128 (paths are relative to the working directory the archive was extracted into)
+elif [ -d "SILVA_128_QIIME_release" ]; then
+    mv SILVA_128_QIIME_release/rep_set/rep_set_all/99/99_otus.fasta SILVA_128_QIIME_release_rep_set_all_99_otus.seq.fasta
+    gunzip -c SILVA_128_QIIME_release/rep_set_aligned/99/99_otus_aligned.fasta.gz > SILVA_128_QIIME_release_rep_set_aligned_99_otus_aligned.alnseq.fasta
+    mv SILVA_128_QIIME_release/taxonomy/taxonomy_all/99/consensus_taxonomy_7_levels SILVA_128_QIIME_release_taxonomy_all_99_consensus_taxonomy_7_levels.tax.txt
+else
+    echo "Didnt detect any expected directory" >&2
+    exit 1
+fi
+
+
+
diff --git a/conf/ref_databases.config b/conf/ref_databases.config
index e89df338..7b702dad 100644
--- a/conf/ref_databases.config
+++ b/conf/ref_databases.config
@@ -488,4 +488,41 @@ params {
             taxlevels = "D,P,C,O,F,G,S"
         }
     }
+    // Sidle reference databases
+    sidle_ref_databases {
+        'silva' {
+            title = "SILVA - Version 128"
+            file = [ "https://www.arb-silva.de/fileadmin/silva_databases/qiime/Silva_128_release.tgz" ]
+            tree_qza = [ "https://data.qiime2.org/2021.4/common/sepp-refs-silva-128.qza" ]
+            citation = "https://www.arb-silva.de/; Bokulich, N.A., Robeson, M., Dillon, M.R. bokulich-lab/RESCRIPt. Zenodo. 
http://doi.org/10.5281/zenodo.3891931"
+            license = "https://www.arb-silva.de/silva-license-information/"
+            fmtscript = "taxref_reformat_sidle.sh"
+            taxlevels = "D,P,C,O,F,G"
+        }
+        'silva=128' {
+            title = "SILVA - Version 128"
+            file = [ "https://www.arb-silva.de/fileadmin/silva_databases/qiime/Silva_128_release.tgz" ]
+            tree_qza = [ "https://data.qiime2.org/2021.4/common/sepp-refs-silva-128.qza" ]
+            citation = "https://www.arb-silva.de/; Bokulich, N.A., Robeson, M., Dillon, M.R. bokulich-lab/RESCRIPt. Zenodo. http://doi.org/10.5281/zenodo.3891931"
+            license = "https://www.arb-silva.de/silva-license-information/"
+            fmtscript = "taxref_reformat_sidle.sh"
+            taxlevels = "D,P,C,O,F,G"
+        }
+        'greengenes' {
+            title = "Greengenes - Version 13_8"
+            file = [ "ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz" ]
+            tree_qza = [ "https://data.qiime2.org/2021.4/common/sepp-refs-gg-13-8.qza" ]
+            citation = "McDonald, D., Price, M., Goodrich, J. et al. An improved Greengenes taxonomy with explicit ranks for ecological and evolutionary analyses of bacteria and archaea. ISME J 6, 610–618 (2012). https://doi.org/10.1038/ismej.2011.139"
+            fmtscript = "taxref_reformat_sidle.sh"
+            taxlevels = "D,P,C,O,F,G,S"
+        }
+        'greengenes=13_8' {
+            title = "Greengenes - Version 13_8"
+            file = [ "ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz" ]
+            tree_qza = [ "https://data.qiime2.org/2021.4/common/sepp-refs-gg-13-8.qza" ]
+            citation = "McDonald, D., Price, M., Goodrich, J. et al. An improved Greengenes taxonomy with explicit ranks for ecological and evolutionary analyses of bacteria and archaea. ISME J 6, 610–618 (2012). 
https://doi.org/10.1038/ismej.2011.139"
+            fmtscript = "taxref_reformat_sidle.sh"
+            taxlevels = "D,P,C,O,F,G,S"
+        }
+    }
 }
diff --git a/modules/local/format_taxonomy_sidle.nf b/modules/local/format_taxonomy_sidle.nf
new file mode 100644
index 00000000..a6b909f5
--- /dev/null
+++ b/modules/local/format_taxonomy_sidle.nf
@@ -0,0 +1,38 @@
+process FORMAT_TAXONOMY_SIDLE {
+    label 'process_low'
+
+    conda "conda-forge::sed=4.7"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img' :
+        'docker.io/biocontainers/biocontainers:v1.2.0_cv1' }"
+
+    input:
+    path('database.tar.gz')
+    val(suffix)
+
+    output:
+    path( "*.seq.fasta" ) , emit: seq
+    path( "*.alnseq.fasta") , emit: alnseq
+    path( "*.tax.txt") , emit: tax
+    path( "ref_taxonomy.*.txt") , emit: ref_tax_info
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    """
+    ${params.sidle_ref_databases[params.sidle_ref_taxonomy]["fmtscript"]}
+
+    #Giving out information
+    echo -e "--sidle_ref_taxonomy: ${params.sidle_ref_taxonomy}\n" >ref_taxonomy.${suffix}.txt
+    echo -e "Title: ${params.sidle_ref_databases[params.sidle_ref_taxonomy]["title"]}\n" >>ref_taxonomy.${suffix}.txt
+    echo -e "Citation: ${params.sidle_ref_databases[params.sidle_ref_taxonomy]["citation"]}\n" >>ref_taxonomy.${suffix}.txt
+    echo "All entries: ${params.sidle_ref_databases[params.sidle_ref_taxonomy]}" >>ref_taxonomy.${suffix}.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bash: \$(bash --version | sed -n 1p | sed 's/GNU bash, version //g')
+    END_VERSIONS
+    """
+}
diff --git a/nextflow.config b/nextflow.config
index d1d582a1..7c93e99a 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -118,10 +118,9 @@ params {
     kraken2_assign_taxlevels = null
     kraken2_ref_tax_custom = null
     kraken2_confidence = 0.0
-    sidle_ref_sequences = null
- 
sidle_ref_alignedseq = null
     sidle_ref_taxonomy = null
-    sidle_ref_tree = null
+    sidle_ref_tax_custom = null
+    sidle_ref_tree_custom = null
 
     // MultiQC options
     multiqc_config = null
@@ -159,7 +158,7 @@ params {
     // Schema validation default options
     validationFailUnrecognisedParams = false
     validationLenientMode = false
-    validationSchemaIgnoreParams = 'dada_ref_databases,qiime_ref_databases,sintax_ref_databases,kraken2_ref_databases,genomes,igenomes_base'
+    validationSchemaIgnoreParams = 'dada_ref_databases,qiime_ref_databases,sintax_ref_databases,kraken2_ref_databases,sidle_ref_databases,genomes,igenomes_base'
     validationShowHiddenParams = false
     validate_params = true
 
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 39c34d82..f8e31a9a 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -504,24 +504,25 @@
                     "minimum": 0,
                     "maximum": 1
                 },
-                "sidle_ref_sequences": {
-                    "type": "string",
-                    "help_text": "",
-                    "description": "Path to SIDLE reference taxonomy sequences (*.fasta)"
-                },
-                "sidle_ref_alignedseq": {
+                "sidle_ref_taxonomy": {
                     "type": "string",
                     "help_text": "",
-                    "description": "Path to SIDLE aligned reference taxonomy sequences (*.fasta)"
+                    "description": "Name of supported database, and optionally also version number",
+                    "enum": [
+                        "silva",
+                        "silva=128",
+                        "greengenes",
+                        "greengenes=13_8"
+                    ]
                 },
-                "sidle_ref_taxonomy": {
+                "sidle_ref_tax_custom": {
                     "type": "string",
-                    "help_text": "",
-                    "description": "Path to SIDLE reference taxonomy strings (*.txt)"
+                    "help_text": "Consider also setting `--sidle_ref_tree_custom`. 
Example usage: `--sidle_ref_tax_custom 'rep_set_99.fasta,rep_set_aligned_99.fasta,taxonomy_99_taxonomy.txt'`", + "description": "Comma separated paths to three files: reference taxonomy sequences (*.fasta), aligned reference taxonomy sequences (*.fasta), reference taxonomy strings (*.txt)" }, - "sidle_ref_tree": { + "sidle_ref_tree_custom": { "type": "string", - "help_text": "", + "help_text": "Overwrites tree chosen by `--sidle_ref_taxonomy`", "description": "Path to SIDLE reference taxonomy tree (*.qza)" }, "sintax_ref_taxonomy": { diff --git a/subworkflows/local/sidle_wf.nf b/subworkflows/local/sidle_wf.nf index 54101cf9..865952da 100644 --- a/subworkflows/local/sidle_wf.nf +++ b/subworkflows/local/sidle_wf.nf @@ -2,33 +2,44 @@ * Training of a classifier with QIIME2 */ -include { SIDLE_INDB } from '../../modules/local/sidle_indb' -include { SIDLE_INDBALIGNED } from '../../modules/local/sidle_indbaligned' -include { SIDLE_DBFILT } from '../../modules/local/sidle_dbfilt' -include { SIDLE_IN } from '../../modules/local/sidle_in' -include { SIDLE_TRIM } from '../../modules/local/sidle_trim' -include { SIDLE_DBEXTRACT } from '../../modules/local/sidle_dbextract' -include { SIDLE_ALIGN } from '../../modules/local/sidle_align' -include { SIDLE_DBRECON } from '../../modules/local/sidle_dbrecon' -include { SIDLE_TABLERECON } from '../../modules/local/sidle_tablerecon' -include { SIDLE_TAXRECON } from '../../modules/local/sidle_taxrecon' -include { SIDLE_FILTTAX } from '../../modules/local/sidle_filttax' -include { SIDLE_SEQRECON } from '../../modules/local/sidle_seqrecon' -include { SIDLE_TREERECON } from '../../modules/local/sidle_treerecon' - +include { FORMAT_TAXONOMY_SIDLE } from '../../modules/local/format_taxonomy_sidle' +include { SIDLE_INDB } from '../../modules/local/sidle_indb' +include { SIDLE_INDBALIGNED } from '../../modules/local/sidle_indbaligned' +include { SIDLE_DBFILT } from '../../modules/local/sidle_dbfilt' +include { SIDLE_IN } from '../../modules/local/sidle_in' +include { SIDLE_TRIM } from 
'../../modules/local/sidle_trim'
+include { SIDLE_DBEXTRACT } from '../../modules/local/sidle_dbextract'
+include { SIDLE_ALIGN } from '../../modules/local/sidle_align'
+include { SIDLE_DBRECON } from '../../modules/local/sidle_dbrecon'
+include { SIDLE_TABLERECON } from '../../modules/local/sidle_tablerecon'
+include { SIDLE_TAXRECON } from '../../modules/local/sidle_taxrecon'
+include { SIDLE_FILTTAX } from '../../modules/local/sidle_filttax'
+include { SIDLE_SEQRECON } from '../../modules/local/sidle_seqrecon'
+include { SIDLE_TREERECON } from '../../modules/local/sidle_treerecon'
 
 workflow SIDLE_WF {
     take:
     ch_asv_tables_sequences
-    ch_db_sequences
-    ch_db_alignedsequences
-    ch_db_taxonomy
+    ch_sidle_ref_taxonomy
+    val_sidle_ref_taxonomy
     ch_db_tree
 
     main:
     ch_sidle_versions = Channel.empty()
 
     // DB
+    if (!params.sidle_ref_tax_custom) {
+        //standard ref taxonomy input from conf/ref_databases.config, one tar.gz / tgz with all files
+        FORMAT_TAXONOMY_SIDLE ( ch_sidle_ref_taxonomy, val_sidle_ref_taxonomy )
+        ch_db_sequences = FORMAT_TAXONOMY_SIDLE.out.seq
+        ch_db_alignedsequences = FORMAT_TAXONOMY_SIDLE.out.alnseq
+        ch_db_taxonomy = FORMAT_TAXONOMY_SIDLE.out.tax
+    } else {
+        //input from params.sidle_ref_tax_custom: it[0] = fasta = ch_db_sequences, it[1] = aligned fasta = ch_db_alignedsequences, it[2] = taxonomy txt = ch_db_taxonomy
+        ch_db_sequences = ch_sidle_ref_taxonomy.map{ it[0] }
+        ch_db_alignedsequences = ch_sidle_ref_taxonomy.map{ it[1] }
+        ch_db_taxonomy = ch_sidle_ref_taxonomy.map{ it[2] }
+    }
     SIDLE_INDB ( ch_db_sequences, ch_db_taxonomy )
     ch_sidle_versions = ch_sidle_versions.mix(SIDLE_INDB.out.versions)
     SIDLE_INDBALIGNED ( ch_db_alignedsequences )
diff --git a/workflows/ampliseq.nf b/workflows/ampliseq.nf
index 447907dd..880203d1 100644
--- a/workflows/ampliseq.nf
+++ b/workflows/ampliseq.nf
@@ -42,6 +42,28 @@ if (params.classifier) {
     ch_qiime_classifier = Channel.fromPath("${params.classifier}", checkIfExists: true)
 } else { ch_qiime_classifier = 
Channel.empty() }
 
+if (params.sidle_ref_tax_custom) {
+    if ("${params.sidle_ref_tax_custom}".contains(",")) {
+        sidle_ref_paths = "${params.sidle_ref_tax_custom}".split(",")
+        if (sidle_ref_paths.length != 3) {
+            error "--sidle_ref_tax_custom accepts exactly three filepaths separated by a comma (fasta, aligned fasta, taxonomy). Please review input."
+        }
+        ch_sidle_ref_taxonomy = Channel.fromPath( Arrays.asList(sidle_ref_paths), checkIfExists: true )
+    } else {
+        error "--sidle_ref_tax_custom accepts exactly three filepaths separated by a comma. Please review input."
+    }
+    val_sidle_ref_taxonomy = "user"
+    ch_sidle_ref_taxonomy_tree = params.sidle_ref_tree_custom ? Channel.fromPath("${params.sidle_ref_tree_custom}", checkIfExists: true) : Channel.empty()
+} else if (params.sidle_ref_taxonomy) {
+    ch_sidle_ref_taxonomy = Channel.fromList( params.sidle_ref_databases[params.sidle_ref_taxonomy]["file"] ).map { file(it) }
+    ch_sidle_ref_taxonomy_tree = params.sidle_ref_tree_custom ? Channel.fromPath("${params.sidle_ref_tree_custom}", checkIfExists: true) : Channel.fromList( params.sidle_ref_databases[params.sidle_ref_taxonomy]["tree_qza"] ).map { file(it) }
+    val_sidle_ref_taxonomy = params.sidle_ref_taxonomy.replace('=','_').replace('.','_')
+} else {
+    ch_sidle_ref_taxonomy = Channel.empty()
+    ch_sidle_ref_taxonomy_tree = Channel.empty()
+    val_sidle_ref_taxonomy = "none"
+}
+
 if (params.dada_ref_tax_custom) {
     //custom ref taxonomy input from params.dada_ref_tax_custom & params.dada_ref_tax_custom_sp
     ch_assigntax = Channel.fromPath("${params.dada_ref_tax_custom}", checkIfExists: true)
@@ -439,10 +461,9 @@ workflow AMPLISEQ {
         // run q2-sidle
         SIDLE_WF (
             DADA2_SPLITREGIONS.out.for_sidle,
-            file( params.sidle_ref_sequences, checkIfExists: true ), //TODO: ch_sidle_ref_sequences // "gg_13_8_otus_rep_set_99_otus.fasta"
-            file( params.sidle_ref_alignedseq, checkIfExists: true ), //TODO: ch_sidle_ref_alignedseq // "gg_13_8_otus_taxonomy_99_otu_taxonomy.txt"
-            file( 
params.sidle_ref_taxonomy, checkIfExists: true ), //TODO: ch_sidle_ref_taxonomy // "gg_13_8_otus_taxonomy_99_otu_taxonomy.txt"
-            file( params.sidle_ref_tree, checkIfExists: true ) //TODO: ch_sidle_ref_tree // https://data.qiime2.org/2021.4/common/sepp-refs-gg-13-8.qza
+            ch_sidle_ref_taxonomy.collect(),
+            val_sidle_ref_taxonomy,
+            ch_sidle_ref_taxonomy_tree
         )
         ch_versions = ch_versions.mix(SIDLE_WF.out.versions)
     }