diff --git a/CHANGELOG.md b/CHANGELOG.md index 9cdb7a0b..f3bbf9a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,26 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[1.7](https://github.com/nf-core/fetchngs/releases/tag/1.7)] - 2022-07-01 + +### :warning: Major enhancements + +Support for GEO ids has been dropped in this release due to breaking changes introduced in the NCBI API. For more detailed information please see [this PR](https://github.com/nf-core/fetchngs/pull/102). + +As a workaround, if you have a GEO accession you can directly download a text file containing the appropriate SRA ids to pass to the pipeline: + +- Search for your GEO accession on [GEO](https://www.ncbi.nlm.nih.gov/geo) +- Click `SRA Run Selector` at the bottom of the GEO accession page +- Select the desired samples in the `SRA Run Selector` and then download the `Accession List` + +This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline e.g. `--input SRR_Acc_List.txt`. + +### Enhancements & fixes + +- [#97](https://github.com/nf-core/fetchngs/pull/97) - Add support for generating nf-core/taxprofiler compatible samplesheets. +- [#99](https://github.com/nf-core/fetchngs/issues/99) - SRA_IDS_TO_RUNINFO fails due to bad request +- Add `enum` field for `--nf_core_pipeline` to parameter schema so that only supported pipelines are accepted + ## [[1.6](https://github.com/nf-core/fetchngs/releases/tag/1.6)] - 2022-05-17 - [#57](https://github.com/nf-core/fetchngs/pull/57) - fetchngs fails if FTP is blocked diff --git a/README.md b/README.md index 18b27ca3..1151db3d 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ ## Introduction -**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. 
At present, the pipeline supports SRA / ENA / DDBJ / GEO / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). +**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. At present, the pipeline supports SRA / ENA / DDBJ / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. @@ -27,7 +27,7 @@ On release, automated continuous integration tests run the pipeline on a full-si Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.txt)) the pipeline performs the following steps: -### SRA / ENA / DDBJ / GEO ids +### SRA / ENA / DDBJ ids 1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html) 2. Fetch extensive id metadata via ENA API @@ -36,6 +36,18 @@ Via a single file of ids, provided one-per-line (see [example input file](https: - Otherwise use [`sra-tools`](https://github.com/ncbi/sra-tools) to download `.sra` files and convert them to FastQ 4. Collate id metadata and paths to FastQ files in a single samplesheet +### GEO ids + +Support for GEO ids was dropped in [[v1.7](https://github.com/nf-core/fetchngs/releases/tag/1.7)] due to breaking changes introduced in the NCBI API. For more detailed information please see [this PR](https://github.com/nf-core/fetchngs/pull/102). 
+ +As a workaround, if you have a GEO accession you can directly download a text file containing the appropriate SRA ids to pass to the pipeline instead: + +- Search for your GEO accession on [GEO](https://www.ncbi.nlm.nih.gov/geo) +- Click `SRA Run Selector` at the bottom of the GEO accession page +- Select the desired samples in the `SRA Run Selector` and then download the `Accession List` + +This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline e.g. `--input SRR_Acc_List.txt`. + ### Synapse ids 1. Resolve Synapse directory ids to their corresponding FastQ files ids via the `synapse list` command. @@ -45,7 +57,13 @@ Via a single file of ids, provided one-per-line (see [example input file](https: ### Samplesheet format -The columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines, these currently include [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) and the Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format). You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. `--nf_core_pipeline rnaseq`. More pipelines will be supported in due course as we adopt and standardise samplesheet input across nf-core. +The columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines, these currently include: + +- [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) +- Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format) +- [nf-core/taxprofiler](https://nf-co.re/nf-core/taxprofiler) + +You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. `--nf_core_pipeline rnaseq`. More pipelines will be supported in due course as we adopt and standardise samplesheet input across nf-core. 
## Quick Start diff --git a/assets/schema_input.json b/assets/schema_input.json index 9a800216..71f0f976 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -8,8 +8,8 @@ "type": "array", "items": { "type": "string", - "pattern": "^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(GS[EM])|(syn))(\\d+)$", - "errorMessage": "Please provide a valid SRA, ENA, DDBJ or GEO identifier" + "pattern": "^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(syn))(\\d+)$", + "errorMessage": "Please provide a valid SRA, ENA or DDBJ identifier" } } } diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index bbd1e6f0..21c7225d 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -193,9 +193,11 @@ def is_valid(cls, identifier): class DatabaseResolver: """Define a service class for resolving various identifiers to experiments.""" - _GEO_PREFIXES = {"GSE"} + _GEO_PREFIXES = { + "GSE", + "GSM" + } _SRA_PREFIXES = { - "GSM", "PRJNA", "SAMN", "SRR", @@ -207,7 +209,9 @@ class DatabaseResolver: "PRJDB", "SAMD", } - _ENA_PREFIXES = {"ERR"} + _ENA_PREFIXES = { + "ERR" + } @classmethod def expand_identifier(cls, identifier): @@ -246,13 +250,13 @@ def _content_check(cls, response, identifier): def _id_to_srx(cls, identifier): """Resolve the identifier to SRA experiments.""" params = { - "save": "efetch", + "id": identifier, "db": "sra", "rettype": "runinfo", - "term": identifier, + "retmode": "text" } response = fetch_url( - f"https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?{urlencode(params)}" + f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{urlencode(params)}" ) cls._content_check(response, identifier) return [row["Experiment"] for row in open_table(response, delimiter=",")] @@ -261,9 +265,14 @@ def _id_to_srx(cls, identifier): def _gse_to_srx(cls, identifier): """Resolve the identifier to SRA experiments.""" ids = [] - params = {"acc": identifier, "targ": "gsm", "view": "data", "form": "text"} + params = { + "id": 
identifier, + "db": "gds", + "rettype": "runinfo", + "retmode": "text" + } response = fetch_url( - f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?{urlencode(params)}" + f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{urlencode(params)}" ) cls._content_check(response, identifier) gsm_ids = [ diff --git a/docs/output.md b/docs/output.md index 7402976c..daaca914 100644 --- a/docs/output.md +++ b/docs/output.md @@ -9,19 +9,19 @@ This document describes the output produced by the pipeline. The directories lis The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data depending on the type of ids provided: - Download FastQ files and create samplesheet from: - 1. [SRA / ENA / DDBJ / GEO ids](#sra--ena--ddbj--geo-ids) + 1. [SRA / ENA / DDBJ ids](#sra--ena--ddbj-ids) 2. [Synapse ids](#synapse-ids) - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for a list of supported public repository identifiers and how to provide them to the pipeline. -### SRA / ENA / DDBJ / GEO ids +### SRA / ENA / DDBJ ids
Output files - `fastq/` - - `*.fastq.gz`: Paired-end/single-end reads downloaded from the SRA / ENA / DDBJ / GEO. + - `*.fastq.gz`: Paired-end/single-end reads downloaded from the SRA / ENA / DDBJ. - `fastq/md5/` - `*.md5`: Files containing `md5` sum for FastQ files downloaded from the ENA. - `samplesheet/` diff --git a/docs/usage.md b/docs/usage.md index b6c97e33..ba4de1b2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -8,15 +8,15 @@ The pipeline has been set-up to automatically download and process the raw FastQ files from both public and private repositories. Identifiers can be provided in a file, one-per-line via the `--input` parameter. Currently, the following types of example identifiers are supported: -| `SRA` | `ENA` | `DDBJ` | `GEO` | `Synapse` | -| ------------ | ------------ | ------------ | ---------- | ----------- | -| SRR11605097 | ERR4007730 | DRR171822 | GSM4432381 | syn26240435 | -| SRX8171613 | ERX4009132 | DRX162434 | GSE147507 | | -| SRS6531847 | ERS4399630 | DRS090921 | | | -| SAMN14689442 | SAMEA6638373 | SAMD00114846 | | | -| SRP256957 | ERP120836 | DRP004793 | | | -| SRA1068758 | ERA2420837 | DRA008156 | | | -| PRJNA625551 | PRJEB37513 | PRJDB4176 | | | +| `SRA` | `ENA` | `DDBJ` | `Synapse` | +| ------------ | ------------ | ------------ | ----------- | +| SRR11605097 | ERR4007730 | DRR171822 | syn26240435 | +| SRX8171613 | ERX4009132 | DRX162434 | | +| SRS6531847 | ERS4399630 | DRS090921 | | +| SAMN14689442 | SAMEA6638373 | SAMD00114846 | | +| SRP256957 | ERP120836 | DRP004793 | | +| SRA1068758 | ERA2420837 | DRA008156 | | +| PRJNA625551 | PRJEB37513 | PRJDB4176 | | ### SRR / ERR / DRR ids @@ -55,7 +55,13 @@ The final sample information for the FastQ files used for samplesheet generation ### Samplesheet format -As a bonus, the columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines, these currently include [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) and 
the Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format). You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. `--nf_core_pipeline rnaseq`. More pipelines will be supported in due course as we adopt and standardise samplesheet input across nf-core. It is highly recommended that you double-check that all of the identifiers you defined using `--input` are represented in the samplesheet. Also, public databases don't reliably hold information such as strandedness information so you may need to amend these entries too if for example your samplesheet was created by providing `--nf_core_pipeline rnaseq`. +As a bonus, the columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines, these currently include: + +- [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) +- Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format) +- [nf-core/taxprofiler](https://nf-co.re/nf-core/taxprofiler) + +You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. `--nf_core_pipeline rnaseq`. More pipelines will be supported in due course as we adopt and standardise samplesheet input across nf-core. It is highly recommended that you double-check that all of the identifiers you defined using `--input` are represented in the samplesheet. Also, public databases don't reliably hold information such as strandedness information so you may need to amend these entries too if for example your samplesheet was created by providing `--nf_core_pipeline rnaseq`. 
### Bypass `FTP` data download diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index f64fa80f..77b7ffde 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -104,7 +104,7 @@ class WorkflowMain { if (num_match == total_ids) { is_sra = true } else { - log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ / GEO or Synapse ids!" + log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ or Synapse ids!" System.exit(1) } } @@ -129,7 +129,7 @@ class WorkflowMain { if (num_match == total_ids) { is_synapse = true } else { - log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ / GEO or Synapse ids!" + log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ or Synapse ids!" System.exit(1) } } diff --git a/lib/WorkflowSra.groovy b/lib/WorkflowSra.groovy index a2c16219..90d86f1c 100755 --- a/lib/WorkflowSra.groovy +++ b/lib/WorkflowSra.groovy @@ -29,4 +29,21 @@ class WorkflowSra { " running nf-core/other pipelines.\n" + "===================================================================================" } + + // Fail pipeline if input ids are from the GEO + public static void isGeoFail(ids, log) { + def pattern = /^(GS[EM])(\d+)$/ + for (id in ids) { + if (id =~ pattern) { + log.error "===================================================================================\n" + + " GEO id detected: ${id}\n" + + " Support for GEO ids was dropped in v1.7 due to breaking changes in the NCBI API.\n" + + " Please remove any GEO ids from the input samplesheet.\n\n" + + " Please see:\n" + + " https://github.com/nf-core/fetchngs/pull/102\n" + + "===================================================================================" + System.exit(1) + } + } + } } diff --git a/main.nf b/main.nf index c6303a41..2c4b52f2 100644 
--- a/main.nf +++ b/main.nf @@ -44,7 +44,7 @@ if (WorkflowMain.isSraId(ch_input, log)) { } else if (WorkflowMain.isSynapseId(ch_input, log)) { input_type = 'synapse' } else { - exit 1, 'Ids provided via --input not recognised please make sure they are either SRA / ENA / DDBJ / GEO or Synapse ids!' + exit 1, 'Ids provided via --input not recognised please make sure they are either SRA / ENA / DDBJ or Synapse ids!' } if (params.input_type == input_type) { @@ -63,7 +63,7 @@ if (params.input_type == input_type) { workflow NFCORE_FETCHNGS { // - // WORKFLOW: Download FastQ files for SRA / ENA / DDBJ / GEO ids + // WORKFLOW: Download FastQ files for SRA / ENA / DDBJ ids // if (params.input_type == 'sra') { SRA ( ch_ids ) diff --git a/modules/local/sra_to_samplesheet.nf b/modules/local/sra_to_samplesheet.nf index 465144d7..4b448e89 100644 --- a/modules/local/sra_to_samplesheet.nf +++ b/modules/local/sra_to_samplesheet.nf @@ -39,6 +39,8 @@ process SRA_TO_SAMPLESHEET { if (pipeline) { if (pipeline == 'rnaseq') { pipeline_map << [ strandedness: 'unstranded' ] + } else if (pipeline == 'taxprofiler') { + pipeline_map << [ fasta: '' ] } } pipeline_map << meta_map diff --git a/nextflow.config b/nextflow.config index 237190ec..80ed6507 100644 --- a/nextflow.config +++ b/nextflow.config @@ -158,7 +158,7 @@ manifest { description = 'Pipeline to fetch metadata and raw FastQ files from public databases' mainScript = 'main.nf' nextflowVersion = '!>=21.10.3' - version = '1.6' + version = '1.7' } // Load modules.config for DSL2 module specific options diff --git a/nextflow_schema.json b/nextflow_schema.json index 2bc7c52e..a51dc45a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -19,7 +19,7 @@ "pattern": "^\\S+\\.txt$", "schema": "assets/schema_input.json", "fa_icon": "fas fa-file-excel", - "description": "File containing SRA/ENA/DDBJ/GEO identifiers one per line to download their associated metadata and FastQ files." 
+ "description": "File containing SRA/ENA/DDBJ identifiers one per line to download their associated metadata and FastQ files." }, "input_type": { "type": "string", @@ -43,7 +43,8 @@ "nf_core_pipeline": { "type": "string", "fa_icon": "fab fa-apple", - "description": "Name of supported nf-core pipeline e.g. 'rnaseq'. A samplesheet for direct use with the pipeline will be created with the appropriate columns." + "description": "Name of supported nf-core pipeline e.g. 'rnaseq'. A samplesheet for direct use with the pipeline will be created with the appropriate columns.", + "enum": ["rnaseq", "viralrecon", "taxprofiler"] }, "force_sratools_download": { "type": "boolean", diff --git a/workflows/sra.nf b/workflows/sra.nf index 2b3cb498..0dde4588 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -50,6 +50,13 @@ workflow SRA { main: ch_versions = Channel.empty() + // + // Fail the pipeline if GEO ids detected + // + ids + .collect() + .map { WorkflowSra.isGeoFail(it, log) } + // // MODULE: Get SRA run information for public database ids //