diff --git a/.editorconfig b/.editorconfig index b6b31907..b78de6e6 100644 --- a/.editorconfig +++ b/.editorconfig @@ -8,7 +8,7 @@ trim_trailing_whitespace = true indent_size = 4 indent_style = space -[*.{md,yml,yaml,html,css,scss,js}] +[*.{md,yml,yaml,html,css,scss,js,cff}] indent_size = 2 # These files are edited and tested upstream in nf-core/modules diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 1f17f774..c9f23b88 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -15,8 +15,7 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/fetc - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! - - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/fetchngs/tree/master/.github/CONTRIBUTING.md) - - [ ] If necessary, also make a PR on the nf-core/fetchngs _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. +- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/fetchngs/tree/master/.github/CONTRIBUTING.md)- [ ] If necessary, also make a PR on the nf-core/fetchngs _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). - [ ] Usage Documentation in `docs/usage.md` is updated. diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index fe67f343..d648893c 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -25,3 +25,7 @@ jobs: "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/fetchngs/results-${{ github.sha }}" } profiles: test_full,aws_tower + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index d6cd3108..6e6a8c52 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -23,3 +23,7 @@ jobs: "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/fetchngs/results-test-${{ github.sha }}" } profiles: test,aws_tower + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b24c607a..6939e1b9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,7 +10,6 @@ on: env: NXF_ANSI_LOG: false - CAPSULE_LOG: none jobs: test: @@ -20,27 +19,17 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - # Nextflow versions - include: - # Test pipeline minimum Nextflow version - - NXF_VER: "21.10.3" - NXF_EDGE: "" - # Test latest edge release of Nextflow - - NXF_VER: "" - NXF_EDGE: "1" + NXF_VER: + - "21.10.3" + - "latest-everything" steps: - name: Check out pipeline code uses: actions/checkout@v2 - name: Install Nextflow - env: - NXF_VER: ${{ matrix.NXF_VER }} - # Uncomment only if the edge release is more recent than the latest stable release - # See https://github.com/nextflow-io/nextflow/issues/2467 - # NXF_EDGE: ${{ matrix.NXF_EDGE }} - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ + uses: nf-core/setup-nextflow@v1 + with: + version: "${{ matrix.NXF_VER }}" - name: Run pipeline with test data run: | 
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 77358dee..8a5ce69b 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -35,6 +35,36 @@ jobs: - name: Run Prettier --check run: prettier --check ${GITHUB_WORKSPACE} + PythonBlack: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Check code lints with Black + uses: psf/black@stable + + # If the above check failed, post a comment on the PR explaining the failure + - name: Post PR comment + if: failure() + uses: mshick/add-pr-comment@v1 + with: + message: | + ## Python linting (`black`) is failing + + To keep the code consistent with lots of contributors, we run automated code consistency checks. + To fix this CI test, please run: + + * Install [`black`](https://black.readthedocs.io/en/stable/): `pip install black` + * Fix formatting errors in your pipeline: `black .` + + Once you push these changes the test should pass, and you can hide this comment :+1: + + We highly recommend setting up Black in your code editor so that this formatting is done automatically on save. Ask about it on Slack for help! + + Thanks again for your contribution! + repo-token: ${{ secrets.GITHUB_TOKEN }} + allow-repeats: false + nf-core: runs-on: ubuntu-latest steps: @@ -42,15 +72,11 @@ jobs: uses: actions/checkout@v2 - name: Install Nextflow - env: - CAPSULE_LOG: none - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ + uses: nf-core/setup-nextflow@v1 - uses: actions/setup-python@v3 with: - python-version: "3.6" + python-version: "3.7" architecture: "x64" - name: Install dependencies diff --git a/.prettierignore b/.prettierignore index d0e7ae58..eb74a574 100644 --- a/.prettierignore +++ b/.prettierignore @@ -1,4 +1,5 @@ email_template.html +adaptivecard.json .nextflow* work/ data/ diff --git a/CHANGELOG.md b/CHANGELOG.md index f3bbf9a7..b57bfefc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [[1.8](https://github.com/nf-core/fetchngs/releases/tag/1.8)] - 2022-11-08 + +### Enhancements & fixes + +- [#111](https://github.com/nf-core/fetchngs/issues/111) - Change input mimetype to csv +- [#114](https://github.com/nf-core/fetchngs/issues/114) - Final samplesheet is not created when `--skip_fastq_download` is provided +- [#118](https://github.com/nf-core/fetchngs/issues/118) - Allow input pattern validation for csv/tsv/txt +- [#119](https://github.com/nf-core/fetchngs/issues/119) - `--force_sratools_download` results in different fastq names compared to FTP download +- [#121](https://github.com/nf-core/fetchngs/issues/121) - Add `tower.yml` to render samplesheet as Report in Tower +- Fetch `SRR` and `DRR` metadata from ENA API instead of NCBI API to bypass frequent breaking changes +- Updated pipeline template to [nf-core/tools 2.6](https://github.com/nf-core/tools/releases/tag/2.6) + ## [[1.7](https://github.com/nf-core/fetchngs/releases/tag/1.7)] - 2022-07-01 ### :warning: Major enhancements diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..017666c0 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,56 @@ +cff-version: 1.2.0 +message: "If you use `nf-core tools` in your work, please cite the `nf-core` publication" +authors: + - family-names: Ewels + given-names: Philip + - family-names: Peltzer + given-names: Alexander + - family-names: Fillinger + given-names: Sven + - family-names: Patel + given-names: Harshil + - family-names: Alneberg + given-names: Johannes + - family-names: Wilm + given-names: Andreas + - family-names: Garcia + given-names: Maxime Ulysse + - family-names: Di Tommaso + given-names: Paolo + - family-names: Nahnsen + given-names: Sven +title: "The nf-core framework for community-curated bioinformatics pipelines." +version: 2.4.1 +doi: 10.1038/s41587-020-0439-x +date-released: 2022-05-16 +url: https://github.com/nf-core/tools +prefered-citation: + type: article + authors: + - family-names: Ewels + given-names: Philip + - family-names: Peltzer + given-names: Alexander + - family-names: Fillinger + given-names: Sven + - family-names: Patel + given-names: Harshil + - family-names: Alneberg + given-names: Johannes + - family-names: Wilm + given-names: Andreas + - family-names: Garcia + given-names: Maxime Ulysse + - family-names: Di Tommaso + given-names: Paolo + - family-names: Nahnsen + given-names: Sven + doi: 10.1038/s41587-020-0439-x + journal: nature biotechnology + start: 276 + end: 278 + title: "The nf-core framework for community-curated bioinformatics pipelines." 
+ issue: 3 + volume: 38 + year: 2020 + url: https://dx.doi.org/10.1038/s41587-020-0439-x diff --git a/README.md b/README.md index 1151db3d..71b4602d 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,14 @@ # ![nf-core/fetchngs](docs/images/nf-core-fetchngs_logo_light.png#gh-light-mode-only) ![nf-core/fetchngs](docs/images/nf-core-fetchngs_logo_dark.png#gh-dark-mode-only) -[![GitHub Actions CI Status](https://github.com/nf-core/fetchngs/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/fetchngs/actions?query=workflow%3A%22nf-core+CI%22) -[![GitHub Actions Linting Status](https://github.com/nf-core/fetchngs/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/fetchngs/actions?query=workflow%3A%22nf-core+linting%22) -[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?logo=Amazon%20AWS)](https://nf-co.re/fetchngs/results) -[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.5070524-1073c8)](https://doi.org/10.5281/zenodo.5070524) +[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/fetchngs/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.5070524-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.5070524) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.10.3-23aa62.svg)](https://www.nextflow.io/) -[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?logo=anaconda)](https://docs.conda.io/en/latest/) -[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?logo=docker)](https://www.docker.com/) -[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg)](https://sylabs.io/docs/) +[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) +[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) +[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/nf-core/fetchngs) -[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23fetchngs-4A154B?logo=slack)](https://nfcore.slack.com/channels/fetchngs) -[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?logo=twitter)](https://twitter.com/nf_core) -[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?logo=youtube)](https://www.youtube.com/c/nf-core) +[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23fetchngs-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/fetchngs)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) ## Introduction @@ -21,11 +16,11 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. 
The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. -On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/fetchngs/results). +On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources.The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/fetchngs/results). ## Pipeline summary -Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.txt)) the pipeline performs the following steps: +Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.csv)) the pipeline performs the following steps: ### SRA / ENA / DDBJ ids @@ -46,7 +41,7 @@ As a workaround, if you have a GEO accession you can directly download a text fi - Click `SRA Run Selector` at the bottom of the GEO accession page - Select the desired samples in the `SRA Run Selector` and then download the `Accession List` -This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline e.g. `--input SRR_Acc_List.txt`. +This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline once renamed with a .csv extension e.g. `--input SRR_Acc_List.csv`. ### Synapse ids @@ -73,7 +68,7 @@ You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. 3. Download the pipeline and test it on a minimal dataset with a single command: - ```console + ```bash nextflow run nf-core/fetchngs -profile test,YOURPROFILE --outdir <OUTDIR> ``` @@ -87,7 +82,7 @@ You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. 4. Start running your own analysis!
```bash - nextflow run nf-core/fetchngs --input ids.txt --outdir -profile + nextflow run nf-core/fetchngs --input ids.csv --outdir -profile ``` ## Documentation diff --git a/assets/adaptivecard.json b/assets/adaptivecard.json new file mode 100644 index 00000000..26660eca --- /dev/null +++ b/assets/adaptivecard.json @@ -0,0 +1,67 @@ +{ + "type": "message", + "attachments": [ + { + "contentType": "application/vnd.microsoft.card.adaptive", + "contentUrl": null, + "content": { + "\$schema": "http://adaptivecards.io/schemas/adaptive-card.json", + "msteams": { + "width": "Full" + }, + "type": "AdaptiveCard", + "version": "1.2", + "body": [ + { + "type": "TextBlock", + "size": "Large", + "weight": "Bolder", + "color": "<% if (success) { %>Good<% } else { %>Attention<%} %>", + "text": "nf-core/fetchngs v${version} - ${runName}", + "wrap": true + }, + { + "type": "TextBlock", + "spacing": "None", + "text": "Completed at ${dateComplete} (duration: ${duration})", + "isSubtle": true, + "wrap": true + }, + { + "type": "TextBlock", + "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors. The full error message was: ${errorReport}.<% } %>", + "wrap": true + }, + { + "type": "TextBlock", + "text": "The command used to launch the workflow was as follows:", + "wrap": true + }, + { + "type": "TextBlock", + "text": "${commandLine}", + "isSubtle": true, + "wrap": true + } + ], + "actions": [ + { + "type": "Action.ShowCard", + "title": "Pipeline Configuration", + "card": { + "type": "AdaptiveCard", + "\$schema": "http://adaptivecards.io/schemas/adaptive-card.json", + "body": [ + { + "type": "FactSet", + "facts": [<% out << summary.collect{ k,v -> "{\"title\": \"$k\", \"value\" : \"$v\"}"}.join(",\n") %> + ] + } + ] + } + } + ] + } + } + ] +} diff --git a/assets/email_template.txt b/assets/email_template.txt index 2de93008..f9393aa8 100644 --- a/assets/email_template.txt +++ b/assets/email_template.txt @@ -6,7 +6,6 @@ `._,._,' nf-core/fetchngs v${version} ---------------------------------------------------- - Run Name: $runName <% if (success){ diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml new file mode 100644 index 00000000..f9309867 --- /dev/null +++ b/assets/methods_description_template.yml @@ -0,0 +1,25 @@ +id: "nf-core-fetchngs-methods-description" +description: "Suggested text and references to use when describing pipeline usage within the methods section of a publication." +section_name: "nf-core/fetchngs Methods Description" +section_href: "https://github.com/nf-core/fetchngs" +plot_type: "html" +## TODO nf-core: Update the HTML below to your prefered methods description, e.g. add publication citation for this pipeline +## You inject any metadata in the Nextflow '${workflow}' object +data: | +
+  <h4>Methods</h4>
+  <p>Data was processed using nf-core/fetchngs v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (<a href="https://doi.org/10.1038/s41587-020-0439-x">Ewels <em>et al.</em>, 2020</a>).</p>
+  <p>The pipeline was executed with Nextflow v${workflow.nextflow.version} (<a href="https://doi.org/10.1038/nbt.3820">Di Tommaso <em>et al.</em>, 2017</a>) with the following command:</p>
+  <pre><code>${workflow.commandLine}</code></pre>
+  <h4>References</h4>
+  <ul>
+    <li>Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. <a href="https://doi.org/10.1038/nbt.3820">https://doi.org/10.1038/nbt.3820</a></li>
+    <li>Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. <a href="https://doi.org/10.1038/s41587-020-0439-x">https://doi.org/10.1038/s41587-020-0439-x</a></li>
+  </ul>
+  <div class="alert alert-info">
+    <h5>Notes:</h5>
+    <ul>
+      ${nodoi_text}
+      <li>The command above does not include parameters contained in any configs or profiles that may have been used. Ensure the config file is also uploaded with your publication!</li>
+      <li>You should also cite all software used within this run. Check the "Software Versions" of this report to get version information.</li>
+    </ul>
+  </div>
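The `bin/sra_ids_to_runinfo.py` changes further below all revolve around the ENA portal `filereport` endpoint, which (per the changelog) now also serves `SRR`/`DRR` metadata. For reference, a minimal Python sketch of that lookup in isolation — assuming network access, with the field list abbreviated and the helper name `fetch_ena_runinfo` invented for illustration:

```python
from urllib.parse import urlencode
from urllib.request import urlopen


def fetch_ena_runinfo(accession, fields=("run_accession", "experiment_accession", "fastq_ftp", "fastq_md5")):
    """Fetch run-level metadata for one accession from the ENA portal API."""
    params = {"accession": accession, "result": "read_run", "fields": ",".join(fields)}
    url = f"https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}"
    with urlopen(url) as response:
        # The endpoint answers with a tab-separated table; the first row is the header.
        lines = response.read().decode("utf-8").splitlines()
    header = lines[0].split("\t")
    return [dict(zip(header, line.split("\t"))) for line in lines[1:]]
```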
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index a1144be1..a7d7cf37 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -3,9 +3,11 @@ report_comment: > analysis pipeline. For information about how to interpret these results, please see the documentation. report_section_order: - software_versions: + "nf-core-fetchngs-methods-description": order: -1000 - "nf-core-fetchngs-summary": + software_versions: order: -1001 + "nf-core-fetchngs-summary": + order: -1002 export_plots: true diff --git a/bin/multiqc_mappings_config.py b/bin/multiqc_mappings_config.py index a7fc92ba..2fcdb49a 100755 --- a/bin/multiqc_mappings_config.py +++ b/bin/multiqc_mappings_config.py @@ -3,11 +3,10 @@ import sys with open(sys.argv[1], "r") as fin, open(sys.argv[2], "w") as fout: - header = fin.readline().split(',') + header = fin.readline().split(",") config = "sample_names_rename_buttons:\n" - config += "\n".join([' - ' + x.strip('"') for x in header]) + config += "\n".join([" - " + x.strip('"') for x in header]) config += "sample_names_rename:\n" for line in fin: config += f" - [{', '.join(line.strip().split(','))}]\n" fout.write(config) - diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 21c7225d..70627791 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -49,9 +49,7 @@ ) GEO_IDS = ("GSE18729", "GSM465244") ID_REGEX = re.compile(r"^([A-Z]+)([0-9]+)$") -PREFIX_LIST = sorted( - {ID_REGEX.match(id).group(1) for id in SRA_IDS + ENA_IDS + DDBJ_IDS + GEO_IDS} -) +PREFIX_LIST = sorted({ID_REGEX.match(id).group(1) for id in SRA_IDS + ENA_IDS + DDBJ_IDS + GEO_IDS}) # List of metadata fields fetched from the ENA API - can be overriden by options @@ -193,25 +191,18 @@ def is_valid(cls, identifier): class DatabaseResolver: """Define a service class for resolving various identifiers to experiments.""" - _GEO_PREFIXES = { - "GSE", - "GSM" - } + _GEO_PREFIXES = {"GSE", "GSM"} _SRA_PREFIXES = { "PRJNA", "SAMN", - "SRR", "DRA", "DRP", - "DRR", "DRS", "DRX", "PRJDB", "SAMD", } - _ENA_PREFIXES = { - "ERR" - } + _ENA_PREFIXES = {"ERR", "SRR", "DRR"} @classmethod def expand_identifier(cls, identifier): @@ -240,24 +231,14 @@ def expand_identifier(cls, identifier): def _content_check(cls, response, identifier): """Check that the response has content or terminate.""" if response.status == 204: - logger.error( - f"There is no content for id {identifier}. Maybe you lack the right " - f"permissions?" - ) + logger.error(f"There is no content for id {identifier}. 
Maybe you lack the right " f"permissions?") sys.exit(1) @classmethod def _id_to_srx(cls, identifier): """Resolve the identifier to SRA experiments.""" - params = { - "id": identifier, - "db": "sra", - "rettype": "runinfo", - "retmode": "text" - } - response = fetch_url( - f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{urlencode(params)}" - ) + params = {"id": identifier, "db": "sra", "rettype": "runinfo", "retmode": "text"} + response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{urlencode(params)}") cls._content_check(response, identifier) return [row["Experiment"] for row in open_table(response, delimiter=",")] @@ -265,20 +246,13 @@ def _id_to_srx(cls, identifier): def _gse_to_srx(cls, identifier): """Resolve the identifier to SRA experiments.""" ids = [] - params = { - "id": identifier, - "db": "gds", - "rettype": "runinfo", - "retmode": "text" - } - response = fetch_url( - f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{urlencode(params)}" - ) + params = {"id": identifier, "db": "gds", "rettype": "runinfo", "retmode": "text"} + response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{urlencode(params)}") cls._content_check(response, identifier) gsm_ids = [ line.split("=")[1].strip() for line in response.text().splitlines() - if line.split('=')[1].strip().startswith('GSM') + if line.split("=")[1].strip().startswith("GSM") ] for gsm_id in gsm_ids: ids += cls._id_to_srx(gsm_id) @@ -293,13 +267,9 @@ def _id_to_erx(cls, identifier): "result": "read_run", "fields": ",".join(fields), } - response = fetch_url( - f"https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}" - ) + response = fetch_url(f"https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}") cls._content_check(response, identifier) - return [ - row["experiment_accession"] for row in open_table(response, delimiter="\t") - ] + return [row["experiment_accession"] for row in open_table(response, delimiter="\t")] class ENAMetadataFetcher: @@ -328,9 +298,7 @@ def open_experiment_table(self, accession): """ params = {**self._params, "accession": accession} - response = fetch_url( - f"https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}" - ) + response = fetch_url(f"https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}") self._content_check(response, accession) return open_table(response, delimiter="\t") @@ -338,10 +306,7 @@ def open_experiment_table(self, accession): def _content_check(cls, response, identifier): """Check that the response has content or terminate.""" if response.status == 204: - logger.error( - f"There is no content for id {identifier}. Maybe you lack the right " - f"permissions?" - ) + logger.error(f"There is no content for id {identifier}. 
Maybe you lack the right " f"permissions?") sys.exit(1) @@ -362,8 +327,7 @@ def open_table(response, delimiter=","): def parse_args(args=None): parser = argparse.ArgumentParser( - description="Download and create a run information metadata file from SRA / " - "ENA / DDBJ / GEO identifiers.", + description="Download and create a run information metadata file from SRA / " "ENA / DDBJ / GEO identifiers.", epilog="Example usage: python fetch_sra_runinfo.py ", ) parser.add_argument( @@ -383,8 +347,7 @@ def parse_args(args=None): "--ena_metadata_fields", type=str, default=",".join(ENA_METADATA_FIELDS), - help=f"Comma-separated list of ENA metadata fields to fetch " - f"(default: {','.join(ENA_METADATA_FIELDS)}).", + help=f"Comma-separated list of ENA metadata fields to fetch " f"(default: {','.join(ENA_METADATA_FIELDS)}).", ) parser.add_argument( "-l", @@ -431,9 +394,7 @@ def get_ena_fields(): return [ row["columnId"] for row in open_table( - fetch_url( - f"https://www.ebi.ac.uk/ena/portal/api/returnFields?{urlencode(params)}" - ), + fetch_url(f"https://www.ebi.ac.uk/ena/portal/api/returnFields?{urlencode(params)}"), delimiter="\t", ) ] @@ -453,16 +414,11 @@ def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields): seen_ids.add(db_id) if not DatabaseIdentifierChecker.is_valid(db_id): id_str = ", ".join([x + "*" for x in PREFIX_LIST]) - logger.error( - f"Please provide a valid database id starting with {id_str}!\n" - f"Line: '{line.strip()}'" - ) + logger.error(f"Please provide a valid database id starting with {id_str}!\n" f"Line: '{line.strip()}'") sys.exit(1) ids = DatabaseResolver.expand_identifier(db_id) if not ids: - logger.error( - f"No matches found for database id {db_id}!\nLine: '{line.strip()}'" - ) + logger.error(f"No matches found for database id {db_id}!\nLine: '{line.strip()}'") sys.exit(1) for accession in ids: for row in ena_fetcher.open_experiment_table(accession): diff --git a/bin/sra_runinfo_to_ftp.py b/bin/sra_runinfo_to_ftp.py index d9400b2f..ef80ec80 100755 --- a/bin/sra_runinfo_to_ftp.py +++ b/bin/sra_runinfo_to_ftp.py @@ -62,10 +62,7 @@ def parse_sra_runinfo(file_in): reader = csv.DictReader(fin, delimiter="\t", skipinitialspace=True) header = list(reader.fieldnames) if missing := frozenset(columns).difference(frozenset(header)): - logger.critical( - f"The following expected columns are missing from {file_in}: " - f"{', '.join(missing)}." - ) + logger.critical(f"The following expected columns are missing from {file_in}: " f"{', '.join(missing)}.") sys.exit(1) for row in reader: db_id = row["experiment_accession"] @@ -73,14 +70,9 @@ def parse_sra_runinfo(file_in): fq_files = row["fastq_ftp"].split(";")[-2:] fq_md5 = row["fastq_md5"].split(";")[-2:] if len(fq_files) == 1: - assert fq_files[0].endswith( - ".fastq.gz" - ), f"Unexpected FastQ file format {file_in.name}." + assert fq_files[0].endswith(".fastq.gz"), f"Unexpected FastQ file format {file_in.name}." if row["library_layout"] != "SINGLE": - logger.warning( - f"The library layout '{row['library_layout']}' should be " - f"'SINGLE'." - ) + logger.warning(f"The library layout '{row['library_layout']}' should be " f"'SINGLE'.") sample = { "fastq_1": fq_files[0], "fastq_2": None, @@ -89,17 +81,10 @@ def parse_sra_runinfo(file_in): "single_end": "true", } elif len(fq_files) == 2: - assert fq_files[0].endswith( - "_1.fastq.gz" - ), f"Unexpected FastQ file format {file_in.name}." - assert fq_files[1].endswith( - "_2.fastq.gz" - ), f"Unexpected FastQ file format {file_in.name}." 
+ assert fq_files[0].endswith("_1.fastq.gz"), f"Unexpected FastQ file format {file_in.name}." + assert fq_files[1].endswith("_2.fastq.gz"), f"Unexpected FastQ file format {file_in.name}." if row["library_layout"] != "PAIRED": - logger.warning( - f"The library layout '{row['library_layout']}' should be " - f"'PAIRED'." - ) + logger.warning(f"The library layout '{row['library_layout']}' should be " f"'PAIRED'.") sample = { "fastq_1": fq_files[0], "fastq_2": fq_files[1], @@ -124,8 +109,7 @@ def parse_sra_runinfo(file_in): else: if sample in runinfo[db_id]: logger.error( - f"Input run info file contains duplicate rows!\n" - f"{', '.join([row[col] for col in header])}" + f"Input run info file contains duplicate rows!\n" f"{', '.join([row[col] for col in header])}" ) else: runinfo[db_id].append(sample) @@ -146,9 +130,7 @@ def sra_runinfo_to_ftp(files_in, file_out): logger.warning(f"Duplicate sample identifier found!\nID: '{db_id}'") # Create a combined header from all input files. - combined_header = header[0] + list( - set().union(chain.from_iterable(header)).difference(header[0]) - ) + combined_header = header[0] + list(set().union(chain.from_iterable(header)).difference(header[0])) combined_header.insert(0, "id") # Write samplesheet with paths to FastQ files and md5 sums. @@ -159,7 +141,7 @@ def sra_runinfo_to_ftp(files_in, file_out): for db_id in sorted(samplesheet): for idx, row in enumerate(samplesheet[db_id], start=1): row["id"] = f"{db_id}" - if 'run_accession' in row: + if "run_accession" in row: row["id"] = f"{db_id}_{row['run_accession']}" writer.writerow(row) diff --git a/conf/base.config b/conf/base.config index 3d3db8d3..4382da20 100644 --- a/conf/base.config +++ b/conf/base.config @@ -19,6 +19,16 @@ process { maxErrors = '-1' // Process-specific resource requirements + // NOTE - Please try and re-use the labels below as much as possible. + // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. + // If possible, it would be nice to keep the same label naming convention when + // adding in your local modules too. 
+ // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors + withLabel:process_single { + cpus = { check_max( 1 , 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } withLabel:process_low { cpus = { check_max( 2 * task.attempt, 'cpus' ) } memory = { check_max( 12.GB * task.attempt, 'memory' ) } diff --git a/conf/modules.config b/conf/modules.config index 82a5f75c..c42f4d60 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -78,16 +78,9 @@ if (params.input_type == 'sra') { withName: SRATOOLS_FASTERQDUMP { publishDir = [ - [ - path: { "${params.outdir}/fastq" }, - mode: params.publish_dir_mode, - pattern: "*.fastq.gz" - ], - [ - path: { "${params.outdir}/fastq/md5" }, - mode: params.publish_dir_mode, - pattern: "*.md5" - ] + path: { "${params.outdir}/fastq" }, + mode: params.publish_dir_mode, + pattern: "*.fastq.gz" ] } diff --git a/conf/test.config b/conf/test.config index 39a4ffdb..3c58e9cf 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,5 +20,5 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.txt' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.csv' } diff --git a/conf/test_full.config b/conf/test_full.config index 0887326a..2f0303ea 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -15,5 +15,5 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.txt' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.csv' } diff --git a/conf/test_synapse.config b/conf/test_synapse.config index b49f2463..1ac1388a 100644 --- a/conf/test_synapse.config +++ b/conf/test_synapse.config @@ -20,6 +20,6 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/synapse_ids_test.txt' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/synapse_ids_test.csv' input_type = 'synapse' } diff --git a/docs/usage.md b/docs/usage.md index ba4de1b2..989695e4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -32,7 +32,7 @@ If you have a GEO accession (found in the data availability section of published - Click `SRA Run Selector` at the bottom of the GEO accession page - Select the desired samples in the `SRA Run Selector` and then download the `Accession List` -This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline e.g. `--input SRR_Acc_List.txt`. +This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline once renamed with a .csv extension e.g. `--input SRR_Acc_List.csv`. ### Synapse ids @@ -72,16 +72,16 @@ If FTP connections are blocked on your network use the [`--force_sratools_downlo The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/fetchngs --input ids.txt --outdir -profile docker +nextflow run nf-core/fetchngs --input ids.csv --outdir -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. 
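Since `--input` now accepts `.csv`, `.tsv` or `.txt` id lists, a quick pre-flight check can catch malformed accessions before launching a run. A minimal sketch reusing the accession pattern from `bin/sra_ids_to_runinfo.py` — `check_id_file` is an invented helper, not part of the pipeline:

```python
import re

# Same accession pattern used by bin/sra_ids_to_runinfo.py: a prefix of capital
# letters followed by digits, e.g. SRR12345678, ERR1160846 or GSE214215.
ID_REGEX = re.compile(r"^([A-Z]+)([0-9]+)$")


def check_id_file(path):
    """Raise if any non-empty line does not look like a database accession."""
    with open(path) as handle:
        for line_number, line in enumerate(handle, start=1):
            identifier = line.strip().strip('"')
            if identifier and not ID_REGEX.match(identifier):
                raise ValueError(f"Line {line_number}: '{identifier}' is not a valid accession")
```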
Note that the pipeline will create the following files in your working directory: -```console +```bash work # Directory containing the nextflow working files - # Finished results in specified location (defined with --outdir) + # Finished results in specified location (defined with --outdir) .nextflow_log # Log file from Nextflow # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` @@ -90,7 +90,7 @@ work # Directory containing the nextflow working files When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: -```console +```bash nextflow pull nf-core/fetchngs ``` @@ -252,6 +252,14 @@ See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack) on the [`#configs` channel](https://nfcore.slack.com/channels/configs). +## Azure Resource Requests + +To be used with the `azurebatch` profile by specifying the `-profile azurebatch`. +We recommend providing a compute `params.vm_type` of `Standard_D16_v3` VMs by default but these options can be changed if required. + +Note that the choice of VM size depends on your quota and the overall workload during the analysis. +For a thorough list, please refer the [Azure Sizes for virtual machines in Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes). + ## Running in the background Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished. @@ -266,6 +274,6 @@ Some HPC setups also allow you to run nextflow within a cluster job submitted yo In some cases, the Nextflow Java virtual machines can start to request a large amount of memory. 
We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~./bash_profile`): -```console +```bash NXF_OPTS='-Xms1g -Xmx4g' ``` diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 4cc2509e..cfd8608a 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -123,6 +123,61 @@ class NfcoreTemplate { output_tf.withWriter { w -> w << email_txt } } + // + // Construct and send adaptive card + // https://adaptivecards.io + // + public static void adaptivecard(workflow, params, summary_params, projectDir, log) { + def hook_url = params.hook_url + + def summary = [:] + for (group in summary_params.keySet()) { + summary << summary_params[group] + } + + def misc_fields = [:] + misc_fields['start'] = workflow.start + misc_fields['complete'] = workflow.complete + misc_fields['scriptfile'] = workflow.scriptFile + misc_fields['scriptid'] = workflow.scriptId + if (workflow.repository) misc_fields['repository'] = workflow.repository + if (workflow.commitId) misc_fields['commitid'] = workflow.commitId + if (workflow.revision) misc_fields['revision'] = workflow.revision + misc_fields['nxf_version'] = workflow.nextflow.version + misc_fields['nxf_build'] = workflow.nextflow.build + misc_fields['nxf_timestamp'] = workflow.nextflow.timestamp + + def msg_fields = [:] + msg_fields['version'] = workflow.manifest.version + msg_fields['runName'] = workflow.runName + msg_fields['success'] = workflow.success + msg_fields['dateComplete'] = workflow.complete + msg_fields['duration'] = workflow.duration + msg_fields['exitStatus'] = workflow.exitStatus + msg_fields['errorMessage'] = (workflow.errorMessage ?: 'None') + msg_fields['errorReport'] = (workflow.errorReport ?: 'None') + msg_fields['commandLine'] = workflow.commandLine + msg_fields['projectDir'] = workflow.projectDir + msg_fields['summary'] = summary << misc_fields + + // Render the JSON template + def engine = new groovy.text.GStringTemplateEngine() + def hf = new File("$projectDir/assets/adaptivecard.json") + def json_template = engine.createTemplate(hf).make(msg_fields) + def json_message = json_template.toString() + + // POST + def post = new URL(hook_url).openConnection(); + post.setRequestMethod("POST") + post.setDoOutput(true) + post.setRequestProperty("Content-Type", "application/json") + post.getOutputStream().write(json_message.getBytes("UTF-8")); + def postRC = post.getResponseCode(); + if (! postRC.equals(200)) { + log.warn(post.getErrorStream().getText()); + } + } + // // Print pipeline summary on completion // diff --git a/lib/Utils.groovy b/lib/Utils.groovy old mode 100755 new mode 100644 index 28567bd7..8d030f4e --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -21,19 +21,26 @@ class Utils { } // Check that all channels are present - def required_channels = ['conda-forge', 'bioconda', 'defaults'] - def conda_check_failed = !required_channels.every { ch -> ch in channels } + // This channel list is ordered by required channel priority. 
+ def required_channels_in_order = ['conda-forge', 'bioconda', 'defaults'] + def channels_missing = ((required_channels_in_order as Set) - (channels as Set)) as Boolean // Check that they are in the right order - conda_check_failed |= !(channels.indexOf('conda-forge') < channels.indexOf('bioconda')) - conda_check_failed |= !(channels.indexOf('bioconda') < channels.indexOf('defaults')) + def channel_priority_violation = false + def n = required_channels_in_order.size() + for (int i = 0; i < n - 1; i++) { + channel_priority_violation |= !(channels.indexOf(required_channels_in_order[i]) < channels.indexOf(required_channels_in_order[i+1])) + } - if (conda_check_failed) { + if (channels_missing | channel_priority_violation) { log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " There is a problem with your Conda configuration!\n\n" + " You will need to set-up the conda-forge and bioconda channels correctly.\n" + - " Please refer to https://bioconda.github.io/user/install.html#set-up-channels\n" + - " NB: The order of the channels matters!\n" + + " Please refer to https://bioconda.github.io/\n" + + " The observed channel order is \n" + + " ${channels}\n" + + " but the following channel order is required:\n" + + " ${required_channels_in_order}\n" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" } } diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 77b7ffde..2f145895 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -21,7 +21,7 @@ class WorkflowMain { // Print help to screen if required // public static String help(workflow, params, log) { - def command = "nextflow run ${workflow.manifest.name} --input ids.txt -profile docker" + def command = "nextflow run ${workflow.manifest.name} --input ids.csv -profile docker" def help_string = '' help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) help_string += NfcoreSchema.paramsHelp(workflow, params, command) @@ -59,6 +59,7 @@ class WorkflowMain { } // Print parameter summary log to screen + log.info paramsSummaryLog(workflow, params, log) // Check that a -profile or Nextflow config has been provided to run the pipeline @@ -74,7 +75,7 @@ class WorkflowMain { // Check input has been provided if (!params.input) { - log.error "Please provide an input file containing ids to the pipeline - one per line e.g. '--input ids.txt'" + log.error "Please provide an input file containing ids to the pipeline - one per line e.g. 
'--input ids.csv'" System.exit(1) } diff --git a/modules.json b/modules.json index fefca2b7..512b3fea 100644 --- a/modules.json +++ b/modules.json @@ -2,18 +2,34 @@ "name": "nf-core/fetchngs", "homePage": "https://github.com/nf-core/fetchngs", "repos": { - "nf-core/modules": { - "custom/dumpsoftwareversions": { - "git_sha": "e745e167c1020928ef20ea1397b6b4d230681b4d" + "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "8022c68e7403eecbd8ba9c49496f69f8c49d50f0" + }, + "custom/sratoolsncbisettings": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, + "sratools/fasterqdump": { + "branch": "master", + "git_sha": "03711bcb7fa2a7088eb54abb1fca326d30e602c2" + }, + "sratools/prefetch": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + } + } }, - "custom/sratoolsncbisettings": { - "git_sha": "b2dbaa99309a2057efc32ef9d029ed91140068df" - }, - "sratools/fasterqdump": { - "git_sha": "0cdf7767a79faf424645beeff83ecfa5528b6a7c" - }, - "sratools/prefetch": { - "git_sha": "1b228835e9525990db99243cb4f0d07aa6e01bc3" + "subworkflows": { + "nf-core": { + "fastq_download_prefetch_fasterqdump_sratools": { + "branch": "master", + "git_sha": "03711bcb7fa2a7088eb54abb1fca326d30e602c2" + } + } } } } diff --git a/modules/local/sra_to_samplesheet.nf b/modules/local/sra_to_samplesheet.nf index 4b448e89..6dce588b 100644 --- a/modules/local/sra_to_samplesheet.nf +++ b/modules/local/sra_to_samplesheet.nf @@ -6,9 +6,9 @@ process SRA_TO_SAMPLESHEET { memory 100.MB input: - tuple val(meta), path(fastq) - val pipeline - val mapping_fields + val meta + val pipeline + val mapping_fields output: tuple val(meta), path("*samplesheet.csv"), emit: samplesheet @@ -20,19 +20,19 @@ process SRA_TO_SAMPLESHEET { // // Remove custom keys needed to download the data - def meta_map = meta.clone() - meta_map.remove("id") - meta_map.remove("fastq_1") - meta_map.remove("fastq_2") - meta_map.remove("md5_1") - meta_map.remove("md5_2") - meta_map.remove("single_end") + def meta_clone = meta.clone() + meta_clone.remove("id") + meta_clone.remove("fastq_1") + meta_clone.remove("fastq_2") + meta_clone.remove("md5_1") + meta_clone.remove("md5_2") + meta_clone.remove("single_end") // Add relevant fields to the beginning of the map pipeline_map = [ sample : "${meta.id.split('_')[0..-2].join('_')}", - fastq_1 : "${params.outdir}/fastq/${fastq[0]}", - fastq_2 : meta.single_end ? 
'' : "${params.outdir}/fastq/${fastq[1]}" + fastq_1 : meta.fastq_1, + fastq_2 : meta.fastq_2 ] // Add nf-core pipeline specific entries @@ -43,7 +43,7 @@ process SRA_TO_SAMPLESHEET { pipeline_map << [ fasta: '' ] } } - pipeline_map << meta_map + pipeline_map << meta_clone // Create a samplesheet samplesheet = pipeline_map.keySet().collect{ '"' + it + '"'}.join(",") + '\n' diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf similarity index 79% rename from modules/nf-core/modules/custom/dumpsoftwareversions/main.nf rename to modules/nf-core/custom/dumpsoftwareversions/main.nf index 327d5100..cebb6e05 100644 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -1,11 +1,11 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { - label 'process_low' + label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda (params.enable_conda ? "bioconda::multiqc=1.11" : null) + conda (params.enable_conda ? 'bioconda::multiqc=1.13' : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.11--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.11--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml similarity index 100% rename from modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml rename to modules/nf-core/custom/dumpsoftwareversions/meta.yml diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py new file mode 100755 index 00000000..da033408 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python + + +"""Provide functions to merge multiple versions.yml files.""" + + +import yaml +import platform +from textwrap import dedent + + +def _make_versions_html(versions): + """Generate a tabular HTML output of all versions for MultiQC.""" + html = [ + dedent( + """\\ + + + + + + + + + + """ + ) + ] + for process, tmp_versions in sorted(versions.items()): + html.append("") + for i, (tool, version) in enumerate(sorted(tmp_versions.items())): + html.append( + dedent( + f"""\\ + + + + + + """ + ) + ) + html.append("") + html.append("
<th> Process Name </th> <th> Software </th> <th> Version </th>
<td><samp>{process if (i == 0) else ''}</samp></td> <td><samp>{tool}</samp></td> <td><samp>{version}</samp></td>
") + return "\\n".join(html) + + +def main(): + """Load all version files and generate merged output.""" + versions_this_module = {} + versions_this_module["${task.process}"] = { + "python": platform.python_version(), + "yaml": yaml.__version__, + } + + with open("$versions") as f: + versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module + + # aggregate versions by the module name (derived from fully-qualified process name) + versions_by_module = {} + for process, process_versions in versions_by_process.items(): + module = process.split(":")[-1] + try: + if versions_by_module[module] != process_versions: + raise AssertionError( + "We assume that software versions are the same between all modules. " + "If you see this error-message it means you discovered an edge-case " + "and should open an issue in nf-core/tools. " + ) + except KeyError: + versions_by_module[module] = process_versions + + versions_by_module["Workflow"] = { + "Nextflow": "$workflow.nextflow.version", + "$workflow.manifest.name": "$workflow.manifest.version", + } + + versions_mqc = { + "id": "software_versions", + "section_name": "${workflow.manifest.name} Software Versions", + "section_href": "https://github.com/${workflow.manifest.name}", + "plot_type": "html", + "description": "are collected at run time from the software output.", + "data": _make_versions_html(versions_by_module), + } + + with open("software_versions.yml", "w") as f: + yaml.dump(versions_by_module, f, default_flow_style=False) + with open("software_versions_mqc.yml", "w") as f: + yaml.dump(versions_mqc, f, default_flow_style=False) + + with open("versions.yml", "w") as f: + yaml.dump(versions_this_module, f, default_flow_style=False) + + +if __name__ == "__main__": + main() diff --git a/modules/nf-core/modules/custom/sratoolsncbisettings/main.nf b/modules/nf-core/custom/sratoolsncbisettings/main.nf similarity index 100% rename from modules/nf-core/modules/custom/sratoolsncbisettings/main.nf rename to modules/nf-core/custom/sratoolsncbisettings/main.nf diff --git a/modules/nf-core/modules/custom/sratoolsncbisettings/meta.yml b/modules/nf-core/custom/sratoolsncbisettings/meta.yml similarity index 100% rename from modules/nf-core/modules/custom/sratoolsncbisettings/meta.yml rename to modules/nf-core/custom/sratoolsncbisettings/meta.yml diff --git a/modules/nf-core/modules/custom/sratoolsncbisettings/templates/detect_ncbi_settings.sh b/modules/nf-core/custom/sratoolsncbisettings/templates/detect_ncbi_settings.sh similarity index 100% rename from modules/nf-core/modules/custom/sratoolsncbisettings/templates/detect_ncbi_settings.sh rename to modules/nf-core/custom/sratoolsncbisettings/templates/detect_ncbi_settings.sh diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py deleted file mode 100644 index d1390392..00000000 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python - -import yaml -import platform -from textwrap import dedent - - -def _make_versions_html(versions): - html = [ - dedent( - """\\ - - - - - - - - - - """ - ) - ] - for process, tmp_versions in sorted(versions.items()): - html.append("") - for i, (tool, version) in enumerate(sorted(tmp_versions.items())): - html.append( - dedent( - f"""\\ - - - - - - """ - ) - ) - html.append("") - html.append("
<th> Process Name </th> <th> Software </th> <th> Version </th>
<td><samp>{process if (i == 0) else ''}</samp></td> <td><samp>{tool}</samp></td> <td><samp>{version}</samp></td>
") - return "\\n".join(html) - - -versions_this_module = {} -versions_this_module["${task.process}"] = { - "python": platform.python_version(), - "yaml": yaml.__version__, -} - -with open("$versions") as f: - versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module - -# aggregate versions by the module name (derived from fully-qualified process name) -versions_by_module = {} -for process, process_versions in versions_by_process.items(): - module = process.split(":")[-1] - try: - assert versions_by_module[module] == process_versions, ( - "We assume that software versions are the same between all modules. " - "If you see this error-message it means you discovered an edge-case " - "and should open an issue in nf-core/tools. " - ) - except KeyError: - versions_by_module[module] = process_versions - -versions_by_module["Workflow"] = { - "Nextflow": "$workflow.nextflow.version", - "$workflow.manifest.name": "$workflow.manifest.version", -} - -versions_mqc = { - "id": "software_versions", - "section_name": "${workflow.manifest.name} Software Versions", - "section_href": "https://github.com/${workflow.manifest.name}", - "plot_type": "html", - "description": "are collected at run time from the software output.", - "data": _make_versions_html(versions_by_module), -} - -with open("software_versions.yml", "w") as f: - yaml.dump(versions_by_module, f, default_flow_style=False) -with open("software_versions_mqc.yml", "w") as f: - yaml.dump(versions_mqc, f, default_flow_style=False) - -with open("versions.yml", "w") as f: - yaml.dump(versions_this_module, f, default_flow_style=False) diff --git a/modules/nf-core/modules/sratools/fasterqdump/main.nf b/modules/nf-core/sratools/fasterqdump/main.nf similarity index 70% rename from modules/nf-core/modules/sratools/fasterqdump/main.nf rename to modules/nf-core/sratools/fasterqdump/main.nf index 18f46e51..1ffb21f0 100644 --- a/modules/nf-core/modules/sratools/fasterqdump/main.nf +++ b/modules/nf-core/sratools/fasterqdump/main.nf @@ -12,8 +12,8 @@ process SRATOOLS_FASTERQDUMP { path ncbi_settings output: - tuple val(meta), path(output), emit: reads - path "versions.yml" , emit: versions + tuple val(meta), path(fastq), emit: reads + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -21,16 +21,20 @@ process SRATOOLS_FASTERQDUMP { script: def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: '' - // Paired-end data extracted by fasterq-dump (--split-3 the default) always creates - // *_1.fastq *_2.fastq files but sometimes also an additional *.fastq file - // for unpaired reads which we ignore here. - output = meta.single_end ? '*.fastq.gz' : '*_{1,2}.fastq.gz' + def prefix = task.ext.prefix ?: "${meta.id}" + + // WARNING: Paired-end data extracted by fasterq-dump (--split-3 the default) + // always creates *_1.fastq *_2.fastq files but sometimes also + // an additional *.fastq file for unpaired reads which we ignore here. + fastq = meta.single_end ? '*.fastq.gz' : '*_{1,2}.fastq.gz' + def outfile = meta.single_end ? 
"${prefix}.fastq" : prefix """ export NCBI_SETTINGS="\$PWD/${ncbi_settings}" fasterq-dump \\ $args \\ --threads $task.cpus \\ + --outfile $outfile \\ ${sra.name} pigz \\ diff --git a/modules/nf-core/modules/sratools/fasterqdump/meta.yml b/modules/nf-core/sratools/fasterqdump/meta.yml similarity index 100% rename from modules/nf-core/modules/sratools/fasterqdump/meta.yml rename to modules/nf-core/sratools/fasterqdump/meta.yml diff --git a/modules/nf-core/modules/sratools/prefetch/main.nf b/modules/nf-core/sratools/prefetch/main.nf similarity index 100% rename from modules/nf-core/modules/sratools/prefetch/main.nf rename to modules/nf-core/sratools/prefetch/main.nf diff --git a/modules/nf-core/modules/sratools/prefetch/meta.yml b/modules/nf-core/sratools/prefetch/meta.yml similarity index 100% rename from modules/nf-core/modules/sratools/prefetch/meta.yml rename to modules/nf-core/sratools/prefetch/meta.yml diff --git a/modules/nf-core/modules/sratools/prefetch/templates/retry_with_backoff.sh b/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh old mode 100644 new mode 100755 similarity index 100% rename from modules/nf-core/modules/sratools/prefetch/templates/retry_with_backoff.sh rename to modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh diff --git a/nextflow.config b/nextflow.config index 80ed6507..633a0fb2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -27,6 +27,7 @@ params { email_on_fail = null plaintext_email = false monochrome_logs = false + hook_url = null help = false validate_params = true show_hidden_params = false @@ -71,6 +72,16 @@ profiles { debug { process.beforeScript = 'echo $HOSTNAME' } conda { params.enable_conda = true + conda.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + mamba { + params.enable_conda = true + conda.useMamba = true docker.enabled = false singularity.enabled = false podman.enabled = false @@ -114,6 +125,11 @@ profiles { podman.enabled = false shifter.enabled = false } + gitpod { + executor.name = 'local' + executor.cpus = 16 + executor.memory = 60.GB + } test { includeConfig 'conf/test.config' } test_synapse { includeConfig 'conf/test_synapse.config' } test_full { includeConfig 'conf/test_full.config' } @@ -158,7 +174,8 @@ manifest { description = 'Pipeline to fetch metadata and raw FastQ files from public databases' mainScript = 'main.nf' nextflowVersion = '!>=21.10.3' - version = '1.7' + version = '1.8' + doi = 'https://doi.org/10.5281/zenodo.5070524' } // Load modules.config for DSL2 module specific options diff --git a/nextflow_schema.json b/nextflow_schema.json index a51dc45a..435079b3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -15,8 +15,8 @@ "input": { "type": "string", "format": "file-path", - "mimetype": "text/plain", - "pattern": "^\\S+\\.txt$", + "mimetype": "text/csv", + "pattern": "^\\S+\\.(csv|tsv|txt)$", "schema": "assets/schema_input.json", "fa_icon": "fas fa-file-excel", "description": "File containing SRA/ENA/DDBJ identifiers one per line to download their associated metadata and FastQ files." @@ -203,6 +203,13 @@ "fa_icon": "fas fa-palette", "hidden": true }, + "hook_url": { + "type": "string", + "description": "Incoming hook URL for messaging service", + "fa_icon": "fas fa-people-group", + "help_text": "Incoming hook URL for messaging service. 
Currently, only MS Teams is supported.", + "hidden": true + }, "tracedir": { "type": "string", "description": "Directory to keep pipeline Nextflow logs and reports.", diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..0d62beb6 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,10 @@ +# Config file for Python. Mostly used to configure linting of bin/check_samplesheet.py with Black. +# Should be kept the same as nf-core/tools to avoid fighting with template synchronisation. +[tool.black] +line-length = 120 +target_version = ["py37", "py38", "py39", "py310"] + +[tool.isort] +profile = "black" +known_first_party = ["nf_core"] +multi_line_output = 3 diff --git a/subworkflows/nf-core/srafastq/main.nf b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf similarity index 60% rename from subworkflows/nf-core/srafastq/main.nf rename to subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf index f57b6fac..1e1d0d7b 100644 --- a/subworkflows/nf-core/srafastq/main.nf +++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf @@ -1,13 +1,13 @@ -include { CUSTOM_SRATOOLSNCBISETTINGS } from '../../../modules/nf-core/modules/custom/sratoolsncbisettings/main' -include { SRATOOLS_PREFETCH } from '../../../modules/nf-core/modules/sratools/prefetch/main' -include { SRATOOLS_FASTERQDUMP } from '../../../modules/nf-core/modules/sratools/fasterqdump/main' +include { CUSTOM_SRATOOLSNCBISETTINGS } from '../../../modules/nf-core/custom/sratoolsncbisettings/main' +include { SRATOOLS_PREFETCH } from '../../../modules/nf-core/sratools/prefetch/main' +include { SRATOOLS_FASTERQDUMP } from '../../../modules/nf-core/sratools/fasterqdump/main' -/** - * Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA). - */ -workflow SRAFASTQ { +// +// Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA). +// +workflow FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS { take: - sra_ids // channel: [ val(meta), val(id) ] + ch_sra_ids // channel: [ val(meta), val(id) ] main: @@ -18,19 +18,19 @@ workflow SRAFASTQ { // CUSTOM_SRATOOLSNCBISETTINGS() def settings = CUSTOM_SRATOOLSNCBISETTINGS.out.ncbi_settings // value channel: path(settings) - ch_versions = ch_versions.mix( CUSTOM_SRATOOLSNCBISETTINGS.out.versions ) + ch_versions = ch_versions.mix(CUSTOM_SRATOOLSNCBISETTINGS.out.versions) // // Prefetch sequencing reads in SRA format. // - SRATOOLS_PREFETCH ( sra_ids, settings ) - ch_versions = ch_versions.mix( SRATOOLS_PREFETCH.out.versions.first() ) + SRATOOLS_PREFETCH ( ch_sra_ids, settings ) + ch_versions = ch_versions.mix(SRATOOLS_PREFETCH.out.versions.first()) // // Convert the SRA format into one or more compressed FASTQ files. 
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..0d62beb6
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,10 @@
+# Config file for Python. Mostly used to configure linting of bin/check_samplesheet.py with Black.
+# Should be kept the same as nf-core/tools to avoid fighting with template synchronisation.
+[tool.black]
+line-length = 120
+target_version = ["py37", "py38", "py39", "py310"]
+
+[tool.isort]
+profile = "black"
+known_first_party = ["nf_core"]
+multi_line_output = 3
diff --git a/subworkflows/nf-core/srafastq/main.nf b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf
similarity index 60%
rename from subworkflows/nf-core/srafastq/main.nf
rename to subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf
index f57b6fac..1e1d0d7b 100644
--- a/subworkflows/nf-core/srafastq/main.nf
+++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf
@@ -1,13 +1,13 @@
-include { CUSTOM_SRATOOLSNCBISETTINGS } from '../../../modules/nf-core/modules/custom/sratoolsncbisettings/main'
-include { SRATOOLS_PREFETCH } from '../../../modules/nf-core/modules/sratools/prefetch/main'
-include { SRATOOLS_FASTERQDUMP } from '../../../modules/nf-core/modules/sratools/fasterqdump/main'
+include { CUSTOM_SRATOOLSNCBISETTINGS } from '../../../modules/nf-core/custom/sratoolsncbisettings/main'
+include { SRATOOLS_PREFETCH } from '../../../modules/nf-core/sratools/prefetch/main'
+include { SRATOOLS_FASTERQDUMP } from '../../../modules/nf-core/sratools/fasterqdump/main'

-/**
- * Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA).
- */
-workflow SRAFASTQ {
+//
+// Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA).
+//
+workflow FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS {
     take:
-    sra_ids // channel: [ val(meta), val(id) ]
+    ch_sra_ids // channel: [ val(meta), val(id) ]

     main:

@@ -18,19 +18,19 @@ workflow SRAFASTQ {
     //
     CUSTOM_SRATOOLSNCBISETTINGS()
     def settings = CUSTOM_SRATOOLSNCBISETTINGS.out.ncbi_settings // value channel: path(settings)
-    ch_versions = ch_versions.mix( CUSTOM_SRATOOLSNCBISETTINGS.out.versions )
+    ch_versions = ch_versions.mix(CUSTOM_SRATOOLSNCBISETTINGS.out.versions)

     //
     // Prefetch sequencing reads in SRA format.
     //
-    SRATOOLS_PREFETCH ( sra_ids, settings )
-    ch_versions = ch_versions.mix( SRATOOLS_PREFETCH.out.versions.first() )
+    SRATOOLS_PREFETCH ( ch_sra_ids, settings )
+    ch_versions = ch_versions.mix(SRATOOLS_PREFETCH.out.versions.first())

     //
     // Convert the SRA format into one or more compressed FASTQ files.
     //
     SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, settings )
-    ch_versions = ch_versions.mix( SRATOOLS_FASTERQDUMP.out.versions.first() )
+    ch_versions = ch_versions.mix(SRATOOLS_FASTERQDUMP.out.versions.first())

     emit:
     reads = SRATOOLS_FASTERQDUMP.out.reads // channel: [ val(meta), [ reads ] ]
diff --git a/subworkflows/nf-core/srafastq/meta.yml b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml
similarity index 92%
rename from subworkflows/nf-core/srafastq/meta.yml
rename to subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml
index 873ccaca..c385ca21 100644
--- a/subworkflows/nf-core/srafastq/meta.yml
+++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml
@@ -1,10 +1,10 @@
-name: sra_fastq
+name: fastq_download_prefetch_fasterqdump_sratools
 description: Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA).
 keywords:
   - SRA
   - NCBI
   - sequencing
-  - FASTQ
+  - fastq
   - prefetch
   - fasterq-dump
 modules:
@@ -38,3 +38,4 @@ output:
       pattern: "versions.yml"
 authors:
   - "@Midnighter"
+  - "@drpatelh"
diff --git a/tower.yml b/tower.yml
new file mode 100644
index 00000000..7b7fd106
--- /dev/null
+++ b/tower.yml
@@ -0,0 +1,7 @@
+reports:
+  samplesheet.csv:
+    display: "Auto-created samplesheet with collated metadata and FASTQ paths"
+  id_mappings.csv:
+    display: "File with database identifier mappings that can be used to rename samples"
+  multiqc_config.yml:
+    display: "MultiQC config file for bulk renaming of sample names from database ids"
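The `workflows/sra.nf` changes below wire the renamed subworkflow into the pipeline. For reference, a minimal sketch of the calling convention it expects — the accession and meta map are illustrative, and the include path assumes the script sits at the root of a checkout of this repository:

```groovy
// main_sketch.nf -- illustrative only; run from the repository root so the
// subworkflow path resolves. The SRA accession and meta map are made up.
include { FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS } from './subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main'

workflow {
    // Tuples of [ val(meta), val(id) ], matching the subworkflow's take: block.
    ch_sra_ids = Channel.of(
        [ [ id: 'SRR000001', single_end: false, run_accession: 'SRR000001' ], 'SRR000001' ]
    )
    FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS ( ch_sra_ids )
}
```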
diff --git a/workflows/sra.nf b/workflows/sra.nf
index 0dde4588..9f38e7f8 100644
--- a/workflows/sra.nf
+++ b/workflows/sra.nf
@@ -26,15 +26,15 @@ include { SRA_TO_SAMPLESHEET } from '../modules/local/sra_to_samplesheet'
 include { SRA_MERGE_SAMPLESHEET } from '../modules/local/sra_merge_samplesheet'
 include { MULTIQC_MAPPINGS_CONFIG } from '../modules/local/multiqc_mappings_config'

-include { SRAFASTQ } from '../subworkflows/nf-core/srafastq/main'
-
 /*
 ========================================================================================
     IMPORT NF-CORE MODULES/SUBWORKFLOWS
 ========================================================================================
 */

-include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main'
+include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
+
+include { FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS } from '../subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main'

 /*
 ========================================================================================
@@ -80,19 +80,27 @@ workflow SRA {
         .splitCsv(header:true, sep:'\t')
         .map {
             meta ->
-                meta.single_end = meta.single_end.toBoolean()
-                [ meta, [ meta.fastq_1, meta.fastq_2 ] ]
+                def meta_clone = meta.clone()
+                meta_clone.single_end = meta_clone.single_end.toBoolean()
+                return meta_clone
         }
         .unique()
-        .branch {
-            ftp: it[0].fastq_1 && !params.force_sratools_download
-            sra: !it[0].fastq_1 || params.force_sratools_download
-        }
-        .set { ch_sra_reads }
+        .set { ch_sra_metadata }
     ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first())

     if (!params.skip_fastq_download) {
+        ch_sra_metadata
+            .map {
+                meta ->
+                    [ meta, [ meta.fastq_1, meta.fastq_2 ] ]
+            }
+            .branch {
+                ftp: it[0].fastq_1 && !params.force_sratools_download
+                sra: !it[0].fastq_1 || params.force_sratools_download
+            }
+            .set { ch_sra_reads }
+
         //
         // MODULE: If FTP link is provided in run information then download FastQ directly via FTP and validate with md5sums
         //
@@ -104,38 +112,52 @@ workflow SRA {
         //
         // SUBWORKFLOW: Download sequencing reads without FTP links using sra-tools.
         //
-        SRAFASTQ (
+        FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS (
             ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] }
         )
-        ch_versions = ch_versions.mix(SRAFASTQ.out.versions.first())
+        ch_versions = ch_versions.mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.versions.first())
+
+        SRA_FASTQ_FTP
+            .out
+            .fastq
+            .mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.reads)
+            .map {
+                meta, fastq ->
+                    def reads = meta.single_end ? [ fastq ] : fastq
+                    def meta_clone = meta.clone()
+                    meta_clone.fastq_1 = reads[0] ? "${params.outdir}/fastq/${reads[0].getName()}" : ''
+                    meta_clone.fastq_2 = reads[1] && !meta.single_end ? "${params.outdir}/fastq/${reads[1].getName()}" : ''
+                    return meta_clone
+            }
+            .set { ch_sra_metadata }
+    }

-        //
-        // MODULE: Stage FastQ files downloaded by SRA together and auto-create a samplesheet
-        //
-        SRA_TO_SAMPLESHEET (
-            SRA_FASTQ_FTP.out.fastq.mix(SRAFASTQ.out.reads),
-            params.nf_core_pipeline ?: '',
-            params.sample_mapping_fields
-        )
+    //
+    // MODULE: Stage FastQ files downloaded by SRA together and auto-create a samplesheet
+    //
+    SRA_TO_SAMPLESHEET (
+        ch_sra_metadata,
+        params.nf_core_pipeline ?: '',
+        params.sample_mapping_fields
+    )

-        //
-        // MODULE: Create a merged samplesheet across all samples for the pipeline
-        //
-        SRA_MERGE_SAMPLESHEET (
-            SRA_TO_SAMPLESHEET.out.samplesheet.collect{it[1]},
-            SRA_TO_SAMPLESHEET.out.mappings.collect{it[1]}
-        )
-        ch_versions = ch_versions.mix(SRA_MERGE_SAMPLESHEET.out.versions)
+    //
+    // MODULE: Create a merged samplesheet across all samples for the pipeline
+    //
+    SRA_MERGE_SAMPLESHEET (
+        SRA_TO_SAMPLESHEET.out.samplesheet.collect{it[1]},
+        SRA_TO_SAMPLESHEET.out.mappings.collect{it[1]}
+    )
+    ch_versions = ch_versions.mix(SRA_MERGE_SAMPLESHEET.out.versions)

-        //
-        // MODULE: Create a MutiQC config file with sample name mappings
-        //
-        if (params.sample_mapping_fields) {
-            MULTIQC_MAPPINGS_CONFIG (
-                SRA_MERGE_SAMPLESHEET.out.mappings
-            )
-            ch_versions = ch_versions.mix(MULTIQC_MAPPINGS_CONFIG.out.versions)
-        }
+    //
+    // MODULE: Create a MultiQC config file with sample name mappings
+    //
+    if (params.sample_mapping_fields) {
+        MULTIQC_MAPPINGS_CONFIG (
+            SRA_MERGE_SAMPLESHEET.out.mappings
+        )
+        ch_versions = ch_versions.mix(MULTIQC_MAPPINGS_CONFIG.out.versions)
     }

     //
diff --git a/workflows/synapse.nf b/workflows/synapse.nf
index 2a673623..1203056a 100644
--- a/workflows/synapse.nf
+++ b/workflows/synapse.nf
@@ -31,7 +31,7 @@ include { SYNAPSE_MERGE_SAMPLESHEET } from '../modules/local/synapse_merge_sampl
 ========================================================================================
 */

-include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main'
+include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'

 /*
 ========================================================================================
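A recurring pattern in the `workflows/sra.nf` refactor above is cloning the meta map before mutating it. Channel elements in Nextflow are passed by reference, so mutating `meta` in place inside one operator can leak the change into other operators consuming the same element; cloning keeps each stage's view consistent. A minimal sketch of the idea in plain Groovy, runnable outside Nextflow, with illustrative values:

```groovy
// Plain-Groovy illustration of why workflows/sra.nf clones `meta` before
// mutating it: the original map stays untouched for any other consumer.
def meta = [id: 'SRR000001', single_end: 'true']   // values parsed from TSV arrive as strings

def meta_clone = meta.clone()
meta_clone.single_end = meta_clone.single_end.toBoolean()

assert meta.single_end == 'true'      // upstream view is unchanged
assert meta_clone.single_end == true  // downstream view carries the boolean
```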