Skip to content

Commit

Permalink
Merge pull request #373 from daichengxin/dev
Browse files Browse the repository at this point in the history
Add DDA ID CI and fix some bugs
  • Loading branch information
ypriverol committed May 13, 2024
2 parents e3c95f0 + f571096 commit bb0b114
Show file tree
Hide file tree
Showing 17 changed files with 202 additions and 24 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
NXF_VER:
- "23.04.0"
- "latest-everything"
test_profile: ["test_lfq", "test_lfq_sage", "test_dia", "test_localize", "test_tmt"]
test_profile: ["test_lfq", "test_lfq_sage", "test_dia", "test_localize", "test_tmt", "test_dda_id"]
exec_profile: ["docker", "conda"]
exclude:
- test_profile: test_dia
Expand Down
19 changes: 9 additions & 10 deletions .github/workflows/linting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,12 @@ jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4
- uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4

- name: Set up Python 3.11
uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5
- name: Set up Python 3.12
uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5
with:
python-version: 3.11
cache: "pip"
python-version: "3.12"

- name: Install pre-commit
run: pip install pre-commit
Expand All @@ -32,14 +31,14 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Check out pipeline code
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4

- name: Install Nextflow
uses: nf-core/setup-nextflow@v1
uses: nf-core/setup-nextflow@v2

- uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5
- uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5
with:
python-version: "3.11"
python-version: "3.12"
architecture: "x64"

- name: Install dependencies
Expand All @@ -60,7 +59,7 @@ jobs:

- name: Upload linting log file artifact
if: ${{ always() }}
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4
with:
name: linting-logs
path: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/linting_comment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Download lint results
uses: dawidd6/action-download-artifact@f6b0bace624032e30a85a8fd9c1a7f8f611f5737 # v3
uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3
with:
workflow: linting.yml
workflow_conclusion: completed
Expand Down
38 changes: 38 additions & 0 deletions bin/add_sage_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env python
# Add extra features to Sage idXML files: Sage's extra features are not known
# inputs for PSMFeatureExtractor, so they must be registered in the idXML explicitly.

import pyopenms as oms
import pandas as pd
import sys


def add_feature(idx_file, output_file, feat_file):
    """Register ms2rescore-generated feature names in an idXML file.

    Reads the feature table produced by ms2rescore, collects every feature
    name that does not come from the PSM file itself (those are already known
    to PSMFeatureExtractor), appends them to the ``extra_features`` search
    parameter of the idXML, and writes the result to ``output_file``.

    :param idx_file: path to the input idXML with identifications
    :param output_file: path for the updated idXML
    :param feat_file: TSV with ``feature_generator`` and ``feature_name``
        columns, as written by ms2rescore
    """
    feat = pd.read_csv(feat_file, sep="\t")
    # Only features from generators other than the PSM file need registering.
    extra_feat = [
        row["feature_name"]
        for _, row in feat.iterrows()
        if row["feature_generator"] != "psm_file"
    ]
    print("Adding extra feature: {}".format(extra_feat))

    protein_ids = []
    peptide_ids = []
    oms.IdXMLFile().load(idx_file, protein_ids, peptide_ids)
    search_parameters = protein_ids[0].getSearchParameters()
    features = search_parameters.getMetaValue("extra_features")
    # Guard against an empty list: appending "," + "" would leave a trailing
    # comma and a phantom empty feature name in the meta value.
    if extra_feat:
        features = features + "," + ",".join(extra_feat)
    search_parameters.setMetaValue("extra_features", features)
    protein_ids[0].setSearchParameters(search_parameters)
    oms.IdXMLFile().store(output_file, protein_ids, peptide_ids)
    print("Done")


def main():
    """CLI entry point.

    Usage: ``add_sage_feature.py <input.idXML> <output.idXML> <feature.tsv>``

    Returns a non-zero status (passed through to ``sys.exit``) when the wrong
    number of arguments is supplied, instead of crashing with an IndexError.
    """
    # sys.argv[0] is the script name, so exactly three positionals are required.
    if len(sys.argv) != 4:
        print(
            "Usage: add_sage_feature.py <input.idXML> <output.idXML> <feature.tsv>",
            file=sys.stderr,
        )
        return 1
    idx_file, output_file, feat_file = sys.argv[1:4]
    add_feature(idx_file, output_file, feat_file)


if __name__ == "__main__":
    sys.exit(main())
Empty file modified bin/extract_sample.py
100644 → 100755
Empty file.
1 change: 1 addition & 0 deletions bin/ms2rescore_cli.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python
# Written by Jonas Scheid under the MIT license


import sys
import click
import importlib.resources
Expand Down
Empty file modified bin/psm_conversion.py
100644 → 100755
Empty file.
9 changes: 9 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -317,4 +317,13 @@ process {
]
}

    // Publishing rules for the SAGEFEATURE step of the DDA_ID subworkflow.
    // Only "*.log" files match the pattern, so the produced idXML stays in the
    // work dir for downstream steps; the saveAs closure additionally drops
    // versions.yml (kept for consistency with the other stanzas in this file).
    withName: '.*:DDA_ID:SAGEFEATURE' {
        publishDir = [
            path: { "${params.outdir}/addsagefeature" },
            pattern: "*.log",
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }

}
37 changes: 37 additions & 0 deletions conf/test_dda_id.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Nextflow config file for running minimal tests of the DDA identification branch
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Defines input files and everything required to run a fast and simple CI test.

    Use as follows:
        nextflow run nf-core/quantms -profile test_dda_id,<docker/singularity> [--outdir <OUTDIR>]
------------------------------------------------------------------------------------------------
*/

params {
    config_profile_name        = 'Test profile for DDA ID'
    config_profile_description = 'Minimal test dataset to check the DDA identification branch of the pipeline'

    // Limit resources so that this can run on GitHub Actions
    max_cpus   = 2
    max_memory = 6.GB
    max_time   = 48.h

    outdir = "./results_lfq_dda_id"

    // Input data
    input    = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/tmt_ci/PXD000001.sdrf.tsv'
    database = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/tmt_ci/erwinia_carotovora.fasta'

    // Identification / rescoring options exercised by this profile
    posterior_probabilities  = "percolator"
    search_engines           = "msgf,comet"
    add_decoys               = true
    decoy_string             = "rev"
    protein_level_fdr_cutoff = 0.01
    psm_level_fdr_cutoff     = 1.0
    pmultiqc_idxml_skip      = false
    id_only                  = true
    enable_pmultiqc          = false
    ms2rescore               = true
}
32 changes: 32 additions & 0 deletions modules/local/add_sage_feat/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Registers ms2rescore's extra feature names in the search parameters of a
// Sage idXML file (via bin/add_sage_feature.py), since Sage's extra features
// are not known inputs for PSMFeatureExtractor.
process SAGEFEATURE {
    tag "$meta.mzml_id"
    label 'process_low'

    // pyopenms supplies the IdXMLFile reader/writer used by the helper script.
    conda "bioconda::pyopenms=3.1.0"
    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
        container "https://depot.galaxyproject.org/singularity/pyopenms:3.1.0--py39h9b8898c_0"
    } else {
        container "biocontainers/pyopenms:3.1.0--py39h9b8898c_0"
    }

    input:
    // id_file: idXML with identifications; extra_feat: TSV of feature
    // generator/name pairs emitted by the ms2rescore step
    tuple val(meta), path(id_file), path(extra_feat)

    output:
    tuple val(meta), path("${id_file.baseName}_feat.idXML"), emit: id_files_feat
    path "versions.yml", emit: version
    path "*.log", emit: log

    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.mzml_id}"

    """
    add_sage_feature.py "${id_file}" "${id_file.baseName}_feat.idXML" "${extra_feat}" 2>&1 | tee add_sage_feature.log

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        pyopenms: \$(pip show pyopenms | grep "Version" | awk -F ': ' '{print \$2}')
    END_VERSIONS
    """
}
39 changes: 39 additions & 0 deletions modules/local/add_sage_feat/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
name: SAGEFEATURE
description: A module to add ms2rescore extra features to Sage idXML files
keywords:
- features
- ms2rescore
tools:
- custom:
description: |
A custom module that registers extra features from ms2rescore in Sage idXML files.
homepage: https://github.com/bigbio/quantms
documentation: https://github.com/bigbio/quantms/tree/readthedocs
input:
- meta:
type: map
description: Groovy Map containing sample information
  - id_file:
      type: file
      description: |
        Input idXML file containing the identifications.
      pattern: "*.idXML"
  - extra_feat:
      type: file
      description: |
        TSV file listing the feature generators and feature names produced by ms2rescore.
      pattern: "*.tsv"
output:
- meta:
type: map
description: Groovy Map containing sample information
- id_files_feat:
type: file
description: |
Output file in idXML format
pattern: "*.idXML"
- log:
type: file
description: log file
pattern: "*.log"
- version:
type: file
description: File containing software version
pattern: "versions.yml"
authors:
- "@daichengxin"
6 changes: 3 additions & 3 deletions modules/local/extract_psm/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ process PSMCONVERSION {
tag "$meta.mzml_id"
label 'process_medium'

conda "bioconda::pyopenms=3.1.0"
conda "bioconda::pyopenms=2.8.0"
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/pyopenms:3.1.0--py39h9b8898c_0"
container "https://depot.galaxyproject.org/singularity/pyopenms:2.8.0--py38hd8d5640_1"
} else {
container "biocontainers/pyopenms:3.1.0--py39h9b8898c_0"
container "biocontainers/pyopenms:2.8.0--py38hd8d5640_1"
}

input:
Expand Down
13 changes: 10 additions & 3 deletions modules/local/ms2rescore/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ process MS2RESCORE {
tag "$meta.mzml_id"
label 'process_high'

conda "bioconda::ms2rescore=3.0.2"
conda "bioconda::ms2rescore=3.0.2 bioconda::psm-utils=0.8.0 conda-forge::pydantic=1.10"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/ms2rescore:3.0.2--pyhdfd78af_0':
'biocontainers/ms2rescore:3.0.2--pyhdfd78af_0' }"
Expand Down Expand Up @@ -36,19 +36,26 @@ process MS2RESCORE {
ms2_tolerence = 0.02
}

if (params.decoy_string_position == "prefix") {
decoy_pattern = "^${params.decoy_string}"
} else {
decoy_pattern = "${params.decoy_string}\$"
}

"""
ms2rescore_cli.py \\
--psm_file $idxml \\
--spectrum_path . \\
--ms2_tolerance $ms2_tolerence \\
--output_path ${idxml.baseName}_ms2rescore.idXML \\
--processes $task.cpus \\
--id_decoy_pattern $decoy_pattern \\
$args \\
2>&1 | tee ${meta.mzml_id}_ms2rescore.log
cat <<-END_VERSIONS > versions.yml
"${task.process}":
MS²Rescore: \$(echo \$(ms2rescore --version 2>&1) | grep -oP 'MS²Rescore \\(v\\K[^\\)]+' ))
MS2Rescore: \$(echo \$(ms2rescore --version 2>&1) | grep -oP 'MS²Rescore \\(v\\K[^\\)]+' )
END_VERSIONS
"""

Expand All @@ -61,7 +68,7 @@ process MS2RESCORE {
cat <<-END_VERSIONS > versions.yml
"${task.process}":
MS²Rescore: \$(echo \$(ms2rescore --version 2>&1) | grep -oP 'MS²Rescore \\(v\\K[^\\)]+' ))
MS2Rescore: \$(echo \$(ms2rescore --version 2>&1) | grep -oP 'MS²Rescore \\(v\\K[^\\)]+' )
END_VERSIONS
"""
}
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,7 @@ profiles {
test_full_tmt { includeConfig 'conf/test_full_tmt.config' }
test_full_dia { includeConfig 'conf/test_full_dia.config' }
test_full { includeConfig 'conf/test_full_lfq.config' }
test_dda_id { includeConfig 'conf/test_dda_id.config' }
mambaci { includeConfig 'conf/mambaci.config' }

}
Expand Down
21 changes: 17 additions & 4 deletions subworkflows/local/dda_id.nf
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ include { PSMCONVERSION } from '../../modules/local/extract_psm
include { MS2RESCORE } from '../../modules/local/ms2rescore/main'
include { IDSCORESWITCHER } from '../../modules/local/openms/idscoreswitcher/main'
include { GETSAMPLE } from '../../modules/local/extract_sample/main'
include { SAGEFEATURE } from '../../modules/local/add_sage_feat/main'

//
// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
Expand Down Expand Up @@ -48,6 +49,8 @@ workflow DDA_ID {
return [meta, filename, []]
}.set{ch_id_files_branched}

ch_pmultiqc_consensus = Channel.empty()
ch_pmultiqc_ids = Channel.empty()

//
// SUBWORKFLOW: Rescoring
Expand All @@ -60,13 +63,14 @@ workflow DDA_ID {

MS2RESCORE.out.idxml.join(MS2RESCORE.out.feature_names).branch{ meta, idxml, feature_name ->
sage: idxml.name.contains('sage')
return [meta, idxml]
return [meta, idxml, feature_name]
nosage: true
return [meta, idxml, feature_name]
}.set{ch_ms2rescore_branched}

EXTRACTPSMFEATURES(ch_ms2rescore_branched.nosage)
ch_id_files_feats = EXTRACTPSMFEATURES.out.id_files_feat.mix(ch_ms2rescore_branched.sage)
SAGEFEATURE(ch_ms2rescore_branched.sage)
ch_id_files_feats = EXTRACTPSMFEATURES.out.id_files_feat.mix(SAGEFEATURE.out.id_files_feat)
ch_software_versions = ch_software_versions.mix(EXTRACTPSMFEATURES.out.version)
} else {
EXTRACTPSMFEATURES(ch_id_files_branched.nosage)
Expand Down Expand Up @@ -143,13 +147,15 @@ workflow DDA_ID {

}

ch_rescoring_results = ch_consensus_input

} else if (params.posterior_probabilities == 'mokapot') {
MS2RESCORE(ch_id_files.combine(ch_file_preparation_results, by: 0))
ch_software_versions = ch_software_versions.mix(MS2RESCORE.out.versions)
IDSCORESWITCHER(MS2RESCORE.out.idxml.combine(Channel.value("PEP")))
ch_software_versions = ch_software_versions.mix(IDSCORESWITCHER.out.version)
ch_consensus_input = IDSCORESWITCHER.out.id_score_switcher.combine(Channel.value("MS:1001491"))
ch_rescoring_results = IDSCORESWITCHER.out.id_files_ForIDPEP
} else {
ch_fdridpep = Channel.empty()
if (params.search_engines.split(",").size() == 1) {
Expand All @@ -161,6 +167,7 @@ workflow DDA_ID {
IDPEP(ch_fdridpep.mix(ch_id_files))
ch_software_versions = ch_software_versions.mix(IDPEP.out.version)
ch_consensus_input = IDPEP.out.id_files_ForIDPEP
ch_rescoring_results = ch_consensus_input
}

//
Expand All @@ -172,24 +179,30 @@ workflow DDA_ID {
CONSENSUSID(ch_consensus_input.groupTuple(size: params.search_engines.split(",").size()))
ch_software_versions = ch_software_versions.mix(CONSENSUSID.out.version.ifEmpty(null))
ch_psmfdrcontrol = CONSENSUSID.out.consensusids
ch_consensus_results = CONSENSUSID.out.consensusids
ch_psmfdrcontrol
.map { it -> it[1] }
.set { ch_pmultiqc_consensus }
} else {
ch_psmfdrcontrol = ch_consensus_input
}

PSMFDRCONTROL(ch_psmfdrcontrol)
ch_software_versions = ch_software_versions.mix(PSMFDRCONTROL.out.version.ifEmpty(null))


// Extract PSMs and export parquet format
PSMCONVERSION(PSMFDRCONTROL.out.id_filtered.combine(ch_spectrum_data, by: 0))

ch_rescoring_results
.map { it -> it[1] }
.set { ch_pmultiqc_ids }
} else {
PSMCONVERSION(ch_id_files.combine(ch_spectrum_data, by: 0))
}


emit:
ch_pmultiqc_ids = ch_pmultiqc_ids
ch_pmultiqc_consensus = ch_pmultiqc_consensus
version = ch_software_versions
}

Expand Down
Loading

0 comments on commit bb0b114

Please sign in to comment.