
Commit

Merge branch 'dev' into dev
daichengxin authored Mar 24, 2024
2 parents d090107 + 0b9c5ab commit 0c87e02
Showing 14 changed files with 384 additions and 24 deletions.
3 changes: 2 additions & 1 deletion assets/adaptivecard.json
@@ -54,7 +54,8 @@
"body": [
{
"type": "FactSet",
"facts": [<% out << summary.collect{ k,v -> "{\"title\": \"$k\", \"value\" : \"$v\"}"}.join(",\n") %>
"facts": [<% out << summary.collect{ k,v -> "{\"title\": \"$k\", \"value\" : \"$v\"}"
}.join(",\n") %>
]
}
]
24 changes: 19 additions & 5 deletions bin/mzml_statistics.py
@@ -7,12 +7,12 @@
import sys
from pathlib import Path
import sqlite3

import re
import pandas as pd
from pyopenms import MSExperiment, MzMLFile


def ms_dataframe(ms_path: str) -> None:
def ms_dataframe(ms_path: str, id_only: bool = False) -> None:
file_columns = [
"SpectrumID",
"MSLevel",
@@ -25,8 +25,9 @@ def ms_dataframe(ms_path: str) -> None:
"AcquisitionDateTime",
]

def parse_mzml(file_name: str, file_columns: list):
def parse_mzml(file_name: str, file_columns: list, id_only: bool = False):
info = []
psm_part_info = []
exp = MSExperiment()
acquisition_datetime = exp.getDateTime().get()
MzMLFile().load(file_name, exp)
@@ -54,11 +55,23 @@ def parse_mzml(file_name: str, file_columns: list):
charge_state = spectrum.getPrecursors()[0].getCharge()
emz = spectrum.getPrecursors()[0].getMZ() if spectrum.getPrecursors()[0].getMZ() else None
info_list = [id_, MSLevel, charge_state, peak_per_ms, bpc, tic, rt, emz, acquisition_datetime]
mz_array = peaks_tuple[0]
intensity_array = peaks_tuple[1]
else:
info_list = [id_, MSLevel, None, None, None, None, rt, None, acquisition_datetime]

if id_only and MSLevel == 2:
psm_part_info.append([re.findall(r"[scan|spectrum]=(\d+)", id_)[0], MSLevel, mz_array, intensity_array])
info.append(info_list)

if id_only and len(psm_part_info) > 0:
pd.DataFrame(psm_part_info, columns=["scan", "ms_level", "mz", "intensity"]).to_csv(
f"{Path(ms_path).stem}_spectrum_df.csv",
mode="w",
index=False,
header=True,
)

return pd.DataFrame(info, columns=file_columns)

def parse_bruker_d(file_name: str, file_columns: list):
@@ -139,7 +152,7 @@ def parse_bruker_d(file_name: str, file_columns: list):
if Path(ms_path).suffix == ".d" and Path(ms_path).is_dir():
ms_df = parse_bruker_d(ms_path, file_columns)
elif Path(ms_path).suffix in [".mzML", ".mzml"]:
ms_df = parse_mzml(ms_path, file_columns)
ms_df = parse_mzml(ms_path, file_columns, id_only)
else:
msg = f"Unrecognized or inexistent mass spec file '{ms_path}'"
raise RuntimeError(msg)
@@ -155,7 +168,8 @@ def parse_bruker_d(file_name: str, file_columns: list):

def main():
ms_path = sys.argv[1]
ms_dataframe(ms_path)
id_only = sys.argv[2]
ms_dataframe(ms_path, id_only)


if __name__ == "__main__":
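
For context, a minimal sketch of how the updated script might be run on its own (the file name and flag value are hypothetical; in the pipeline, the MZMLSTATISTICS module further down passes $params.id_only as the second positional argument):

import subprocess
from pathlib import Path

# Hypothetical local run of the updated script; "true" stands in for $params.id_only.
subprocess.run(["python", "mzml_statistics.py", "sample.mzML", "true"], check=True)

# With id_only enabled (and MS2 spectra present), an extra per-file spectrum table is
# written alongside the usual statistics output, named after the mzML stem as in parse_mzml().
assert Path("sample_spectrum_df.csv").exists()

# Note, based on the hunks shown above: sys.argv[2] reaches ms_dataframe() as a string,
# so any non-empty value (including the literal "false") is truthy in
# `if id_only and MSLevel == 2`, unless it is converted elsewhere in the file.
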
117 changes: 117 additions & 0 deletions bin/psm_conversion.py
@@ -0,0 +1,117 @@
#!/usr/bin/env python
import numpy as np
import pyopenms as oms
import pandas as pd
import re
import os
from pathlib import Path
import sys

_parquet_field = [
"sequence", "protein_accessions", "protein_start_positions", "protein_end_positions",
"modifications", "retention_time", "charge", "calc_mass_to_charge", "reference_file_name",
"scan_number", "peptidoform", "posterior_error_probability", "global_qvalue", "is_decoy",
"consensus_support", "mz_array", "intensity_array", "num_peaks", "search_engines", "id_scores", "hit_rank"
]


def mods_position(peptide):
pattern = re.compile(r"\((.*?)\)")
original_mods = pattern.findall(peptide)
peptide = re.sub(r"\(.*?\)", ".", peptide)
position = [i.start() for i in re.finditer(r"\.", peptide)]
for j in range(1, len(position)):
position[j] -= j

for k in range(0, len(original_mods)):
original_mods[k] = str(position[k]) + "-" + original_mods[k]

original_mods = [str(i) for i in original_mods] if len(original_mods) > 0 else np.nan

return original_mods


def convert_psm(idxml, spectra_file, export_decoy_psm):
prot_ids = []
pep_ids = []
parquet_data = []
consensus_support = np.nan
mz_array = []
intensity_array = []
num_peaks = np.nan
id_scores = []
search_engines = []

oms.IdXMLFile().load(idxml, prot_ids, pep_ids)
if "ConsensusID" in prot_ids[0].getSearchEngine():
if prot_ids[0].getSearchParameters().metaValueExists("SE:MS-GF+"):
search_engines = ["MS-GF+"]
if prot_ids[0].getSearchParameters().metaValueExists("SE:Comet"):
search_engines.append("Comet")
if prot_ids[0].getSearchParameters().metaValueExists("SE:Sage"):
search_engines.append("Sage")
else:
search_engines = [prot_ids[0].getSearchEngine()]

reference_file_name = os.path.splitext(prot_ids[0].getMetaValue("spectra_data")[0].decode("UTF-8"))[0]
spectra_df = pd.read_csv(spectra_file) if spectra_file else None

for peptide_id in pep_ids:
retention_time = peptide_id.getRT()
calc_mass_to_charge = peptide_id.getMZ()
scan_number = int(re.findall(r"(spectrum|scan)=(\d+)", peptide_id.getMetaValue("spectrum_reference"))[0][1])

if isinstance(spectra_df, pd.DataFrame):
spectra = spectra_df[spectra_df["scan"] == scan_number]
mz_array = spectra["mz"].values[0]
intensity_array = spectra["intensity"].values[0]
num_peaks = len(mz_array)

for hit in peptide_id.getHits():
# if remove decoy when mapped to target+decoy?
is_decoy = 0 if hit.getMetaValue("target_decoy") == "target" else 1
if export_decoy_psm == "false" and is_decoy:
continue
global_qvalue = np.nan
if len(search_engines) > 1:
if "q-value" in peptide_id.getScoreType():
global_qvalue = hit.getScore()
consensus_support = hit.getMetaValue("consensus_support")
elif search_engines == "Comet":
id_scores = ["Comet:Expectation value: " + str(hit.getScore())]
elif search_engines == "MS-GF+":
id_scores = ["MS-GF:SpecEValue: " + str(hit.getScore())]
elif search_engines == "Sage":
id_scores = ["Sage:hyperscore: " + str(hit.getScore())]

charge = hit.getCharge()
peptidoform = hit.getSequence().toString()
modifications = mods_position(peptidoform)
sequence = hit.getSequence().toUnmodifiedString()
protein_accessions = [ev.getProteinAccession() for ev in hit.getPeptideEvidences()]
posterior_error_probability = hit.getMetaValue("Posterior Error Probability_score")
protein_start_positions = [ev.getStart() for ev in hit.getPeptideEvidences()]
protein_end_positions = [ev.getEnd() for ev in hit.getPeptideEvidences()]
hit_rank = hit.getRank()

parquet_data.append([sequence, protein_accessions, protein_start_positions, protein_end_positions,
modifications, retention_time, charge, calc_mass_to_charge, reference_file_name,
scan_number, peptidoform, posterior_error_probability, global_qvalue, is_decoy,
consensus_support, mz_array, intensity_array, num_peaks, search_engines, id_scores,
hit_rank])

pd.DataFrame(parquet_data, columns=_parquet_field).to_csv(f"{Path(idxml).stem}_psm.csv",
mode="w",
index=False,
header=True)


def main():
idxml_path = sys.argv[1]
spectra_file = sys.argv[2]
export_decoy_psm = sys.argv[3]
convert_psm(idxml_path, spectra_file, export_decoy_psm)


if __name__ == "__main__":
sys.exit(main())
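
As a quick, hypothetical check of the new mods_position() helper (assuming bin/ is on PYTHONPATH so the script is importable as a module), the expected outputs follow directly from the regex logic above:

from psm_conversion import mods_position  # assumed import path

# OpenMS-style peptidoforms: each "(...)" modification is reported as a "position-name"
# label, where the position is the 1-based index of the residue the modification follows.
print(mods_position("PEPTM(Oxidation)IDE"))              # ['5-Oxidation']
print(mods_position("PEM(Oxidation)TIDEM(Oxidation)K"))  # ['3-Oxidation', '8-Oxidation']
print(mods_position("PEPTIDE"))                          # nan (no modifications present)
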
10 changes: 10 additions & 0 deletions conf/modules.config
@@ -245,6 +245,16 @@ process {
]
}

withName: '.*:DDA_ID:PSMFDRCONTROL:IDFILTER' {
ext.args = "-score:pep \"$params.run_fdr_cutoff\""
ext.suffix = '.idXML'
publishDir = [
path: { "${params.outdir}/idfilter" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

// PROTEOMICSLFQ
withName: '.*:LFQ:PROTEOMICSLFQ' {
ext.args = "-debug $params.plfq_debug"
36 changes: 36 additions & 0 deletions modules/local/extract_psm/main.nf
@@ -0,0 +1,36 @@
process PSMCONVERSION {
tag "$meta.mzml_id"
label 'process_medium'

conda "bioconda::pyopenms=3.1.0"
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/pyopenms:3.1.0--py39h9b8898c_0"
} else {
container "biocontainers/pyopenms:3.1.0--py39h9b8898c_0"
}

input:
tuple val(meta), path(idxml_file), path(spectrum_df)

output:
path "*_psm.csv", emit: psm_info
path "versions.yml", emit: version
path "*.log", emit: log

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.mzml_id}"


"""
psm_conversion.py "${idxml_file}" \\
${spectrum_df} \\
$params.export_decoy_psm \\
2>&1 | tee extract_idxml.log
cat <<-END_VERSIONS > versions.yml
"${task.process}":
pyopenms: \$(pip show pyopenms | grep "Version" | awk -F ': ' '{print \$2}')
END_VERSIONS
"""
}
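
The new process pairs each idXML with the *_spectrum_df.csv emitted by MZMLSTATISTICS. A minimal pandas sketch of that hand-off (column names come from the two scripts above; the file name and rows are made up):

import pandas as pd

# Stand-in for what mzml_statistics.py writes when id_only is enabled.
pd.DataFrame({
    "scan": [2, 3],
    "ms_level": [2, 2],
    "mz": [[300.07, 450.21], [512.30]],
    "intensity": [[1.2e4, 3.1e3], [8.0e2]],
}).to_csv("sample_spectrum_df.csv", index=False)

# psm_conversion.py reloads the table and selects the row whose scan number matches the PSM.
spectra_df = pd.read_csv("sample_spectrum_df.csv")
spectra = spectra_df[spectra_df["scan"] == 2]
mz_array = spectra["mz"].values[0]
intensity_array = spectra["intensity"].values[0]

# Caveat: lists round-tripped through CSV come back as strings (e.g. "[300.07, 450.21]"),
# so consumers of the resulting *_psm.csv must parse them if numeric arrays are needed.
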
34 changes: 34 additions & 0 deletions modules/local/extract_psm/meta.yml
@@ -0,0 +1,34 @@
name: PSMCONVERSION
description: A module to extract PSM information from idXML file
keywords:
- PSM
- conversion
tools:
- custom:
description: |
A custom module for PSM extraction.
homepage: https://github.com/bigbio/quantms
documentation: https://github.com/bigbio/quantms/tree/readthedocs
input:
- idxml_file:
type: file
description: idXML identification file
pattern: "*.idXML"
- spectrum_df:
type: file
description: spectrum data file
pattern: "_spectrum_df.csv"
- meta:
type: map
description: Groovy Map containing sample information
output:
- psm_info:
type: file
description: PSM csv file
pattern: "*_psm.csv"
- version:
type: file
description: File containing software version
pattern: "versions.yml"
authors:
- "@daichengxin"
2 changes: 2 additions & 0 deletions modules/local/mzmlstatistics/main.nf
@@ -15,6 +15,7 @@ process MZMLSTATISTICS {

output:
path "*_ms_info.tsv", emit: ms_statistics
tuple val(meta), path("*_spectrum_df.csv"), emit: spectrum_df
path "versions.yml", emit: version
path "*.log", emit: log

@@ -24,6 +25,7 @@

"""
mzml_statistics.py "${ms_file}" \\
$params.id_only \\
2>&1 | tee mzml_statistics.log
cat <<-END_VERSIONS > versions.yml
4 changes: 4 additions & 0 deletions modules/local/mzmlstatistics/meta.yml
@@ -19,6 +19,10 @@ output:
type: file
description: mzMLs statistics file
pattern: "*_mzml_info.tsv"
- spectrum_df:
type: file
description: spectrum data file
pattern: "_spectrum_df.csv"
- version:
type: file
description: File containing software version
4 changes: 3 additions & 1 deletion modules/local/openms/thirdparty/searchenginemsgf/main.nf
@@ -49,9 +49,11 @@ process SEARCHENGINEMSGF {
}

num_enzyme_termini = ""
max_missed_cleavages = "-max_missed_cleavages ${params.allowed_missed_cleavages}"
if (meta.enzyme == "unspecific cleavage")
{
num_enzyme_termini = "none"
max_missed_cleavages = ""
}
else if (params.num_enzyme_termini == "fully")
{
@@ -75,7 +77,7 @@
-max_precursor_charge $params.max_precursor_charge \\
-min_peptide_length $params.min_peptide_length \\
-max_peptide_length $params.max_peptide_length \\
-max_missed_cleavages $params.allowed_missed_cleavages \\
${max_missed_cleavages} \\
-isotope_error_range $params.isotope_error_range \\
-enzyme "${enzyme}" \\
-tryptic ${msgf_num_enzyme_termini} \\
5 changes: 5 additions & 0 deletions nextflow.config
@@ -15,13 +15,15 @@ params {
local_input_type = 'mzML'
database = null
acquisition_method = null
id_only = false

// Input options
input = null

// Tools flags
posterior_probabilities = 'percolator'
add_decoys = false
skip_rescoring = false
search_engines = 'comet'
sage_processes = 1
run_fdr_cutoff = 0.10
@@ -106,6 +108,9 @@ params {
// IDPEP flags
outlier_handling = "none"

// DDA_ID flags
export_decoy_psm = true

// Percolator flags
train_FDR = 0.05
test_FDR = 0.05
18 changes: 18 additions & 0 deletions nextflow_schema.json
@@ -58,6 +58,18 @@
"description": "Proteomics data acquisition method",
"enum": ["dda", "dia"],
"fa_icon": "far fa-list-ol"
},
"id_only": {
"type": "boolean",
"description": "Only perform identification subworkflow.",
"fa_icon": "far fa-check-square",
"help_text": "Only perform identification subworkflow for specific cases."
},
"export_decoy_psm": {
"type": "boolean",
"description": "Whether export PSM from decoy in final identification results",
"fa_icon": "far fa-check-square",
"help_text": "Whether export PSM from decoy in final identification results for dda_id subworkflow for specific cases."
}
}
},
@@ -416,6 +428,12 @@
"description": "Choose between different rescoring/posterior probability calculation methods and set them up.",
"default": "",
"properties": {
"skip_rescoring": {
"type": "boolean",
"description": "Skip PSM rescoring steps for specific cases, such as studying pure search engine results and search engine ranks",
"default": false,
"fa_icon": "far fa-check-square"
},
"posterior_probabilities": {
"type": "string",
"description": "How to calculate posterior probabilities for PSMs:\n\n* 'percolator' = Re-score based on PSM-feature-based SVM and transform distance\n to hyperplane for posteriors\n* 'fit_distributions' = Fit positive and negative distributions to scores\n (similar to PeptideProphet)",