diff --git a/psm_utils/io/fragpipe.py b/psm_utils/io/fragpipe.py
index fc07395..c5c64ed 100644
--- a/psm_utils/io/fragpipe.py
+++ b/psm_utils/io/fragpipe.py
@@ -17,6 +17,7 @@
 from abc import ABC
 from pathlib import Path
 from typing import Iterable, Optional
+from pyteomics.proforma import MassModification, to_proforma
 
 from psm_utils.io._base_classes import ReaderBase
 from psm_utils.io._utils import set_csv_field_size_limit
@@ -76,7 +77,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
 
         return PSM(
             peptidoform=self._parse_peptidoform(
-                psm_dict["Modified Peptide"], psm_dict["Peptide"], psm_dict["Charge"]
+                psm_dict["Peptide"], psm_dict["Assigned Modifications"], psm_dict["Charge"]
             ),
             spectrum_id=self._parse_spectrum_id(psm_dict["Spectrum"]),
             run=self._parse_run(psm_dict["Spectrum File"]),
@@ -98,22 +99,47 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM:
         )
 
     @staticmethod
-    def _parse_peptidoform(mod_peptide: str, peptide: str, charge: Optional[str]) -> str:
-        """Parse the peptidoform from the modified peptide, peptide, and charge columns."""
-        if mod_peptide:
-            peptide = mod_peptide
-            # N-terminal modification
-            if peptide.startswith("n"):
-                peptide = peptide[1:]
-                # A hyphen needs to be added after the N-terminal modification, thus after the ]
-                peptide = peptide.replace("]", "]-", 1)
-            # C-terminal modification
-            if peptide.endswith("]"):
-                if "c[" in peptide:
-                    peptide = peptide.replace("c[", "-[", 1)
-        if charge:
-            peptide += f"/{int(float(charge))}"
-        return peptide
+    def _parse_peptidoform(peptide: str, modifications: str, charge: Optional[str]) -> str:
+        """
+        Parse the peptidoform from the peptide, assigned modifications, and charge columns.
+
+        ``modifications`` is a comma-separated list of ``N-term(mass)``, ``C-term(mass)``, and
+        ``{position}{residue}(mass)`` entries (1-based position, e.g. ``3M(15.994915)``); it may
+        be empty or None for unmodified peptides. ``charge`` may be a float-like string; if it
+        is empty or None, no charge state is written.
+
+        Raises ``ValueError`` if a modification entry is malformed or does not match the
+        peptide sequence.
+        """
+        sequence = [(aa, []) for aa in peptide]
+        n_term, c_term = [], []
+        for mod_entry in (modifications or "").split(","):
+            mod_entry = mod_entry.strip()
+            if not mod_entry:
+                continue
+            site, _, mass = mod_entry.rstrip(")").partition("(")
+            try:
+                modification = MassModification(float(mass))
+            except ValueError as e:
+                raise ValueError(f"Invalid mass in modification `{mod_entry}`.") from e
+            if site == "N-term":
+                n_term.append(modification)
+            elif site == "C-term":
+                c_term.append(modification)
+            else:
+                # Residue sites are a 1-based position followed by the amino acid, e.g. `3M`.
+                position, residue = site[:-1], site[-1:]
+                idx = int(position) - 1 if position.isdecimal() else -1
+                # Raise instead of `assert` (stripped under -O); the bounds check also rejects
+                # position 0, which would otherwise silently address the last residue.
+                if not 0 <= idx < len(sequence) or sequence[idx][0] != residue:
+                    raise ValueError(
+                        f"Modification `{mod_entry}` does not match peptide `{peptide}`."
+                    )
+                sequence[idx][1].append(modification)
+        # As before this change: tolerate float-like charges and omit an empty/missing charge.
+        charge_state = int(float(charge)) if charge else None
+        return to_proforma(sequence, n_term=n_term, c_term=c_term, charge_state=charge_state)
 
     @staticmethod
     def _parse_spectrum_id(spectrum: str) -> str:
diff --git a/tests/test_io/test_fragpipe.py b/tests/test_io/test_fragpipe.py
index 9020c41..eea93b1 100644
--- a/tests/test_io/test_fragpipe.py
+++ b/tests/test_io/test_fragpipe.py
@@ -42,16 +42,16 @@ def test_iter(self):
 
     def test__parse_peptidoform(self):
         test_cases = [
-            (("LHM[147]TNQNMEKc[17]", "LHMTNQNMEK", "3"), "LHM[147]TNQNMEK-[17]/3"),
-            (("n[43]ANIAVQR", "ANIAVQR", "2"), "[43]-ANIAVQR/2"),
-            ((None, "IPAVTYPK", "2"), "IPAVTYPK/2"),
-            (("", "IPAVTYPK", "2"), "IPAVTYPK/2"),
-            (("", "IPAVTYPK", 2), "IPAVTYPK/2"),
+            (("LHMTNQNMEK", "3M(15.994915), C-term(17.034480)", "3"), "LHM[+15.9949]TNQNMEK-[+17.0345]/3"),
+            (("ANIAVQR", "N-term(42.0106)", "2"), "[+42.0106]-ANIAVQR/2"),
+            (("IPAVTYPK", "", "2"), "IPAVTYPK/2"),
+            (("IPAVTYPK", None, "2.0"), "IPAVTYPK/2"),
+            (("IPAVTYPK", "", ""), "IPAVTYPK"),
         ]
 
         reader = FragPipeReader("./tests/test_data/test_fragpipe.tsv")
-        for (peptide, modified_peptide, charge), expected in test_cases:
-            assert reader._parse_peptidoform(peptide, modified_peptide, charge) == expected
+        for (peptide, modifications, charge), expected in test_cases:
+            assert reader._parse_peptidoform(peptide, modifications, charge) == expected
 
     def test__parse_spectrum_id(self):
         test_cases = [