Skip to content

Commit

Permalink
almost there, but abbreviations might be wrong
Browse files Browse the repository at this point in the history
  • Loading branch information
manulera committed Mar 15, 2024
1 parent db664de commit 9ea7cea
Show file tree
Hide file tree
Showing 10 changed files with 17,005 additions and 11,609 deletions.
10,129 changes: 5,068 additions & 5,061 deletions data/alleles.tsv

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions data/chromosome1.contig
Original file line number Diff line number Diff line change
Expand Up @@ -34754,7 +34754,7 @@ FT db_xref=PMID:30566651"
FT CDS join(2167206..2167245,2167302..2168683)
FT /colour=7
FT /product="mitochondrial inner membrane protein involved in
FT translation Mdm28 (predicted)"
FT translation Mdm38 (predicted)"
FT /systematic_id="SPAC23C11.17"
FT /db_xref="SPD:24/24E06"
FT /controlled_curation="term=sequence feature, transmembrane
Expand All @@ -34777,7 +34777,8 @@ FT /controlled_curation="term=human LETM1 ortholog;
FT date=20100614"
FT /controlled_curation="term=warning, gene structure
FT updated; date=20220825"
FT /primary_name="mdm28"
FT /synonym="mdm28"
FT /primary_name="mdm38"
FT intron 2167246..2167301
FT /controlled_curation="term=misc, confirmed intron"
FT /systematic_id="SPAC23C11.17"
Expand Down
4 changes: 2 additions & 2 deletions data/chromosome2.contig
Original file line number Diff line number Diff line change
Expand Up @@ -60868,8 +60868,8 @@ FT /feature_source="evidence=ECO:0000309; source=SO:0001238;
FT condition=FYECO:0000126; during=GO:0072690;
FT score=5.463740; db_xref=PMID:30566651"
FT CDS join(3734792..3735902,3735949..3736232,3736288..3736725,3736788..3736904)
FT /colour=7
FT /product="ER protein folding protein (predicted)"
FT /colour=10
FT /product="ER membrane protein"
FT /db_xref="SPD:27/27H12"
FT /systematic_id="SPBC13G1.05"
FT /controlled_curation="term=sequence feature, transmembrane
Expand Down
Binary file modified data/genome.pickle
Binary file not shown.
18,400 changes: 11,872 additions & 6,528 deletions data/pombase-chado.modifications

Large diffs are not rendered by default.

34 changes: 30 additions & 4 deletions protein_modification_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,17 @@ def check_func(row, genome, allowed_mod_dict):
type='dummy',
rule_name='dummy',
regex=f'(?<!{aa})({aa})(\d+){aa}?',
apply_syntax=lambda x: f'{x[0]}{x[1]}',
)
result = replace_allele_features_with_syntax_rules([dummy_rule], [row['sequence_position']], [], gene)
# Special abbreviations for CTD modifications
ctd_rule = SyntaxRule(
type='ctd_abbreviations',
rule_name='ctd_abbreviations',
regex='(CTD_S2|CTD_T4|CTD_S5|CTD_S7)',
apply_syntax=lambda x: x[0]
)

result = replace_allele_features_with_syntax_rules([dummy_rule, ctd_rule], [row['sequence_position']], [], gene)

# Extract the matched and unmatched elements
match_groups: list[tuple[re.Match, SyntaxRule]] = list(filter(lambda x: type(x) != str, result))
Expand All @@ -70,21 +79,38 @@ def check_func(row, genome, allowed_mod_dict):
if len(unmatched):
return 'pattern_error', ''

correct_name = ','.join(''.join(match_group[0].groups()) for match_group in match_groups)
correct_name_list = list()
for match_group in match_groups:
groups_from_match = match_group[0].groups()
syntax_rule = match_group[1]
correct_name_list.append(syntax_rule.apply_syntax(groups_from_match))

correct_name = ','.join(correct_name_list)

change_sequence_position_to = ''
if correct_name != row['sequence_position']:
change_sequence_position_to = correct_name

errors = [check_sequence_single_pos(match_group[0].groups(), gene, 'peptide') for match_group in match_groups]
# Error handling omitted for CTD
errors = list()
for match_group in match_groups:
if match_group[1].rule_name == 'ctd_abbreviations':
if systematic_id == 'SPBC28F2.12':
continue
else:
errors.append('no_ctd')
else:
errors.append(check_sequence_single_pos(match_group[0].groups(), gene, 'peptide'))

if any(errors):
return '|'.join(errors), change_sequence_position_to

# If there are restrictions for this particular MOD, check for those
if allowed_mod_dict[row['modification']]:
# Get all letters in the sequence_position
residues = set(x for x in re.findall('[a-zA-Z]', row['sequence_position']))
# We use ([A-Za-z])(?=\d) instead of [A-Za-z] so that CTD abbreviations such as CTD_S2
# are also supported
residues = set(x for x in re.findall('([A-Za-z])(?=\d)', row['sequence_position']))
if any(residue not in allowed_mod_dict[row['modification']] for residue in residues):
return 'residue_not_allowed', change_sequence_position_to

Expand Down
17 changes: 17 additions & 0 deletions protein_modification_transvar.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,19 @@
tqdm.pandas()


def expand_CTD_abbreviations(sequence_position: str) -> str:
    """Expand CTD abbreviations into explicit comma-separated residue positions.

    Every occurrence of ``CTD_S2``, ``CTD_T4``, ``CTD_S5`` or ``CTD_S7`` in
    ``sequence_position`` is replaced by the corresponding list of positions.
    Any other text in the string is returned untouched.

    Args:
        sequence_position: Value of the ``sequence_position`` field, possibly
            containing one or more CTD abbreviations.

    Returns:
        The input string with each abbreviation substituted by its expansion.
    """
    # Each expansion must be strictly comma-separated with no whitespace,
    # so that the result can be split on ',' like any other position list.
    #
    # NOTE(review): apart from the separator, the values below are kept exactly
    # as committed, but several look suspect and should be checked against the
    # actual protein sequence before being relied on:
    #   * CTD_S5 expands to 'T'-prefixed residues rather than 'S'.
    #   * CTD_T4 uses the same position numbers as CTD_S7 in most entries.
    #   * Some entries break the otherwise regular spacing of 7
    #     (e.g. 1615 / 1613, T1663, T1723).
    abbreviations = {
        "CTD_S2": "S1579,S1586,S1593,S1600,S1607,S1614,S1621,S1628,S1635,S1642,S1649,S1656,S1663,S1670,S1677,S1684,S1691,S1698,S1705,S1712,S1719,S1726,S1733,S1740,S1747",
        "CTD_T4": "T1584,T1591,T1598,T1605,T1612,T1619,T1626,T1615,T1640,T1647,T1654,T1661,T1663,T1675,T1682,T1689,T1696,T1703,T1710,T1717,T1723,T1731,T1738,T1745,T1752",
        "CTD_S5": "T1582,T1589,T1596,T1603,T1610,T1617,T1624,T1613,T1638,T1645,T1652,T1659,T1666,T1673,T1680,T1687,T1694,T1701,T1708,T1715,T1722,T1729,T1736,T1743,T1750",
        # Fixed: 'S1682 S1689' was missing its comma, which fused two positions into one.
        "CTD_S7": "S1584,S1591,S1598,S1605,S1612,S1619,S1626,S1615,S1640,S1647,S1654,S1661,S1668,S1675,S1682,S1689,S1696,S1703,S1710,S1717,S1724,S1731,S1738,S1745,S1752",
    }
    for abbreviation, expansion in abbreviations.items():
        sequence_position = sequence_position.replace(abbreviation, expansion)
    return sequence_position


def format_for_transvar(row, genome):

# Transvar uses only gene_ids, while the pipeline uses a mix to handle multi-transcripts
Expand Down Expand Up @@ -42,6 +55,7 @@ def get_transvar_coordinates(row, db, genome, exclude_transcripts):
# print(row['systematic_id'], '<<<>>>', row['transvar_input'])
qc_id = process_systematic_id(row['systematic_id'], genome, 'first')
transcript_id = None if (qc_id == row['systematic_id']) else qc_id
print(row['transvar_input'])
try:
transvar_annotation_list = parse_transvar_string(get_transvar_str_annotation('panno', row['transvar_input'], db))
return get_transvar_annotation_coordinates(transvar_annotation_list, row['systematic_id'], transcript_id)
Expand Down Expand Up @@ -69,6 +83,9 @@ def main(genome_file, protein_modification_results_file, exclude_transcripts_fil
data['exploded_sequence_position'] = data['sequence_position']
data.loc[data['change_sequence_position_to'] != '', 'exploded_sequence_position'] = data['change_sequence_position_to']

# Expand CTD abbreviations
data['exploded_sequence_position'] = data['sequence_position'].apply(expand_CTD_abbreviations)

# Explode the sequence_position and the rules_applied
data_exploded = data[['systematic_id', 'sequence_position', 'exploded_sequence_position']].copy()
data_exploded.drop_duplicates(inplace=True)
Expand Down
10 changes: 0 additions & 10 deletions results/protein_modification_cannot_fix_other_errors.tsv
Original file line number Diff line number Diff line change
@@ -1,11 +1 @@
systematic_id primary_name modification evidence sequence_position annotation_extension reference taxon date error
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S2 added_by(PomBase:SPBC32H8.10) PMID:19328067 4896 2009-07-12 pattern_error
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S2 added_by(PomBase:SPAC2F3.15) PMID:19328067 4896 2009-07-12 pattern_error
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S2 added_by(PomBase:SPAC2F3.15) PMID:19328067 4896 2024-03-05 pattern_error
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S2 added_by(PomBase:SPBC32H8.10) PMID:19328067 4896 2024-03-05 pattern_error
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S5 added_by(PomBase:SPBC19F8.07) PMID:19328067 4896 2009-07-12 pattern_error
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S5 added_by(PomBase:SPBC32H8.10) PMID:19328067 4896 2009-07-12 pattern_error
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S5 added_by(PomBase:SPAC24B11.06c) PMID:33410907 4896 2021-01-12 pattern_error
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S5 removed_by(PomBase:SPAC3G9.04) PMID:33410907 4896 2021-01-22 pattern_error
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S5 added_by(PomBase:SPBC19F8.07) PMID:19328067 4896 2024-03-05 pattern_error
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S5 added_by(PomBase:SPBC32H8.10) PMID:19328067 4896 2024-03-05 pattern_error
4 changes: 2 additions & 2 deletions results/protein_modification_cannot_fix_sequence_errors.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -163,9 +163,9 @@ SPCC1739.11c cdc11 MOD:00046 Inferred from Direct Assay S12 PMID:22419817 4896
SPCC1795.06 map2 MOD:01641 Inferred from Direct Assay L23 removed_by(PomBase:SPAC1296.03c) PMID:8654750 4896 2011-10-10 L23
SPCC1795.06 map2 MOD:01641 Inferred from Direct Assay L23 removed_by(PomBase:SPAC1296.03c) PMID:8654750 4896 2011-10-10 L23
SPCC1795.06 map2 MOD:01641 Inferred from Direct Assay L23 removed_by(PomBase:SPAC1296.03c) PMID:8654750 4896 2011-10-10 L23
SPCC1795.06 map2 MOD:01641 Inferred from Direct Assay L23 removed_by(PomBase:SPAC1296.03c) PMID:9191272 4896 2011-10-10 L23
SPCC1795.06 map2 MOD:01641 Inferred from Direct Assay L23 removed_by(PomBase:SPAC1296.03c) PMID:8736868 4896 2011-10-10 L23
SPCC1795.06 map2 MOD:01641 Inferred from Direct Assay L23 removed_by(PomBase:SPAC1296.03c) PMID:8861202 4896 2011-10-19 L23
SPCC1795.06 map2 MOD:01641 Inferred from Direct Assay L23 removed_by(PomBase:SPAC1296.03c) PMID:8736868 4896 2011-10-10 L23
SPCC1795.06 map2 MOD:01641 Inferred from Direct Assay L23 removed_by(PomBase:SPAC1296.03c) PMID:9191272 4896 2011-10-10 L23
SPCC18B5.11c cds1 MOD:00696 Inferred from Direct Assay T121 PMID:24663817 4896 2020-03-09 T121
SPCC18B5.11c cds1 MOD:00047 Inferred from Direct Assay T334 added_by(PomBase:SPCC18B5.11c) PMID:19357077 4896 2014-02-04 T334
SPCC18B5.11c cds1 MOD:00047 Inferred from Direct Assay T379 added_by(PomBase:SPCC18B5.11c) PMID:19357077 4896 2014-02-04 T379
Expand Down
11 changes: 11 additions & 0 deletions rpb_only.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
systematic_id primary_name modification evidence sequence_position annotation_extension reference taxon date sequence_error change_sequence_position_to
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S2 added_by(PomBase:SPBC32H8.10) PMID:19328067 4896 2009-07-12
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S2 added_by(PomBase:SPAC2F3.15) PMID:19328067 4896 2009-07-12
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S2 added_by(PomBase:SPAC2F3.15) PMID:19328067 4896 2024-03-05
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S2 added_by(PomBase:SPBC32H8.10) PMID:19328067 4896 2024-03-05
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S5 added_by(PomBase:SPBC19F8.07) PMID:19328067 4896 2009-07-12
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S5 added_by(PomBase:SPBC32H8.10) PMID:19328067 4896 2009-07-12
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S5 removed_by(PomBase:SPAC3G9.04) PMID:33410907 4896 2021-01-22
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S5 added_by(PomBase:SPAC24B11.06c) PMID:33410907 4896 2021-01-12
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S5 added_by(PomBase:SPBC19F8.07) PMID:19328067 4896 2024-03-05
SPBC28F2.12 rpb1 MOD:00046 Inferred from Direct Assay CTD_S5 added_by(PomBase:SPBC32H8.10) PMID:19328067 4896 2024-03-05

0 comments on commit 9ea7cea

Please sign in to comment.