Skip to content

Commit

Permalink
count fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
manulera committed Nov 22, 2024
1 parent afc5521 commit 2b274ff
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 0 deletions.
39 changes: 39 additions & 0 deletions change_log/count_changes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import pandas
import glob

def _non_empty(df, column):
    """Return a boolean Series marking rows where ``column`` is non-empty.

    When ``column`` is absent from ``df`` every row is marked ``False``, so
    that all files contribute the same set of flag columns.
    """
    if column in df.columns:
        return df[column] != ""
    return pandas.Series(False, index=df.index, dtype=bool)


def collect_allele_fixes(files):
    """Gather the unique allele fixes listed in the given TSV files.

    Each file must contain the columns ``allele_name`` and
    ``allele_description``; ``change_description_to`` and ``change_name_to``
    are optional.

    Args:
        files: iterable of paths to tab-separated files.

    Returns:
        A DataFrame with the columns ``allele_name``, ``allele_description``,
        ``description_changed`` and ``name_changed``, with duplicated rows
        removed. Rows are de-duplicated on all four columns, so an allele
        listed with different flags is kept more than once: the counts
        derived from this table are upper bounds.
    """
    columns = [
        "allele_name",
        "allele_description",
        "description_changed",
        "name_changed",
    ]
    frames = []
    for file in files:
        # Printed before reading so a failing file is easy to identify.
        print(file)
        # dtype=str keeps every column textual, so identical values read from
        # different files always compare equal when dropping duplicates.
        df = pandas.read_csv(file, sep="\t", na_filter=False, dtype=str)
        # assign() builds a new frame instead of writing into a column
        # subset of df, and both flags are always present (False when the
        # corresponding column is missing from the file).
        frames.append(
            df[["allele_name", "allele_description"]].assign(
                description_changed=_non_empty(df, "change_description_to"),
                name_changed=_non_empty(df, "change_name_to"),
            )
        )

    if not frames:
        return pandas.DataFrame(columns=columns)
    # Concatenate once rather than growing a frame inside the loop.
    return pandas.concat(frames).drop_duplicates()


if __name__ == "__main__":
    files = glob.glob("allele_*.tsv") + [
        "description_in_db_doesnt_match_03082023.tsv",
        "manual_cannot_fix_new_03082023.tsv",
        "unknowns_with_correct_descriptions_10082023.tsv",
    ]
    data = collect_allele_fixes(files)

    print("> alleles fixed (at most)", len(data))
    print("> alleles with description changed (at most)", data["description_changed"].sum())
    print("> alleles with name changed (at most)", data["name_changed"].sum())
    print()
def collect_modification_fixes(files):
    """Gather the unique protein modification fixes listed in the TSV files.

    Each file must contain the columns ``systematic_id`` and
    ``sequence_position``; any other column is ignored.

    Args:
        files: iterable of paths to tab-separated files.

    Returns:
        A DataFrame with the columns ``systematic_id`` and
        ``sequence_position``, holding one row per distinct pair found across
        all files (empty when ``files`` is empty).
    """
    columns = ["systematic_id", "sequence_position"]
    # dtype=str keeps both key columns textual, so the same pair read from
    # different files always compares equal when dropping duplicates.
    frames = [
        pandas.read_csv(file, sep="\t", na_filter=False, dtype=str)[columns]
        for file in files
    ]
    if not frames:
        return pandas.DataFrame(columns=columns)
    # Concatenate once rather than growing a frame inside the loop.
    return pandas.concat(frames).drop_duplicates()


if __name__ == "__main__":
    files = glob.glob("protein_*.tsv")
    data = collect_modification_fixes(files)

    print("> modifications fixed (at most)", len(data))
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
systematic_id primary_name modification evidence sequence_position annotation_extension reference taxon date sequence_error change_sequence_position_to auto_fix_comment
SPAC3H1.05 ste24 MOD:00046 tryptic phosphopeptide mapping assay evidence used in automatic assertion S12 PMID:30726745 4896 2019-02-05 S12 ? old_coords_fix, revision 8148: join(1938900..1939004,1939075..1940394)
SPAC57A7.12 ssz1 MOD:00046 experimental evidence S12 present_during(GO:0000087) PMID:21712547 4896 2011-06-28 S12 ? old_coords_fix, revision 8148: complement(join(1515089..1516663,1516789..1516914))
SPBC2F12.10 mrpl35 MOD:00047 tryptic phosphopeptide mapping assay evidence used in automatic assertion T13 PMID:33823663 4896 2022-03-08 T13 ? old_coords_fix, revision 20110324: complement(1719551..1720663)
Expand Down

0 comments on commit 2b274ff

Please sign in to comment.