From 2b274ff390cc1b8a57df1dc3c4ce61bde55b6ed0 Mon Sep 17 00:00:00 2001
From: Manuel Lera-Ramirez
Date: Fri, 22 Nov 2024 15:26:28 +0000
Subject: [PATCH] count fixes

---
 change_log/count_changes.py                   | 89 +++++++++++++++++++
 ...in_new_gene_structure_issue62_14072023.tsv |  1 +
 2 files changed, 90 insertions(+)
 create mode 100644 change_log/count_changes.py

diff --git a/change_log/count_changes.py b/change_log/count_changes.py
new file mode 100644
index 0000000..ae31102
--- /dev/null
+++ b/change_log/count_changes.py
@@ -0,0 +1,89 @@
+"""Count the alleles and protein modifications touched by the change-log TSVs.
+
+Input files are resolved relative to the current working directory, so run the
+script from the directory that holds the TSVs. The printed figures are labelled
+"at most": de-duplication only removes rows that agree on every compared
+column, so the same allele can still be counted more than once.
+"""
+import glob
+
+import pandas
+
+# Allele change logs whose file names do not match the allele_*.tsv pattern.
+EXTRA_ALLELE_FILES = [
+    "description_in_db_doesnt_match_03082023.tsv",
+    "manual_cannot_fix_new_03082023.tsv",
+    "unknowns_with_correct_descriptions_10082023.tsv",
+]
+ALLELE_KEYS = ["allele_name", "allele_description"]
+MODIFICATION_KEYS = ["systematic_id", "sequence_position"]
+
+
+def read_change_log(path):
+    """Read one change-log TSV, keeping empty cells as "" rather than NaN."""
+    return pandas.read_csv(path, sep="\t", na_filter=False)
+
+
+def changed_flags(df, column):
+    """Return a boolean Series, True where `column` holds a replacement value.
+
+    A file without `column` records no change of that kind, so every row is
+    False. Always returning real booleans keeps NaN out of the combined table,
+    where it would make otherwise identical rows look distinct.
+    """
+    if column in df.columns:
+        return df[column] != ""
+    return pandas.Series(False, index=df.index)
+
+
+def unique_rows(frames, columns):
+    """Concatenate `frames` in one call and drop duplicate rows.
+
+    Returns an empty frame with `columns` when `frames` is empty, because
+    pandas.concat raises ValueError on an empty list.
+    """
+    if not frames:
+        return pandas.DataFrame(columns=columns)
+    return pandas.concat(frames, ignore_index=True).drop_duplicates()
+
+
+def count_allele_changes(files):
+    """Return (alleles, description changes, name changes) found in `files`."""
+    frames = []
+    for file in files:
+        print(file)
+        df = read_change_log(file)
+        # assign() returns a new frame, so the flags are never written into a
+        # slice of df (which pandas flags with SettingWithCopyWarning).
+        frames.append(
+            df[ALLELE_KEYS].assign(
+                description_changed=changed_flags(df, "change_description_to"),
+                name_changed=changed_flags(df, "change_name_to"),
+            )
+        )
+    flag_columns = ["description_changed", "name_changed"]
+    data = unique_rows(frames, ALLELE_KEYS + flag_columns)
+    return len(data), data["description_changed"].sum(), data["name_changed"].sum()
+
+
+def count_modifications(files):
+    """Return the number of distinct (systematic_id, sequence_position) pairs."""
+    frames = [read_change_log(file)[MODIFICATION_KEYS] for file in files]
+    return len(unique_rows(frames, MODIFICATION_KEYS))
+
+
+def main():
+    """Print the allele and protein-modification counts."""
+    # sorted() keeps the printed file order independent of directory order.
+    allele_files = sorted(glob.glob("allele_*.tsv")) + EXTRA_ALLELE_FILES
+    alleles, descriptions, names = count_allele_changes(allele_files)
+    print("> alleles fixed (at most)", alleles)
+    print("> alleles with description changed (at most)", descriptions)
+    print("> alleles with name changed (at most)", names)
+    print()
+    modification_files = sorted(glob.glob("protein_*.tsv"))
+    print("> modifications fixed (at most)", count_modifications(modification_files))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/change_log/protein_modification_residues_not_in_new_gene_structure_issue62_14072023.tsv 
b/change_log/protein_modification_residues_not_in_new_gene_structure_issue62_14072023.tsv index e3f5cdf..db746f3 100644 --- a/change_log/protein_modification_residues_not_in_new_gene_structure_issue62_14072023.tsv +++ b/change_log/protein_modification_residues_not_in_new_gene_structure_issue62_14072023.tsv @@ -1,3 +1,4 @@ +systematic_id primary_name modification evidence sequence_position annotation_extension reference taxon date sequence_error change_sequence_position_to auto_fix_comment SPAC3H1.05 ste24 MOD:00046 tryptic phosphopeptide mapping assay evidence used in automatic assertion S12 PMID:30726745 4896 2019-02-05 S12 ? old_coords_fix, revision 8148: join(1938900..1939004,1939075..1940394) SPAC57A7.12 ssz1 MOD:00046 experimental evidence S12 present_during(GO:0000087) PMID:21712547 4896 2011-06-28 S12 ? old_coords_fix, revision 8148: complement(join(1515089..1516663,1516789..1516914)) SPBC2F12.10 mrpl35 MOD:00047 tryptic phosphopeptide mapping assay evidence used in automatic assertion T13 PMID:33823663 4896 2022-03-08 T13 ? old_coords_fix, revision 20110324: complement(1719551..1720663)