-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSimpleStatisticalModel.py
197 lines (135 loc) · 7.6 KB
/
SimpleStatisticalModel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import pandas as pd
import math
from auxFonctions import AminoAcid
import numpy as np
# Read data from a file into a list of entries
with open('data/SIG_13.red', 'r') as file:
entries = file.read().split('\n ')
# Define a function to process each entry in the data file
def process_entry(entry):
lines = entry.split('\n')
protein_id, primary_structure, annotation = lines
return {
'Protein ID': protein_id.split()[1],
'Primary Structure': primary_structure,
'Annotation': annotation
}
# Process each entry
processed_entries = [process_entry(entry) for entry in entries]
# Create a DataFrame
df = pd.DataFrame(processed_entries)
# Now you can analyze the DataFrame as needed
# Get the position of the cleavage site
cleavage_site_position = df['Annotation'].apply(lambda x: x.find('C'))
#print("Position of the cleavage site:")
#print(cleavage_site_position)
print("Average position of the cleavage site:")
print(cleavage_site_position.mean())
print("\n")
print("The extremum position of the cleavage site:")
print(cleavage_site_position.min())
print(cleavage_site_position.max())
print("The distance from cleavage site to the C-terminal part of the amino-sequence:")
#there is always 30 amino-acids after the cleavage site
print("\n")\
# with have then p = [13, 1] and q = [1, 30]
# Split the primary structure into a list of amino acids
amino_acid_seq = df['Primary Structure'].apply(lambda x: list(x))
amino_acid_seqBis = df['Primary Structure'].apply(lambda x: list(x))
# for each amino acid in the sequence, replace it with the corresponding AminoAcid object
amino_acid_seq = amino_acid_seq.apply(lambda x: [AminoAcid(aa) for aa in x])
#Parametres de l'étude
p_opt, q_opt = 1, 1
fn_min = 1408
fp_min = 1408
for p in range(2, 14):
for q in range(1, 10):
# print("p = ", p, " q = ", q)
# print("\n")
# Create a DataFrame to store, for each primary structure, the neihborhood of the cleavage site
# The neighborhood is defined as the word of length p+q starting p letters before the cleavage site
correct_neighborhood = pd.Series()
for i, seq in amino_acid_seqBis.items():
correct_neighborhood[i] = ''.join(seq[cleavage_site_position[i]-p:cleavage_site_position[i]+q])
incorrect_neighborhood = pd.Series()
decalage = [3,4,5,-3,-4, -5]
for i, seq in amino_acid_seqBis.items():
dec = np.random.choice(decalage)
dec = 0 if cleavage_site_position[i]-13 - dec < 0 else dec
incorrect_neighborhood[i] = ''.join(seq[cleavage_site_position[i]-p-dec:cleavage_site_position[i]+q-dec])
# Create a DataFrame to store the counts of each amino acid at every position relative to the cleavage site
#the cleavage site is between to aminoacids, so cleavage_site_position is the position of the first amino acid after the cleavge site
#So i need to create a dataframe with columns from -p to q without 0
amino_acid_counts = pd.DataFrame(0, index=AminoAcid.properties.keys(), columns=range(-p, q))
amino_acid_freqs = pd.DataFrame(0.0, index=AminoAcid.properties.keys(), columns=range(-p, q))
amino_acid_pseudo_counts = pd.DataFrame(0, index=AminoAcid.properties.keys(), columns=range(-p, q))
amino_acid_s_values = pd.DataFrame(0.0, index=AminoAcid.properties.keys(), columns=range(-p, q))
# Count the occurrences of each amino acid at every position relative to the cleavage site
for i, seq in amino_acid_seq.items():
for j, aa in enumerate(seq):
position = j - cleavage_site_position[i] #position of the amino acid relative to the cleavage site
if position in amino_acid_counts.columns:
amino_acid_counts.loc[aa.code, position] += 1
# Add pseudo-counts to avoid zero counts here pseudocount parameter is 1/len(df)
amino_acid_pseudo_counts = amino_acid_counts + 1
# Print the results
#print("Occurrences of each amino acid at every position relative to the cleavage site:")
#print(amino_acid_pseudo_counts)
# Compute the observed frequency of each amino acid at the relative position (using pseudo-counts)
for i in amino_acid_counts.index:
for j in amino_acid_counts.columns:
amino_acid_freqs.loc[i, j] = amino_acid_pseudo_counts.loc[i, j] / len(df)
# Compute the general background frequency of each amino acid
general_background_frequency = amino_acid_freqs.mean(axis=1)
# Compute the s value of each amino acid at every position
for i in amino_acid_counts.index:
for j in amino_acid_counts.columns:
amino_acid_s_values.loc[i, j] = math.log(amino_acid_freqs.loc[i, j]) - math.log(general_background_frequency[i])
# Define the function computing the q-1 score for a given word
def q_minus_1_score(word):
return sum([amino_acid_s_values.loc[aa, i-p] for i, aa in enumerate(word)])
# To obtain the score of the correct neighborhoods, we apply the q-1 score function to each neighborhood
#correct_neighborhood = correct_neighborhood.apply(lambda x: [AminoAcid(aa) for aa in x])
correct_neigboorhood_score = correct_neighborhood.apply(q_minus_1_score)
incorrect_neighborhood_score = incorrect_neighborhood.apply(q_minus_1_score)
correct_mean = correct_neigboorhood_score.mean()
std_dev = correct_neigboorhood_score.std()
incorrect_mean = incorrect_neighborhood_score.mean()
std_dev_incorrect = incorrect_neighborhood_score.std()
threshold_opti = (correct_neigboorhood_score.mean() - std_dev/math.sqrt(0.9) + incorrect_neighborhood_score.mean() + std_dev_incorrect/math.sqrt(0.9))/2
false_negatives = correct_neigboorhood_score[correct_neigboorhood_score < threshold_opti].count()
false_positives = incorrect_neighborhood_score[incorrect_neighborhood_score > threshold_opti].count()
if false_negatives + false_positives < fn_min + fp_min:
fn_min = false_negatives
fp_min = false_positives
p_opt = p
q_opt = q
print("Optimal values of p and q for the deviation:")
print("p = ", p_opt, " q = ", q_opt)
print("\n")
print("Minimal amount of false_negatives:")
print(fn_min)
print("Minimal amount of false_positives:")
print(fp_min)
#Here I'm saving my results in the result.txt file
def save_results():
with open('results.txt', 'a') as file:
file.write("### RESULTS :#####\n\n")
file.write("Parameters of the model:\n")
file.write("Data obtained from the EUKSIG_13.red file\n\n")
file.write("p = 13 q = 2\n\n")
file.write("Average position of the cleavage site:\n")
file.write(str(cleavage_site_position.mean()) + "\n\n")
file.write("Occurrences of each amino acid at every position relative to the cleavage site:\n")
amino_acid_counts_str = amino_acid_counts.to_string()
file.write(amino_acid_counts_str + "\n\n")
file.write("Observed frequency of each amino acid at the relative position:\n")
amino_acid_freqs_str = amino_acid_freqs.to_string()
file.write(amino_acid_freqs_str + "\n\n")
file.write("General background frequency of each amino acid:\n")
file.write(str(general_background_frequency) + "\n\n")
file.write("s value of each amino acid at every position:\n")
amino_acid_s_values_str = amino_acid_s_values.to_string()
file.write(amino_acid_s_values_str + "\n\n")
file.write("END OF RESULTS\n\n\n")
#save_results()