-
Notifications
You must be signed in to change notification settings - Fork 0
/
03_full_length_different_probabilities_reduced_AA.py
66 lines (47 loc) · 2.1 KB
/
03_full_length_different_probabilities_reduced_AA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import os
import shutil
import pandas as pd
import numpy as np
from multiprocessing import Pool, cpu_count
from functools import partial
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("paper", font_scale=2.0)
sns.set_style('whitegrid')
from functions import *
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)
with SequenceFile("data/uniprot-9606.fasta", digital=True, alphabet=alphabet) as seq_file:
sequences = list(seq_file)
uniprot = fasta_reader('data/uniprot-9606.fasta')
# final
aa_knowns = ['L', 'LS', 'C', 'K', 'Y', 'CK', 'KY', 'YC', 'LSE', 'CKY', \
'W', 'WM', 'WMC', 'LSEA', 'WMCH', 'WMCHY', 'LSEAG', ]
prob_range = [0.9, 0.8, 0.6, 0.4, 0.2, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05]
for aa_known in aa_knowns:
for prob in prob_range:
# output file name
fname = f'full_length_{aa_known}_max_prob_{prob}_del.pkl.gz'
if not os.path.isfile('results/scan_results/'+fname):
temp_df = uniprot.copy()
temp_df['temp'] = temp_df.values.tolist()
temp_df['posteriors'] = temp_df.temp.parallel_apply(\
lambda x: generate_reads(seq=x[1], aa_known=aa_known, max_prob=prob))
temp_df['temp1'] = temp_df[['Accession', 'posteriors']].values.tolist()
temp_df['hmm'] = temp_df.temp1.parallel_apply(\
lambda x: hmm_build(x[1], x[0], f'full_length_{x[0]}_{aa_known}_max_prob_{prob}'))
hmms = np.array_split(temp_df.hmm, 10)
hmms = [i.tolist() for i in hmms]
results = []
for i, v in enumerate(hmms):
results.append(score(v, sequences, background))
# print(f'\t\t\t\tdone: {i}', end='\r')
fname = f'full_length_{aa_known}_max_prob_{prob}_del.pkl.gz'
res_df = pd.concat(results)
res_df.to_pickle('results/scan_results/' + fname)
del temp_df, res_df, hmms
shutil.rmtree('temp/', ignore_errors=True)
try:
os.makedirs('temp/')
except FileExistsError:
pass