import pandas as pd
import numpy as np
import re
+ import time
import SynTrackerVis_app.config as config
@@ -23,7 +24,7 @@ def return_selected_genome_avg_table(avg_big_df, selected_genome):
def calculate_avg_scores_selected_genome_size(score_per_region_selected_genome_df, genome, size):

    # Taking all available regions - no subsampling
-     if size == 'All_regions':
+     if size == 'All':
        avg_scores_one_size_df = score_per_region_selected_genome_df.groupby(['Sample1', 'Sample2'])['Synteny_score'].\
            mean().reset_index().rename(columns={"Synteny_score": "APSS"})
@@ -52,7 +53,18 @@ def calculate_avg_scores_selected_genome_size(score_per_region_selected_genome_df
    return avg_scores_one_size_df

+
+ def count_samples_num(row, df):
+     condition_df = df[df[row['Subsampled_regions']] > 0]
+
+     unique_samples = pd.concat([condition_df['Sample1'], condition_df['Sample2']]).unique()
+     samples_num = len(unique_samples)
+
+     return samples_num
+
+
def create_pairs_num_per_sampling_size(score_per_region_df):
+     print("\ncreate_pairs_num_per_sampling_size:")

    regions_num_per_pair_df = score_per_region_df[['Sample1', 'Sample2', 'Synteny_score']].\
        groupby(['Sample1', 'Sample2']).count().reset_index(). \
@@ -62,7 +74,7 @@ def create_pairs_num_per_sampling_size(score_per_region_df):
    # Add a column for each subsampling size (to calculate how many pairs have results for at least this size)
    for size in config.sampling_sizes:
-         if size == 'All_regions':
+         if size == 'All':
            regions_num_per_pair_df[size] = np.where(regions_num_per_pair_df['Num_of_compared_regions'] >= 1,
                                                     1, 0)
        else:
@@ -71,7 +83,7 @@ def create_pairs_num_per_sampling_size(score_per_region_df):
    #print(regions_num_per_pair_df)

-     pairs_num_per_sampling_size_df = regions_num_per_pair_df[['All_regions', '40', '60', '80', '100',
+     pairs_num_per_sampling_size_df = regions_num_per_pair_df[['All', '40', '60', '80', '100',
                                                               '125', '150', '175', '200', '250', '300', '350',
                                                               '400']].sum().reset_index()
    pairs_num_per_sampling_size_df.columns.values[0] = "Subsampled_regions"
@@ -82,51 +94,75 @@ def create_pairs_num_per_sampling_size(score_per_region_df):
    pairs_num_per_sampling_size_df['Pairs_lost_percent'] = \
        pairs_num_per_sampling_size_df['Pairs_lost_percent'].apply(lambda x: round(x, 2))

-     #print(pairs_num_per_sampling_size_df)
-     return pairs_num_per_sampling_size_df
-
-
- def create_score_per_region_sorted_contigs_table(score_per_region_df):
-     contigs_dict = dict()
-     contig_length_dict = dict()
+     # Calculate the number of samples in each sampling size
+     pairs_num_per_sampling_size_df['Number_of_samples'] = \
+         pairs_num_per_sampling_size_df.apply(lambda row: count_samples_num(row, regions_num_per_pair_df), axis=1)
-     # Split the 'Region' column into
-     score_per_region_df[['Contig_name', 'Position']] = score_per_region_df['Region'].str.extract(r'(\S+)_(\d+)_\d+')
-     score_per_region_df['Position'] = score_per_region_df['Position'].astype(int)
+     print(pairs_num_per_sampling_size_df)

-     # If the contig names contain numbers, sort them numerically
-     if re.search(r"^\S+_\d+$", score_per_region_df.iloc[0]['Contig_name']):
+     return pairs_num_per_sampling_size_df
-         # Create a temporary column 'contigs_sort' to sort the contig names numericlly
-         score_per_region_df['Contig_number'] = score_per_region_df['Contig_name'].str.extract(r'\S+_(\d+)')\
-             .astype(int)

-         contigs_list_by_name = list(score_per_region_df.sort_values('Contig_number').groupby(['Contig_name'],
-                                                                                              sort=False).groups)
+ def return_sorted_contigs_lists(score_per_region_df):
+     '''
+     contigs_dict = {}

-     else:
-         contigs_list_by_name = list(score_per_region_df.groupby(['Contig_name']).groups)
+     region_list = list(score_per_region_df['Region'])
+     for region in region_list:
+         regex = r'(\S+)_(\d+)_\d+'
+         m = re.search(regex, region)
+         if m:
+             contig_name = m.group(1)
+             pos = m.group(2)
+             contigs_dict[contig_name] = int(pos)

-     #print("\ncreate_score_per_region_sorted_contigs_table:")
-     #print(score_per_region_df)
-     #print("\nContigs list sorted by name:")
-     #print(contigs_list_by_name)
+     # Sort the contigs dict by name
+     contigs_list_by_name = sorted(contigs_dict)

-     # Create a dictionary for the contigs, sorted by their names.
-     for contig in contigs_list_by_name:
+     # Sort the contigs dict by length
+     contigs_list_by_length = sorted(contigs_dict, key=lambda k: contigs_dict[k], reverse=True)
+     '''
-         score_per_region_contig_df = score_per_region_df[score_per_region_df['Contig_name'] == contig]
-         contigs_dict[contig] = score_per_region_contig_df[['Contig_name', 'Position', 'Synteny_score']]
+     before = time.time()
+     # Split the 'Region' column into Contig_name and Position
+     pattern = re.compile(r'(\S+)_(\d+)_\d+')
+     score_per_region_df[['Contig_name', 'Position']] = score_per_region_df['Region'].str.extract(pattern)
+     #score_per_region_df[['Contig_name', 'Position']] = score_per_region_df['Region'].str.extract(r'(\S+)_(\d+)_\d+')
+     score_per_region_df['Position'] = score_per_region_df['Position'].astype(int)

-         # Find contig length by the last position
-         score_per_region_contig_df = score_per_region_contig_df.sort_values('Position')
-         contig_length = score_per_region_contig_df.iloc[-1]['Position'] + config.region_length
-         contig_length_dict[contig] = contig_length
+     after = time.time()
+     duration = after - before
+     print("Extract position from region took " + str(duration) + " seconds.\n")

-     # Sort the contig lengths dict by the lengths and return a sorted list of names
-     #sorted_dict = dict(sorted(contig_length_dict.items(), key=lambda item: item[1], reverse=True))
-     sorted_by_length_list = sorted(contig_length_dict, key=contig_length_dict.get, reverse=True)
+     # Get a list of contigs, sorted by name
+     # If the contig names contain numbers, sort them numerically
+     #if re.search(r"^\S+_\d+$", score_per_region_df.iloc[0]['Contig_name']):

-     return contigs_dict, contigs_list_by_name, sorted_by_length_list
+     # Create a temporary column 'contigs_sort' to sort the contig names numericlly
+     # score_per_region_df['Contig_number'] = score_per_region_df['Contig_name'].str.extract(r'\S+_(\d+)')\
+     #     .astype(int)
+
+     # contigs_list_by_name = list(score_per_region_df.sort_values('Contig_number').groupby(['Contig_name'],
+     #                                                             sort=False).groups)
+
+     #else:
+     #    contigs_list_by_name = list(score_per_region_df.groupby(['Contig_name']).groups)
+
+     before = time.time()
+     contigs_list_by_name = list(score_per_region_df.groupby(['Contig_name']).groups)
+     after = time.time()
+     duration = after - before
+     print("Sort by name took " + str(duration) + " seconds.\n")
+
+     # Get a list of contigs, sorted by length
+     before = time.time()
+     contigs_list_by_length = list(score_per_region_df.sort_values('Position', ascending=False).groupby(['Contig_name'],
+                                                                                                        sort=False).
+                                   groups)
+     after = time.time()
+     duration = after - before
+     print("Sort by length took " + str(duration) + " seconds.\n")
+
+     return contigs_list_by_name, contigs_list_by_length
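For readers following the new pair/sample bookkeeping outside the app, here is a minimal, self-contained sketch of the same idea on a toy table. The toy DataFrame, the local `sampling_sizes` list, and the `count_samples_num_for_size` helper are illustrative stand-ins for the real score-per-region table, `config.sampling_sizes`, and the patch's `count_samples_num`; it is not the repository code itself.

```python
import numpy as np
import pandas as pd

# Toy score-per-region table (illustrative data only)
score_per_region_df = pd.DataFrame({
    'Sample1': ['A', 'A', 'A', 'B'],
    'Sample2': ['B', 'B', 'C', 'C'],
    'Synteny_score': [0.9, 0.8, 0.7, 0.95],
})

sampling_sizes = ['All', '2']  # stand-in for config.sampling_sizes

# Count how many regions were compared for each sample pair
regions_num_per_pair_df = score_per_region_df.groupby(['Sample1', 'Sample2'])['Synteny_score'] \
    .count().reset_index().rename(columns={"Synteny_score": "Num_of_compared_regions"})

# Flag, per subsampling size, the pairs that have at least that many compared regions
for size in sampling_sizes:
    threshold = 1 if size == 'All' else int(size)
    regions_num_per_pair_df[size] = np.where(
        regions_num_per_pair_df['Num_of_compared_regions'] >= threshold, 1, 0)

def count_samples_num_for_size(size, df):
    # Unique samples that still appear in at least one retained pair for this size
    kept = df[df[size] > 0]
    return pd.concat([kept['Sample1'], kept['Sample2']]).nunique()

print(regions_num_per_pair_df)
print(count_samples_num_for_size('2', regions_num_per_pair_df))  # -> 2 (samples A and B)
```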
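Similarly, a small sketch of the Region-string parsing and contig ordering that `return_sorted_contigs_lists` now relies on. The regex is the one used in the patch; the region names and expected ordering are made up for illustration.

```python
import re
import pandas as pd

# Toy table with Region strings of the form <contig>_<start>_<end>
score_per_region_df = pd.DataFrame({
    'Region': ['contig_1_0_5000', 'contig_1_5000_10000', 'contig_2_0_5000'],
    'Synteny_score': [0.91, 0.88, 0.79],
})

# Split 'Region' into the contig name and the region's start position
pattern = re.compile(r'(\S+)_(\d+)_\d+')
score_per_region_df[['Contig_name', 'Position']] = score_per_region_df['Region'].str.extract(pattern)
score_per_region_df['Position'] = score_per_region_df['Position'].astype(int)

# Contig names in alphabetical order
contigs_list_by_name = list(score_per_region_df.groupby(['Contig_name']).groups)

# Contig names ordered by their largest observed position (a proxy for contig length)
contigs_list_by_length = list(
    score_per_region_df.sort_values('Position', ascending=False)
                       .groupby(['Contig_name'], sort=False).groups)

print(contigs_list_by_name)
print(contigs_list_by_length)
```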