Skip to content

Commit 2aeb8cb

Browse files
committed
Version 0.4.0-alpha:
- Added the possibility to download the data tables for each plot.
- Improved the network and changed the save-plot-as-image option, so that the plot will be saved using Matplotlib in higher resolution and in several formats.
- Run the calculations required for the coverage plot in a different thread in the background, in order to present the initial plot display faster.
1 parent 70e166e commit 2aeb8cb

File tree

7 files changed

+1391
-507
lines changed

7 files changed

+1391
-507
lines changed

SynTrackerVis_app/config.py

Lines changed: 35 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
import bokeh.palettes as bp
2+
import colorcet as cc
23

3-
sampling_sizes = ['All_regions', '40', '60', '80', '100', '125', '150', '175', '200', '250', '300', '350', '400']
4+
col_set = ['Ref_genome', 'Sample1', 'Sample2', 'Region', 'Synteny_score']
5+
sampling_sizes = ['All', '40', '60', '80', '100', '125', '150', '175', '200', '250', '300', '350', '400']
46
sampling_sizes_wo_all = ['40', '60', '80', '100', '125', '150', '175', '200', '250', '300', '350', '400']
7+
genomes_sorting_options = ['Number of compared pairs', 'Genome name']
8+
contig_sorting_options = ['Contig length', 'Contig name']
9+
catplot_types = ['Scatter (jitter) plot', 'Boxplot']
510
min_pairs_for_all_regions = 100
611
max_clustermap_cols = 120
12+
max_network_nodes = 300
713
network_iterations_options = ['50', '100', '150', '200', '250', '300', '350', '400', '450', '500']
814
network_thresholds_options = ['Mean APSS', 'Mean APSS+1 STD', 'Mean APSS+2 STD', 'Define another threshold']
915
APSS_connections_threshold_default = 0.9
@@ -14,15 +20,19 @@
1420

1521
## CSS Styles ##
1622
normal_bar_color = "#B048B5"
17-
highlight_bar_color = "#ba2649"
23+
#highlight_bar_color = "#ba2649"
24+
highlight_bar_color = "#43BFC7"
1825
title_red_color = "#800517"
1926
title_purple_color = "#800080"
2027
title_blue_color = "#002060"
28+
same_color = "#F22C5D"
29+
diff_color = "#47A3E1"
30+
nodes_default_color = 'gray'
31+
2132

2233
main_area_style = {
2334
'width': "1200px",
2435
'padding': "20px",
25-
#'background': "#b0e0e6",
2636
}
2737

2838
single_multi_tabs_style = {
@@ -40,46 +50,48 @@
4050
}
4151

4252
main_column_style = {
43-
#'background': "#f9f9f9",
53+
'background': "#f9f9f9",
4454
'padding': "20px",
45-
#'padding': "0px 20px 10px 20px"
4655
}
4756

4857
plot_card_style = {
4958
'background': "#ffffff",
5059
'width': "1150px",
5160
}
5261

53-
## matplotlib patrameters
62+
secondary_button = {
63+
'background': 'rgba(0, 128, 255, 0.5)',
64+
'color': 'white'
65+
}
66+
67+
# Export file formats
68+
matplotlib_file_formats = ['png', 'pdf', 'svg', 'eps']
69+
bokeh_file_formats = ['png', 'svg']
70+
71+
# Colormaps
5472
clustermap_colormaps_list = ['Blues', 'Purples', 'Greens', 'Oranges', 'Reds', 'Greys',
5573
'OrRd', 'PuRd', 'RdPu', 'BuPu', 'GnBu', 'PuBu', 'BuGn', 'YlGn',
5674
'YlGnBu', 'PuBuGn', 'YlOrRd',
5775
'spring', 'summer', 'autumn', 'winter', 'cool', 'Wistia']
58-
matplotlib_file_formats = ['png', 'pdf', 'svg', 'eps']
5976

60-
## Bokeh patrameters
61-
Bokeh_categorical_colormap_dict = {
62-
'Category20': bp.Category20[20],
63-
'Category10': bp.Category10[10],
77+
categorical_colormap_dict = {
78+
'cet_glasbey': cc.glasbey,
79+
'cet_glasbey_light': cc.glasbey_light,
80+
'cet_glasbey_category10': bp.Category10[10],
6481
'Set1': bp.Set1[9],
65-
'Set3': bp.Set3[12],
66-
'Spectral': bp.Spectral[11],
67-
'Bokeh': bp.Bokeh[8],
82+
'Set3': bp.Set3[12]
6883
}
69-
Bokeh_categorical_colormap_list = [bp.Category10[10], bp.Category20[20], bp.Pastel1[9], bp.Set1[9], bp.Set3[12],
70-
bp.Spectral[11], bp.Bokeh[8], bp.Turbo256]
71-
Bokeh_continuous_colormap_dict = {
72-
'Turbo256': bp.Turbo256,
73-
'Plasma': bp.Plasma256,
74-
'Viridis': bp.Viridis256,
84+
85+
continuous_colormap_dict = {
86+
'cet_rainbow4': cc.m_rainbow4,
87+
'cet_isolum': cc.isolum,
88+
'plasma': bp.Plasma256,
89+
'viridis': bp.Viridis256,
7590
'Blues': bp.Blues256,
7691
'Reds': bp.Reds256,
7792
'Greens': bp.Greens256,
7893
}
79-
bokeh_file_formats = ['png', 'svg']
8094

81-
normal_bar_color = "#B048B5"
82-
highlight_bar_color = "#43BFC7"
8395

8496

8597

SynTrackerVis_app/data_manipulation_multi.py

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ def complete_metadata(score_per_region_df, metadata_df):
3838
#new_row.append("NaN")
3939
metadata_df.loc[len(metadata_df)] = new_row
4040

41-
#print("\nMetadata after filling missing samples:")
42-
#print(metadata_df)
41+
print("\nMetadata after filling missing samples:")
42+
print(metadata_df)
4343

4444
# Create a dictionary to map the samples to feature values from metadata_df
4545
for feature in metadata_features_list:
@@ -63,6 +63,27 @@ def count_species_num(row, df):
6363
return species_num
6464

6565

66+
def create_sorted_by_pairs_genomes_list(score_per_region_all_genomes_df):
67+
regions_num_per_pair_df = score_per_region_all_genomes_df[['Ref_genome', 'Sample1', 'Sample2', 'Synteny_score']]. \
68+
groupby(['Ref_genome', 'Sample1', 'Sample2']).count().reset_index(). \
69+
rename(columns={"Synteny_score": "Num_of_compared_regions"})
70+
71+
regions_num_per_pair_df['40'] = np.where(regions_num_per_pair_df['Num_of_compared_regions'] >= 40, 1, 0)
72+
73+
pairs_num_at_40_regions_df = regions_num_per_pair_df[['Ref_genome', '40']].groupby('Ref_genome').sum().\
74+
sort_values('40', ascending=False).reset_index()
75+
pairs_num_at_40_regions_df.columns.values[1] = "Number_of_pairs"
76+
77+
print("\ncreate_sorted_by_pairs_genomes_list:")
78+
print(pairs_num_at_40_regions_df)
79+
80+
genomes_list_by_pairs_num = list(pairs_num_at_40_regions_df['Ref_genome'])
81+
#print("\nGenomes list sorted by pairs number:")
82+
#print(genomes_list_by_pairs_num)
83+
84+
return genomes_list_by_pairs_num
85+
86+
6687
def create_pairs_num_per_sampling_size(score_per_region_selected_genomes_df):
6788

6889
regions_num_per_pair_df = score_per_region_selected_genomes_df[['Ref_genome', 'Sample1', 'Sample2', 'Synteny_score']].\
@@ -72,15 +93,15 @@ def create_pairs_num_per_sampling_size(score_per_region_selected_genomes_df):
7293

7394
# Add a column for each subsampling size (to calculate how many pairs have results for at least this size)
7495
for size in config.sampling_sizes:
75-
if size == 'All_regions':
96+
if size == 'All':
7697
regions_num_per_pair_df[size] = np.where(regions_num_per_pair_df['Num_of_compared_regions'] >= 1,
7798
1, 0)
7899
else:
79100
regions_num_per_pair_df[size] = np.where(regions_num_per_pair_df['Num_of_compared_regions'] >= int(size),
80101
1, 0)
81102
#print(regions_num_per_pair_df)
82103

83-
pairs_num_per_sampling_size_df = regions_num_per_pair_df[['Ref_genome', 'All_regions', '40', '60', '80', '100',
104+
pairs_num_per_sampling_size_df = regions_num_per_pair_df[['Ref_genome', 'All', '40', '60', '80', '100',
84105
'125', '150', '175', '200', '250', '300', '350',
85106
'400']].groupby('Ref_genome').sum().reset_index()
86107
#print(pairs_num_per_sampling_size_df)
@@ -90,7 +111,7 @@ def create_pairs_num_per_sampling_size(score_per_region_selected_genomes_df):
90111
pairs_num_per_sampling_size_df[size], 0)
91112
#print(pairs_num_per_sampling_size_df)
92113

93-
summary_df = pairs_num_per_sampling_size_df[['All_regions', '40', '60', '80', '100', '125', '150', '175', '200',
114+
summary_df = pairs_num_per_sampling_size_df[['All', '40', '60', '80', '100', '125', '150', '175', '200',
94115
'250', '300', '350', '400']].sum().reset_index()
95116

96117
summary_df.columns.values[0] = "Subsampled_regions"
@@ -109,7 +130,7 @@ def create_pairs_num_per_sampling_size(score_per_region_selected_genomes_df):
109130
def calculate_APSS_all_genomes_sampling_size(score_per_region_df, size):
110131

111132
# Taking all available regions - no subsampling
112-
if size == 'All_regions':
133+
if size == 'All':
113134
avg_scores_one_size_df = score_per_region_df.groupby(['Ref_genome', 'Sample1', 'Sample2'])['Synteny_score'].\
114135
mean().reset_index().rename(columns={"Synteny_score": "APSS"})
115136

@@ -131,18 +152,18 @@ def calculate_APSS_all_genomes_sampling_size(score_per_region_df, size):
131152
if not avg_scores_one_size_df.empty:
132153
avg_scores_one_size_df['Compared_regions'] = size
133154

134-
print("\ncalculate_APSS_all_genomes_sampling_size:")
135-
print(avg_scores_one_size_df)
155+
#print("\ncalculate_APSS_all_genomes_sampling_size:")
156+
#print(avg_scores_one_size_df)
136157

137158
# Filter out species with less than 10 pairs
138159
samples_per_genome_df = avg_scores_one_size_df[['Ref_genome', 'APSS']].groupby('Ref_genome').count().reset_index().\
139160
rename(columns={"APSS": "count"})
140-
print(samples_per_genome_df)
161+
#print(samples_per_genome_df)
141162
merged_df = avg_scores_one_size_df.merge(samples_per_genome_df[['Ref_genome', 'count']], on='Ref_genome',
142163
how='left')
143164
avg_scores_one_size_filtered_df = merged_df[merged_df['count'] >= 10].drop(columns='count')
144-
print("\ncalculate_APSS_all_genomes_sampling_size after genomes filtering:")
145-
print(avg_scores_one_size_filtered_df)
165+
#print("\ncalculate_APSS_all_genomes_sampling_size after genomes filtering:")
166+
#print(avg_scores_one_size_filtered_df)
146167

147168
return avg_scores_one_size_filtered_df
148169

@@ -152,8 +173,8 @@ def return_genomes_subset_APSS_selected_size_table(all_genomes_selected_size_APS
152173
all_genomes_selected_size_APSS_df[all_genomes_selected_size_APSS_df['Ref_genome'].isin(genomes_list)].\
153174
reset_index()
154175

155-
print("\nreturn_genomes_subset_APSS_selected_size_table:")
156-
print(genomes_subset_selected_size_APSS_df)
176+
#print("\nreturn_genomes_subset_APSS_selected_size_table:")
177+
#print(genomes_subset_selected_size_APSS_df)
157178
return genomes_subset_selected_size_APSS_df
158179

159180

SynTrackerVis_app/data_manipulation_single.py

Lines changed: 74 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import pandas as pd
22
import numpy as np
33
import re
4+
import time
45
import SynTrackerVis_app.config as config
56

67

@@ -23,7 +24,7 @@ def return_selected_genome_avg_table(avg_big_df, selected_genome):
2324
def calculate_avg_scores_selected_genome_size(score_per_region_selected_genome_df, genome, size):
2425

2526
# Taking all available regions - no subsampling
26-
if size == 'All_regions':
27+
if size == 'All':
2728
avg_scores_one_size_df = score_per_region_selected_genome_df.groupby(['Sample1', 'Sample2'])['Synteny_score'].\
2829
mean().reset_index().rename(columns={"Synteny_score": "APSS"})
2930

@@ -52,7 +53,18 @@ def calculate_avg_scores_selected_genome_size(score_per_region_selected_genome_d
5253

5354
return avg_scores_one_size_df
5455

56+
57+
def count_samples_num(row, df):
58+
condition_df = df[df[row['Subsampled_regions']] > 0]
59+
60+
unique_samples = pd.concat([condition_df['Sample1'], condition_df['Sample2']]).unique()
61+
samples_num = len(unique_samples)
62+
63+
return samples_num
64+
65+
5566
def create_pairs_num_per_sampling_size(score_per_region_df):
67+
print("\ncreate_pairs_num_per_sampling_size:")
5668

5769
regions_num_per_pair_df = score_per_region_df[['Sample1', 'Sample2', 'Synteny_score']].\
5870
groupby(['Sample1', 'Sample2']).count().reset_index(). \
@@ -62,7 +74,7 @@ def create_pairs_num_per_sampling_size(score_per_region_df):
6274

6375
# Add a column for each subsampling size (to calculate how many pairs have results for at least this size)
6476
for size in config.sampling_sizes:
65-
if size == 'All_regions':
77+
if size == 'All':
6678
regions_num_per_pair_df[size] = np.where(regions_num_per_pair_df['Num_of_compared_regions'] >= 1,
6779
1, 0)
6880
else:
@@ -71,7 +83,7 @@ def create_pairs_num_per_sampling_size(score_per_region_df):
7183

7284
#print(regions_num_per_pair_df)
7385

74-
pairs_num_per_sampling_size_df = regions_num_per_pair_df[['All_regions', '40', '60', '80', '100',
86+
pairs_num_per_sampling_size_df = regions_num_per_pair_df[['All', '40', '60', '80', '100',
7587
'125', '150', '175', '200', '250', '300', '350',
7688
'400']].sum().reset_index()
7789
pairs_num_per_sampling_size_df.columns.values[0] = "Subsampled_regions"
@@ -82,51 +94,75 @@ def create_pairs_num_per_sampling_size(score_per_region_df):
8294
pairs_num_per_sampling_size_df['Pairs_lost_percent'] = \
8395
pairs_num_per_sampling_size_df['Pairs_lost_percent'].apply(lambda x: round(x, 2))
8496

85-
#print(pairs_num_per_sampling_size_df)
86-
return pairs_num_per_sampling_size_df
87-
88-
89-
def create_score_per_region_sorted_contigs_table(score_per_region_df):
90-
contigs_dict = dict()
91-
contig_length_dict = dict()
97+
# Calculate the number of samples in each sampling size
98+
pairs_num_per_sampling_size_df['Number_of_samples'] = \
99+
pairs_num_per_sampling_size_df.apply(lambda row: count_samples_num(row, regions_num_per_pair_df), axis=1)
92100

93-
# Split the 'Region' column into
94-
score_per_region_df[['Contig_name', 'Position']] = score_per_region_df['Region'].str.extract(r'(\S+)_(\d+)_\d+')
95-
score_per_region_df['Position'] = score_per_region_df['Position'].astype(int)
101+
print(pairs_num_per_sampling_size_df)
96102

97-
# If the contig names contain numbers, sort them numerically
98-
if re.search(r"^\S+_\d+$", score_per_region_df.iloc[0]['Contig_name']):
103+
return pairs_num_per_sampling_size_df
99104

100-
# Create a temporary column 'contigs_sort' to sort the contig names numericlly
101-
score_per_region_df['Contig_number'] = score_per_region_df['Contig_name'].str.extract(r'\S+_(\d+)')\
102-
.astype(int)
103105

104-
contigs_list_by_name = list(score_per_region_df.sort_values('Contig_number').groupby(['Contig_name'],
105-
sort=False).groups)
106+
def return_sorted_contigs_lists(score_per_region_df):
107+
'''
108+
contigs_dict = {}
106109
107-
else:
108-
contigs_list_by_name = list(score_per_region_df.groupby(['Contig_name']).groups)
110+
region_list = list(score_per_region_df['Region'])
111+
for region in region_list:
112+
regex = r'(\S+)_(\d+)_\d+'
113+
m = re.search(regex, region)
114+
if m:
115+
contig_name = m.group(1)
116+
pos = m.group(2)
117+
contigs_dict[contig_name] = int(pos)
109118
110-
#print("\ncreate_score_per_region_sorted_contigs_table:")
111-
#print(score_per_region_df)
112-
#print("\nContigs list sorted by name:")
113-
#print(contigs_list_by_name)
119+
# Sort the contigs dict by name
120+
contigs_list_by_name = sorted(contigs_dict)
114121
115-
# Create a dictionary for the contigs, sorted by their names.
116-
for contig in contigs_list_by_name:
122+
# Sort the contigs dict by length
123+
contigs_list_by_length = sorted(contigs_dict, key=lambda k: contigs_dict[k], reverse=True)
124+
'''
117125

118-
score_per_region_contig_df = score_per_region_df[score_per_region_df['Contig_name'] == contig]
119-
contigs_dict[contig] = score_per_region_contig_df[['Contig_name', 'Position', 'Synteny_score']]
126+
before = time.time()
127+
# Split the 'Region' column into Contig_name and Position
128+
pattern = re.compile(r'(\S+)_(\d+)_\d+')
129+
score_per_region_df[['Contig_name', 'Position']] = score_per_region_df['Region'].str.extract(pattern)
130+
#score_per_region_df[['Contig_name', 'Position']] = score_per_region_df['Region'].str.extract(r'(\S+)_(\d+)_\d+')
131+
score_per_region_df['Position'] = score_per_region_df['Position'].astype(int)
120132

121-
# Find contig length by the last position
122-
score_per_region_contig_df = score_per_region_contig_df.sort_values('Position')
123-
contig_length = score_per_region_contig_df.iloc[-1]['Position'] + config.region_length
124-
contig_length_dict[contig] = contig_length
133+
after = time.time()
134+
duration = after - before
135+
print("Extract position from region took " + str(duration) + " seconds.\n")
125136

126-
# Sort the contig lengths dict by the lengths and return a sorted list of names
127-
#sorted_dict = dict(sorted(contig_length_dict.items(), key=lambda item: item[1], reverse=True))
128-
sorted_by_length_list = sorted(contig_length_dict, key=contig_length_dict.get, reverse=True)
137+
# Get a list of contigs, sorted by name
138+
# If the contig names contain numbers, sort them numerically
139+
#if re.search(r"^\S+_\d+$", score_per_region_df.iloc[0]['Contig_name']):
129140

130-
return contigs_dict, contigs_list_by_name, sorted_by_length_list
141+
# Create a temporary column 'contigs_sort' to sort the contig names numericlly
142+
# score_per_region_df['Contig_number'] = score_per_region_df['Contig_name'].str.extract(r'\S+_(\d+)')\
143+
# .astype(int)
144+
145+
# contigs_list_by_name = list(score_per_region_df.sort_values('Contig_number').groupby(['Contig_name'],
146+
# sort=False).groups)
147+
148+
#else:
149+
# contigs_list_by_name = list(score_per_region_df.groupby(['Contig_name']).groups)
150+
151+
before = time.time()
152+
contigs_list_by_name = list(score_per_region_df.groupby(['Contig_name']).groups)
153+
after = time.time()
154+
duration = after - before
155+
print("Sort by name took " + str(duration) + " seconds.\n")
156+
157+
# Get a list of contigs, sorted by length
158+
before = time.time()
159+
contigs_list_by_length = list(score_per_region_df.sort_values('Position', ascending=False).groupby(['Contig_name'],
160+
sort=False).
161+
groups)
162+
after = time.time()
163+
duration = after - before
164+
print("Sort by length took " + str(duration) + " seconds.\n")
165+
166+
return contigs_list_by_name, contigs_list_by_length
131167

132168

SynTrackerVis_app/plots_multi_genomes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def plot_species_vs_sampling_size_bar(df, sampling_size, is_all_regions):
5151

5252
def create_box_plot(avg_df, pvalues_df, color, use_metadata, feature, same_color, different_color):
5353

54-
print("\ncreate_box_plot: Feature is " + feature)
54+
#print("\ncreate_box_plot: Feature is " + feature)
5555
#print("\nAPSS dataframe:")
5656
#print(avg_df)
5757

0 commit comments

Comments
 (0)