-
Notifications
You must be signed in to change notification settings - Fork 4
/
10_reorganize_variants.py
executable file
·120 lines (90 loc) · 3.88 KB
/
10_reorganize_variants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#! /usr/bin/python3
#
# This source code is part of icgc, an ICGC processing pipeline.
#
# Icgc is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Icgc is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Contact: [email protected]
#
import os
import sys
import time

from config import Config
from icgc_utils.common_queries import *
from icgc_utils.processes import *
# Columns copied from the original (temporary) somatic table into the
# reorganized variants table; reorganize_variants joins them into its
# select/insert column lists.
variant_columns = [
    'icgc_mutation_id',
    'chromosome',
    'icgc_donor_id',
    'icgc_specimen_id',
    'icgc_sample_id',
    'submitted_sample_id',
    'control_genotype',
    'tumor_genotype',
    'total_read_count',
    'mutant_allele_read_count',
]
def time_qry(cursor, qry):
    """Execute *qry* via search_db and print how long it took.

    The query text is echoed together with the elapsed wall-clock time,
    expressed in minutes. Whatever search_db returns is discarded.
    """
    started = time.time()
    search_db(cursor, qry, verbose=False)
    elapsed_minutes = (time.time() - started) / 60
    print("\n%s\ndone in %.3f mins" % (qry, elapsed_minutes))
#########################################
def reorganize_variants(cursor, orig_icgc_table):
    """Rebuild *orig_icgc_table* as a trimmed-down variants table.

    The steps, in order (every SQL statement is echoed and timed by time_qry):
      1. copy the columns named in variant_columns into a temporary scratch
         table whose name contains this process's pid;
      2. drop any table carrying the target name (the original name with
         "_temp" removed) and recreate it with the scratch table's layout;
      3. add the mut_to_total_read_count_ratio, pathogenicity_estimate and
         reliability_estimate columns;
      4. fill the new table with the distinct rows of the scratch table;
      5. add an auto-increment primary key ``id`` as the first column;
      6. drop the scratch table.
    """
    columns_csv = ", ".join(variant_columns)
    switch_to_db(cursor, "icgc")

    # pid in the name: presumably keeps parallel workers from clashing on scratch tables
    scratch = "scratch_%d_%s" % (os.getpid(), orig_icgc_table)
    check_and_drop(cursor, scratch)
    time_qry(
        cursor,
        "create temporary table %s engine=MYISAM as select %s from %s " % (scratch, columns_csv, orig_icgc_table),
    )

    target = orig_icgc_table.replace("_temp", "")
    check_and_drop(cursor, target)

    followup_queries = (
        # empty table with the same layout as the scratch copy
        "create table %s like %s" % (target, scratch),
        # add mut_to_total_read_count_ratio, pathogenicity and reliability columns to the new variants table
        ("alter table %s add column mut_to_total_read_count_ratio float default 0.0, " % target)
        + "add column pathogenicity_estimate boolean default 0, add column reliability_estimate boolean default 0",
        # de-duplicated rows; the three new columns keep their defaults
        "insert into %s (%s) select distinct * from %s" % (target, columns_csv, scratch),
        # add back the primary key
        "alter table %s add column id int not null primary key auto_increment first" % target,
    )
    for statement in followup_queries:
        time_qry(cursor, statement)

    # NOTE(review): this call passes the database name as an extra argument,
    # while the two check_and_drop calls above do not - confirm which form
    # matches check_and_drop's actual signature.
    check_and_drop(cursor, 'icgc', scratch)
#########################################
def reorganize(tables, other_args):
    """Reorganize every table in *tables* over one MySQL connection.

    This is the worker function that main hands to parallelize.

    :param tables: names of the ``*_temp`` somatic tables this worker handles
    :param other_args: unused here; presumably required by the callback
        signature that parallelize expects - confirm against parallelize
    :return: None

    Progress and per-table timing are printed together with the worker's pid.
    The cursor and the connection are closed even if a table fails, so a
    crashing worker does not leave its connection open.
    """
    db = connect_to_mysql(Config.mysql_conf_file)
    cursor = db.cursor()
    try:
        switch_to_db(cursor, "icgc")
        # enumerate gives the position directly; tables.index(table) rescanned
        # the list on every iteration and misreported repeated names
        for position, table in enumerate(tables):
            time0 = time.time()
            print("====================")
            print("reorganizing variants from ", table, os.getpid())
            reorganize_variants(cursor, table)
            time1 = time.time()
            print(("\t\t %s (%d) done in %.3f mins" % (table, position, float(time1-time0)/60)), os.getpid())
    finally:
        cursor.close()
        db.close()
    return
#########################################
#########################################
def main():
    """Entry point: reorganize all ``*simple_somatic_temp`` tables in parallel.

    The script is currently switched off: it prints "disabled" and exits
    before touching the database. The code after the guard is kept, in
    working order, for when the guard is removed. It:

      1. lists the icgc tables whose name ends in ``simple_somatic_temp``;
      2. looks up their sizes and orders them largest first;
      3. reverses the smaller half of that ordering and hands the list to
         parallelize, using reorganize as the worker.
    """
    # Deliberate kill-switch. sys.exit() is used instead of exit(), which is a
    # helper injected by the site module and is not guaranteed to exist.
    print("disabled")
    sys.exit()

    db = connect_to_mysql(Config.mysql_conf_file)
    cursor = db.cursor()
    try:
        #########################
        # which temp somatic tables do we have
        qry = "select table_name from information_schema.tables "
        qry += "where table_schema='icgc' and table_name like '%simple_somatic_temp'"
        # "or []": tolerate a falsy "no rows" result from search_db
        rows = search_db(cursor, qry) or []
        tables = [field[0] for field in rows]
        table_size = get_table_size(cursor, 'icgc', tables) if tables else {}
    finally:
        # release the connection even if one of the lookups fails
        cursor.close()
        db.close()

    if not tables:
        print("no simple_somatic_temp tables found")
        return

    tables_sorted = sorted(tables, key=lambda t: table_size[t], reverse=True)
    half = len(tables_sorted) // 2
    # Largest tables first, then the smaller half in reverse order; combined
    # with round_robin this presumably balances the load across workers.
    tables_mirrored = tables_sorted[0:half] + list(reversed(tables_sorted[half:]))
    # A single table gives half == 0; always ask for at least one chunk.
    number_of_chunks = max(1, half)
    parallelize(number_of_chunks, reorganize, tables_mirrored, [], round_robin=True)
#########################################
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()