-
Notifications
You must be signed in to change notification settings - Fork 0
/
format_all_data.py
89 lines (74 loc) · 2.91 KB
/
format_all_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
''' In this script, the different data sources are compared by patient id;
patients without full data categories are discarded. The output files
will all have an identical list of patient ids.
'''
import csv
import re
import os
# Pulls the patient id (literal "TCGA-", two word characters, "-", four word
# characters) out of a longer identifier string. Written as a raw string so
# that "\w" reaches the regex engine without relying on Python passing an
# unrecognised string escape through unchanged.
id_matcher = re.compile(r"TCGA-\w{2}-\w{4}")
def instance_unified(histology_file_list, cbioportal_file_list, clinical_file):
    '''Rewrite every input table into ``source/`` using one shared patient list.

    The clinical table defines the patient list: its header is copied and
    every data row whose second field is not the string "None" is kept.
    Each histology and cBioPortal table is then rewritten with exactly one
    row per kept clinical row, in clinical order. A patient that is absent
    from a table gets a row holding the patient id followed by 'NA' values.
    All outputs are comma-separated and are written to
    ``source/<basename of the input path>``.

    Args:
        histology_file_list: paths of comma-separated files with a header
            row; the first field of every data row must contain a patient
            id that ``id_matcher`` can find.
        cbioportal_file_list: paths of tab-separated files whose header
            holds patient ids from the third field on, and whose data rows
            hold a name in the first field, an ignored second field, and
            then one value per id column.
        clinical_file: path of a comma-separated file with a header row;
            the first field of each data row is compared against the ids
            extracted from the other tables.

    Raises:
        AttributeError: if an identifier contains no match for
            ``id_matcher`` (``search`` returns ``None``).
        IndexError: if a non-blank clinical row has fewer than two fields,
            or a cBioPortal data row is shorter than its header.
        StopIteration: if an input file has no header row at all.
    '''
    output_dir = 'source'
    # Create the output directory on first use so the writes below cannot
    # fail merely because it is missing.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    with open(clinical_file, 'r') as source:
        clinical_reader = csv.reader(source, delimiter=',')
        clinical_header = next(clinical_reader)
        # csv yields [] for blank lines; drop those before indexing row[1].
        clinical_data = [row for row in clinical_reader
                         if row and row[1] != "None"]

    clinical_out = os.path.join(output_dir, os.path.basename(clinical_file))
    with open(clinical_out, 'w') as target:
        clinical_writer = csv.writer(target, delimiter=',')
        clinical_writer.writerow(clinical_header)
        clinical_writer.writerows(clinical_data)

    patient_ids = [row[0] for row in clinical_data]

    for path in histology_file_list:
        header, rows_by_id = _read_histology(path)
        _write_aligned(os.path.join(output_dir, os.path.basename(path)),
                       header, rows_by_id, patient_ids)

    for path in cbioportal_file_list:
        header, rows_by_id = _read_cbioportal(path)
        _write_aligned(os.path.join(output_dir, os.path.basename(path)),
                       header, rows_by_id, patient_ids)


def _read_histology(path):
    '''Return (header, {patient id: full row}) for a comma-separated table.'''
    rows_by_id = {}
    with open(path, 'r') as source:
        reader = csv.reader(source, delimiter=',')
        header = next(reader)
        for row in reader:
            if not row:
                continue
            # A later row for the same patient replaces an earlier one.
            rows_by_id[id_matcher.search(row[0]).group()] = row
    return header, rows_by_id


def _read_cbioportal(path):
    '''Return (header, {patient id: [id, values...]}) for a tab-separated,
    one-column-per-patient table, transposed to one row per patient.
    '''
    with open(path, 'r') as source:
        reader = csv.reader(source, delimiter='\t')
        # The first two header fields are not patient columns.
        ids = [id_matcher.search(label).group() for label in next(reader)[2:]]
        header = ['id']
        columns = [[] for _ in ids]
        for row in reader:
            if not row:
                continue
            header.append(row[0])
            values = row[2:]
            # Index (rather than zip) so a short row fails loudly instead
            # of silently shifting later values into the wrong position.
            for index, column in enumerate(columns):
                column.append(values[index])

    rows_by_id = {}
    # Each column is collected separately, so two columns that reduce to the
    # same patient id can no longer interleave into one over-long row; the
    # last such column wins, matching the histology handling.
    for patient_id, column in zip(ids, columns):
        rows_by_id[patient_id] = [patient_id] + column
    return header, rows_by_id


def _write_aligned(path, header, rows_by_id, patient_ids):
    '''Write header, then one row per patient id in order, NA-filling gaps.'''
    filler = ['NA'] * (len(header) - 1)
    with open(path, 'w') as target:
        writer = csv.writer(target, delimiter=',')
        writer.writerow(header)
        for patient_id in patient_ids:
            row = rows_by_id.get(patient_id)
            if row is None:
                # Build a fresh row each time; stored rows are never mutated,
                # so a repeated patient id is written identically every time.
                row = [patient_id] + filler
            writer.writerow(row)
if __name__ == "__main__":
    # Script entry point: run the unification on the fixed set of input
    # paths below (all relative to the current working directory).
    instance_unified(
        histology_file_list=[
            'pipeline/cell.csv',
            'pipeline/cytoplasm.csv',
            'pipeline/nulei.csv',
        ],
        cbioportal_file_list=[
            'cbioportal/data_expression_Zscores.txt',
            'cbioportal/data_linear_CNA.txt',
            'cbioportal/data_mRNA_median_Zscores.txt',
        ],
        clinical_file='TCGAMaxim/clinical.csv'
    )