-
Notifications
You must be signed in to change notification settings - Fork 0
/
import_data.py
207 lines (174 loc) · 7.62 KB
/
import_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
"""Several types of data files will be imported into the otu db.
This module contains one import routine for each supported file type.
"""
import os
import attr
from clize import run, parameters
from otudb.database import otudb, tables
from otudb.parsers import CSVParser, FastaParser
from otudb.utils import log_it
# Module-level logger built by otudb.utils.log_it under the name 'import_data';
# the import routines in this module report progress and failures through it.
log = log_it(logname='import_data')
# Column layout expected for each kind of importable file, keyed by file kind.
otu_data_file_imports = {
    # Sample metadata (sample_info).
    'sample_info': [
        'sample_name',
        'sample_type',
        'sex',
        'cage',
        'time',
        'conditions',
    ],
    # Analysis sets (analysis_sets).
    # TODO: this file will need to be created by end user, or parsed from others?
    'analysis_sets': [
        'set_name',
        'desc',
    ],
    # OTU sequences (otu_seqs): standard FASTA format, so no column names.
    'otu_seq_fasta': [],
    # OTU annotation (otu_annotation) in RDP format.
    # The otu_name column has an empty header in the file!
    'otu_taxa_rdp': [
        'otu_name',
        'phylum',
        'class',
        'order',
        'family',
        'genus',
    ],
    # OTU annotation (otu_annotation) in GreenGenes format.
    # There is NO header line, and the taxa field is ';'-delimited.
    'otu_taxa_gg': [
        'otu_name',
        'percent_identity',
        'p_value',
        'k__domain; p__phylum; c__class; o__order; f__family; g__genus; s__species',
    ],
    # OTU counts (otu_counts): filename 'otu_table.tsv' contains the read
    # count of each pair; after OTUId comes one column per sample
    # (sample_id, sample_id, ...).
    'otu_table': [
        'OTUId',
        'sample_id',
        '*sample_id',
    ],
}
def sample_import(filepath):
    """Import sample metadata from a comma-delimited file into the db.

    Every row produced by the parser becomes one ``sample_info`` record.
    All inserts are issued inside a single ``otudb.transaction()`` block.

    :param filepath: path to the sample metadata file.
    :raises Exception: any error hit while parsing or inserting is logged
        and then re-raised unchanged.
    """
    log.info('Starting to import sample metadata')
    try:
        parser = CSVParser(filepath, mode='r', delimiter=',')
        sample_info = tables.models.sample_info
        # Stays 0 when the file yields no rows, so the summary log still works.
        row_count = 0
        with otudb.transaction():
            for row_count, row in enumerate(parser.load_data(), start=1):
                log.info('Importing: %s', row['sample_name'])
                # NOTE(review): 'study' is read here but is not listed under
                # 'sample_info' in otu_data_file_imports, while 'conditions'
                # is listed there and never stored -- confirm which is right.
                sample_info.create(
                    sample_name=row['sample_name'],
                    sample_type=row['sample_type'],
                    study=row['study'],
                    sex=row['sex'],
                    cage=row['cage'],
                    time=row['time'],
                )
        log.info('Completed importing %s rows from: %s', row_count, filepath)
    except Exception:
        log.error('Whoops while importing %s.', filepath)
        # Bare raise re-raises the active exception as-is.
        raise
def analysis_import(filepath):
    """Begin importing analysis sets from a tab-delimited file.

    At present this only logs the start message and constructs the parser;
    no rows are read from it and nothing is written to the db.

    :param filepath: path to the analysis-set file.
    """
    log.info('Starting to import analysis set info.')
    # The parser is built but not consumed yet.
    set_parser = CSVParser(filepath, mode='r', delimiter='\t')
    return None
def fasta_import(filepath):
    """Import the sequences of a FASTA file into the db.

    Each (header, sequence) pair yielded by the parser is stored as one
    ``otu_seq`` record together with the length of the sequence. All
    inserts are issued inside a single ``otudb.transaction()`` block.

    :param filepath: path to the FASTA file.
    :raises Exception: any error hit while parsing or inserting is logged
        and then re-raised unchanged.
    """
    log.info('Starting to import FASTA')
    try:
        parser = FastaParser(filepath, mode='r')
        seq_table = tables.models.otu_seq
        # Stays 0 when the file yields no records, so the summary log still works.
        row_count = 0
        with otudb.transaction():
            for row_count, (head, seq) in enumerate(parser.load_data(), start=1):
                log.info('Importing: %s', head)
                seq_table.create(
                    otu_name=head,
                    sequence=seq,
                    seq_length=len(seq),
                )
        log.info('Completed importing %s rows from: %s', row_count, filepath)
    except Exception:
        log.error('Whoops while importing %s.', filepath)
        # Bare raise re-raises the active exception as-is.
        raise
def count_table_import(filepath):
    """Import an OTU table of percent abundance values into the db.

    The file is read as tab-delimited text. The first column is ``OTUId``;
    every following column is named after a sample and its value is stored
    as that sample's ``percent_abundance`` for the OTU. One ``otu_counts``
    record is created per (OTU, sample) cell, all inside a single
    ``otudb.transaction()`` block.

    :param filepath: path to the tab-delimited OTU table.
    :raises Exception: any error hit while parsing or inserting is logged
        and then re-raised unchanged.
    """
    log.info('Starting to import OTU count table')
    try:
        parser = CSVParser(filepath, mode='r', delimiter='\t')
        counts = tables.models.otu_counts
        sample_names = parser.get_fieldnames()[1:]  # first field is OTUId
        log.info('%s sample list: %s', parser.filename, sample_names)
        # Stays 0 when the file yields no rows, so the summary log still works.
        row_count = 0
        with otudb.transaction():
            for row_count, row in enumerate(parser.load_data(), start=1):
                otu_id = row['OTUId']
                log.info('Importing: %s', otu_id)
                # TODO: implement sample_info lookups for the relationship.
                # Until then the sample's 1-based column position stands in
                # for sample_id; once a real lookup exists, sample names that
                # are missing from sample_info should be reported instead of
                # imported.
                for sample_id, sample in enumerate(sample_names, start=1):
                    log.info('Importing: %s of %s with %s%%',
                             otu_id, sample, row[sample])
                    counts.create(
                        otu_id=otu_id,
                        sample_id=sample_id,
                        percent_abundance=row[sample],
                    )
        log.info('Completed importing %s rows from: %s', row_count, filepath)
    except Exception:
        log.error('Whoops while importing %s.', filepath)
        # Bare raise re-raises the active exception as-is.
        raise
def taxa_import_rdp(filepath):
    """Read an RDP-format taxonomy annotation file.

    The file is parsed as tab-delimited text using the column names listed
    under ``'otu_taxa_rdp'`` in ``otu_data_file_imports``. Rows are iterated
    inside a transaction, but nothing is stored for them yet.

    :param filepath: path to the RDP taxonomy file.
    :raises Exception: parsing errors are logged and re-raised.
    """
    log.info('Importing RDP taxonomy.')
    rdp_columns = otu_data_file_imports['otu_taxa_rdp']
    try:
        parser = CSVParser(filepath, mode='r', delimiter='\t',
                           fieldnames=rdp_columns)
        with otudb.transaction():
            # Placeholder loop: every row is consumed and discarded.
            for _record in parser.load_data():
                continue
    except Exception as err:
        log.error(f'Whoops while importing {filepath} in RDP format.')
        raise err
def taxa_import_gg(filepath):
    """Read a GreenGenes-format taxonomy annotation file.

    The file is parsed as tab-delimited text using the column names listed
    under ``'otu_taxa_gg'`` in ``otu_data_file_imports``. Rows are iterated
    inside a transaction, but nothing is stored for them yet.

    :param filepath: path to the GreenGenes taxonomy file.
    :raises Exception: parsing errors are logged and re-raised.
    """
    log.info('Importing GreenGenes taxonomy.')
    gg_columns = otu_data_file_imports['otu_taxa_gg']
    try:
        parser = CSVParser(filepath, mode='r', delimiter='\t',
                           fieldnames=gg_columns)
        with otudb.transaction():
            # Placeholder loop: every row is consumed and discarded.
            for _record in parser.load_data():
                continue
    except Exception as err:
        log.error(f'Whoops while importing {filepath} in GG format.')
        raise err
def taxa_import(filepath):
    """Import a taxonomy annotation file, picking the format from its content.

    The first line of the file is inspected: when it contains ``k__`` the
    file is handed to ``taxa_import_gg``, otherwise to ``taxa_import_rdp``.

    :param filepath: path to the taxonomy annotation file.
    :returns: whatever the selected import routine returns.
    :raises Exception: any error from opening the file or from the selected
        import routine is logged and then re-raised unchanged.
    """
    log.info('Starting to import taxonomy annotations.')
    try:
        # Only the first line is needed for format detection; the context
        # manager closes the handle before the path is passed on.
        with open(filepath, mode='r') as fh:
            first_line = fh.readline()
        log.info('Determing Taxa file source...')
        if 'k__' in first_line:
            return taxa_import_gg(filepath)
        return taxa_import_rdp(filepath)
    except Exception:
        log.error('Whoops while importing %s.', filepath)
        # Bare raise re-raises the active exception as-is.
        raise
# Accepted values for the file-type option of parse_import, each paired with
# a short description; used as that parameter's clize annotation.
types_of_imports = parameters.one_of(
    ('sample', "sample metadata"),
    ('analysis', "analysis sets with names, descriptions"),
    ('count', "otu count table, pct abundance per sample"),
    ('fasta', "otu seq fasta"),
    ('taxa', "otu annotations (taxonomy)"),
)
def parse_import(*,
        filepath:['p', str]=None,
        filetype:['t', types_of_imports]=None,
        ):
    """Perform imports of files into OTUdb, as indicated.
    :param filepath: path to file to be imported (REQUIRED)
    :param filetype: which type of file are you importing?
    Possible types (-t) of files to import are:
    . sample: sample metadata\n
    . analysis: analysis sets with names, descriptions\n
    . count: otu count table, pct abundance per sample\n
    . fasta: otu seq fasta\n
    . taxa: otu annotations (taxonomy)
    """
    # NOTE(review): the docstring above is presumably what clize turns into
    # the command-line help, so its wording is kept verbatim.

    # Both options are needed; report the problem and stop otherwise.
    if not (filepath and filetype):
        log.error(' Whoops! Path *and* type of file to be imported are required...')
        return

    # Map each accepted file type onto the routine that imports it.
    importers = {
        'sample': sample_import,
        'analysis': analysis_import,
        'fasta': fasta_import,
        'count': count_table_import,
        'taxa': taxa_import,
    }
    importer = importers.get(filetype)
    if importer is not None:
        return importer(filepath)
# Command-line entry point: parse_import is handed to clize's run().
# NOTE(review): this call sits at module top level with no
# `if __name__ == '__main__':` guard, so it also executes whenever the module
# is imported -- confirm whether that is intended before adding a guard.
run(parse_import)