9
9
import getpass
10
10
11
11
import covizu
12
- from covizu .minimap2 import minimap2 , encode_diffs
12
+ from covizu import minimap2
13
+ #from covizu.minimap2 import minimap2, encode_diffs
13
14
from covizu .utils .seq_utils import *
14
15
from covizu .utils .progress_utils import Callback
15
16
17
+ import gc
18
+
16
19
17
20
def download_feed (url , user , password ):
18
21
"""
@@ -23,6 +26,9 @@ def download_feed(url, user, password):
23
26
:param password: str, access credentials - if None, query user
24
27
:return: str, path to time-stamped download file
25
28
"""
29
+ if url is None :
30
+ print ("Error: no URL specified in download_feed()" )
31
+ sys .exit ()
26
32
if user is None :
27
33
user = getpass .getpass ("GISAID username: " )
28
34
if password is None :
@@ -35,7 +41,9 @@ def download_feed(url, user, password):
35
41
36
42
def load_gisaid (path , minlen = 29000 , mindate = '2019-12-01' , callback = None ,
37
43
fields = ("covv_accession_id" , "covv_virus_name" , "covv_lineage" ,
38
- "covv_collection_date" , "covv_location" , "sequence" )):
44
+ "covv_collection_date" , "covv_location" , "sequence" ),
45
+ debug = None
46
+ ):
39
47
"""
40
48
Read in GISAID feed as xz compressed JSON, applying some basic filters
41
49
@@ -44,13 +52,16 @@ def load_gisaid(path, minlen=29000, mindate='2019-12-01', callback=None,
44
52
:param mindate: datetime.date, earliest reasonable sample collection date
45
53
:param callback: function, optional callback function
46
54
:param fields: tuple, fieldnames to keep
55
+ :param debug: int, if >0 then limits input JSON for debugging
47
56
48
57
:yield: dict, contents of each GISAID record
49
58
"""
50
59
mindate = fromisoformat (mindate )
51
60
rejects = {'short' : 0 , 'baddate' : 0 , 'nonhuman' : 0 , 'nolineage' : 0 }
52
61
with lzma .open (path , 'rb' ) as handle :
53
- for line in handle :
62
+ for ln , line in enumerate (handle ):
63
+ if debug and ln > debug :
64
+ break
54
65
record = json .loads (line )
55
66
56
67
# remove unused data
@@ -132,9 +143,9 @@ def extract_features(batcher, ref_file, binpath='minimap2', nthread=3, minlen=29
132
143
reflen = len (convert_fasta (handle )[0 ][1 ])
133
144
134
145
for fasta , batch in batcher :
135
- mm2 = minimap2 (fasta , ref_file , stream = True , path = binpath , nthread = nthread ,
146
+ mm2 = minimap2 . minimap2 (fasta , ref_file , stream = True , path = binpath , nthread = nthread ,
136
147
minlen = minlen )
137
- result = list (encode_diffs (mm2 , reflen = reflen ))
148
+ result = list (minimap2 . encode_diffs (mm2 , reflen = reflen ))
138
149
for row , record in zip (result , batch ):
139
150
# reconcile minimap2 output with GISAID record
140
151
qname , diffs , missing = row
@@ -218,7 +229,8 @@ def filter_problematic(records, origin='2019-12-01', rate=0.0655, cutoff=0.005,
218
229
219
230
def sort_by_lineage (records , callback = None , interval = 10000 ):
220
231
"""
221
- Resolve stream into a dictionary keyed by Pangolin lineage
232
+ Resolve stream into a dictionary keyed by Pangolin lineage.
233
+ Note: records yielded from generator accumulate in this function.
222
234
223
235
:param records: generator, return value of extract_features()
224
236
:param callback: optional, progress monitoring
@@ -231,14 +243,21 @@ def sort_by_lineage(records, callback=None, interval=10000):
231
243
callback ('aligned {} records' .format (i ))
232
244
233
245
lineage = record ['covv_lineage' ]
246
+ diffs = record .pop ('diffs' ) # REMOVE entry from record!
247
+ if diffs is not None :
248
+ diffs .sort ()
249
+ key = ',' .join (['|' .join (map (str , diff )) for diff in diffs ])
234
250
235
251
if str (lineage ) == "None" or lineage == '' :
236
252
# discard uncategorized genomes, #324, #335
237
253
continue
238
254
239
255
if lineage not in result :
240
- result .update ({lineage : []})
241
- result [lineage ].append (record )
256
+ result .update ({lineage : {}})
257
+ if key not in result [lineage ]:
258
+ result [lineage ].update ({key : []})
259
+
260
+ result [lineage ][key ].append (record )
242
261
243
262
return result
244
263
@@ -288,11 +307,11 @@ def parse_args():
288
307
289
308
parser .add_argument ('--infile' , type = str , default = None ,
290
309
help = "input, path to xz-compressed JSON" )
291
- parser .add_argument ('--url' , type = str , default = os . environ [ "GISAID_URL" ],
310
+ parser .add_argument ('--url' , type = str ,
292
311
help = "URL to download provision file, defaults to environment variable." )
293
- parser .add_argument ('--user' , type = str , default = os . environ [ "GISAID_USER" ],
312
+ parser .add_argument ('--user' , type = str ,
294
313
help = "GISAID username, defaults to environment variable." )
295
- parser .add_argument ('--password' , type = str , default = os . environ [ "GISAID_PSWD" ],
314
+ parser .add_argument ('--password' , type = str ,
296
315
help = "GISAID password, defaults to environment variable." )
297
316
298
317
parser .add_argument ('--minlen' , type = int , default = 29000 , help = 'option, minimum genome length' )
@@ -316,7 +335,20 @@ def parse_args():
316
335
help = "Path to VCF file of problematic sites in SARS-COV-2 genome. "
317
336
"Source: https://github.com/W-L/ProblematicSites_SARS-CoV2" )
318
337
319
- return parser .parse_args ()
338
+ parser .add_argument ("--debug" , type = int , help = "int, limit number of rows of input xz file to parse for debugging" )
339
+
340
+ args = parser .parse_args ()
341
+
342
+ if args .url is None and "GISAID_URL" in os .environ :
343
+ args .url = os .environ ["GISAID_URL" ]
344
+ if args .user is None and "GISAID_USER" in os .environ :
345
+ args .user = os .environ ["GISAID_USER" ]
346
+ # otherwise download_feed() will prompt for username
347
+ if args .password is None and "GISAID_PSWD" in os .environ :
348
+ args .password = os .environ ["GISAID_PSWD" ]
349
+ # otherwise download_feed() will prompt for password
350
+
351
+ return args
320
352
321
353
322
354
if __name__ == '__main__' :
@@ -329,7 +361,8 @@ def parse_args():
329
361
if args .infile is None :
330
362
args .infile = download_feed (args .url , args .user , args .password )
331
363
332
- loader = load_gisaid (args .infile , minlen = args .minlen , mindate = args .mindate )
364
+ loader = load_gisaid (args .infile , minlen = args .minlen , mindate = args .mindate ,
365
+ debug = args .debug )
333
366
batcher = batch_fasta (loader , size = args .batchsize )
334
367
aligned = extract_features (batcher , ref_file = args .ref , binpath = args .binpath ,
335
368
nthread = args .mmthreads , minlen = args .minlen )
0 commit comments