Skip to content

Commit ac00d0c

Browse files
authored
Merge pull request #401 from PoonLab/dev
Improve page load times
2 parents ced6331 + fd41251 commit ac00d0c

22 files changed

+1197
-712
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,6 @@ debug/*
99
build/*
1010
dist/*
1111
venv/*
12+
node_modules/*
1213
cypress/fixtures/
1314
cypress/plugins/

CONTRIBUTING.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@ Development of CoVizu is primarily being carried out on workstations and servers
2828
However, we have also run the system on desktop computers running macOS 10.13.
2929
Most web development is tested in Google Chrome.
3030

31+
To set up the development environment:
32+
1. Navigate to the directory containing `package.json`
33+
2. Run the command `npm install`
34+
3. Run the `run-server.sh` script or `npm start`
35+
3136

3237
## Coding style
3338

INSTALL.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
* [TreeTime](https://github.com/neherlab/treetime) version 0.7.5+
1010
* [RapidNJ](https://birc.au.dk/software/rapidnj/)
1111
* [git](https://git-scm.com/)
12+
* [Node.js](https://nodejs.org/en/download/)
13+
* [npm](https://docs.npmjs.com/about-npm-versions)
1214

1315
If running locally (without dedicated GISAID feed):
1416
* [Pangolin](https://github.com/cov-lineages/pangolin/)

batch.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,13 +169,18 @@ def process_feed(args, callback=None):
169169
# write data stats
170170
dbstat_file = os.path.join(args.outdir, 'dbstats.{}.json'.format(timestamp))
171171
with open(dbstat_file, 'w') as handle:
172-
nseqs = sum([len(rows) for rows in by_lineage.values()])
172+
# total number of sequences
173+
nseqs = 0
174+
for records in by_lineage.values():
175+
for variant in records.values():
176+
nseqs += len(variant) # number of samples
173177
val = {
174178
'lastupdate': timestamp.split('T')[0],
175179
'noseqs': nseqs,
176180
'lineages': {}
177181
}
178-
for lineage, samples in by_lineage.items():
182+
for lineage, records in by_lineage.items():
183+
samples = unpack_records(records)
179184
ndiffs = [len(x['diffs']) for x in samples]
180185
val['lineages'][lineage] = {
181186
'nsamples': len(samples),

covizu/clustering.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def recode_features(records, callback=None, limit=10000):
2121
Recode feature vectors with integer indices based on set union.
2222
Pass results to bootstrap() to reconstruct trees by neighbor-joining method.
2323
24-
:param records: list, dict for each record
24+
:param records: dict, samples keyed by unique mutation set
2525
:param callback: optional, function for progress monitoring
2626
:param limit: int, maximum number of variants to prevent memory allocation crashes
2727
:return: dict, key-value pairs of all features indexed by integers
@@ -30,16 +30,17 @@ def recode_features(records, callback=None, limit=10000):
3030
"""
3131
# compress genomes with identical feature vectors
3232
fvecs = {}
33-
for record in records:
34-
label = '|'.join([record['covv_virus_name'], record['covv_accession_id'],
35-
record['covv_collection_date']])
36-
key = tuple([tuple(x) for x in record['diffs']])
33+
for muts, variant in records.items():
34+
key = tuple([tuple(x.split('|')) for x in muts.split(',')])
3735
if key not in fvecs:
3836
fvecs.update({key: []})
39-
fvecs[key].append(label)
37+
for sample in variant:
38+
label = "{covv_virus_name}|{covv_accession_id}|{covv_collection_date}".format(**sample)
39+
fvecs[key].append(label)
4040

4141
# limit to N most recently-sampled feature vectors
42-
intermed = [(max([l.split('|')[-1] for l in label]), key) for key, label in fvecs.items()]
42+
intermed = [(max([label.split('|')[-1] for label in labels]), key)
43+
for key, labels in fvecs.items()]
4344
intermed.sort(reverse=True)
4445

4546
# generate union of all features

covizu/minimap2.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import json
77

88
import covizu
9-
from covizu.utils import gisaid_utils
9+
import covizu.utils.gisaid_utils
1010

1111

1212
def apply_cigar(seq, rpos, cigar):

covizu/treetime.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,8 @@ def retrieve_genomes(by_lineage, known_seqs, ref_file, earliest=True, callback=N
191191

192192
# retrieve unaligned genomes from database
193193
for lineage, records in by_lineage.items():
194+
records = covizu.utils.batch_utils.unpack_records(records)
195+
194196
# filter records for lineage-defining genomes
195197
curated = filter(
196198
lambda r: r['covv_virus_name'].replace('hCoV-19/', '').replace(' ', '_') # issue #313
@@ -284,6 +286,12 @@ def parse_args():
284286
cb.callback("Identifying lineage representative genomes")
285287
fasta = retrieve_genomes(by_lineage, known_seqs=lineages, ref_file=args.ref, earliest=args.earliest,
286288
callback=cb.callback)
289+
outfile = open("iss385.fasta", 'w')
290+
for header, seq in fasta.items():
291+
outfile.write(f">{header}\n{seq}\n")
292+
outfile.close()
293+
294+
sys.exit()
287295

288296
cb.callback("Reconstructing tree with {}".format(args.ft2bin))
289297
nwk = fasttree(fasta, binpath=args.ft2bin)

covizu/utils/batch_utils.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,38 @@
55
import json
66

77

8+
def unpack_records(records):
    """
    by_lineage is a nested dict with the inner dicts keyed by serialized
    mutation sets (diffs).  This function is used to reconstitute the mutations
    as a list of tuples, restoring the diffs entry for each record, and to return
    a list of dicts.

    Each key is a comma-separated list of mutations, where every mutation is
    serialized as "type|position|alt".  An empty key (a variant with no
    mutations) yields an empty diffs list instead of raising ValueError.

    Note: the sample dicts are updated in place, and all samples belonging to
    the same variant share a single diffs list object.

    Used in:
    - treetime:retrieve_genomes()
    - get_mutations()
    - batch.py, command line interface
    See issue: https://github.com/PoonLab/covizu/issues/387

    @param records:  dict, sets of genomes under a single lineage classification,
                     each set keyed by their shared set of mutations (diffs)
    @return:  a list of dicts, each dict representing a genome and its metadata
    """
    unpacked = []
    for key, variant in records.items():
        # reconstitute the mutations defining this variant
        diffs = []
        for mutation in key.split(','):
            if not mutation:
                # ''.split(',') returns [''], so a variant without any
                # mutations would otherwise fail the three-way unpack below
                continue
            typ, pos, alt = mutation.split('|')
            if typ == '-':
                alt = int(alt)  # number of nucleotides in indel
            diffs.append((typ, int(pos), alt))

        for sample in variant:
            sample.update({'diffs': diffs})
            unpacked.append(sample)
    return unpacked
38+
39+
840
def build_timetree(by_lineage, args, callback=None):
941
""" Generate time-scaled tree of Pangolin lineages """
1042

@@ -215,7 +247,9 @@ def get_mutations(by_lineage):
215247
:return: dict, common mutations by lineage
216248
"""
217249
result = {}
218-
for lineage, samples in by_lineage.items():
250+
for lineage, records in by_lineage.items():
251+
samples = unpack_records(records)
252+
219253
# enumerate features
220254
counts = {}
221255
for sample in samples:
@@ -224,8 +258,11 @@ def get_mutations(by_lineage):
224258
if feat not in counts:
225259
counts.update({feat: 0})
226260
counts[feat] += 1
261+
227262
# filter for mutations that occur in at least half of samples
228-
common = dict([(feat, count/len(samples)) for feat, count in counts.items() if count/len(samples) >= 0.5])
263+
common = dict([(feat, count/len(samples)) for feat, count in counts.items()
264+
if count/len(samples) >= 0.5])
229265
result.update({lineage: common})
266+
230267
return result
231268

covizu/utils/gisaid_utils.py

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,13 @@
99
import getpass
1010

1111
import covizu
12-
from covizu.minimap2 import minimap2, encode_diffs
12+
from covizu import minimap2
13+
#from covizu.minimap2 import minimap2, encode_diffs
1314
from covizu.utils.seq_utils import *
1415
from covizu.utils.progress_utils import Callback
1516

17+
import gc
18+
1619

1720
def download_feed(url, user, password):
1821
"""
@@ -23,6 +26,9 @@ def download_feed(url, user, password):
2326
:param password: str, access credentials - if None, query user
2427
:return: str, path to time-stamped download file
2528
"""
29+
if url is None:
30+
print("Error: no URL specified in download_feed()")
31+
sys.exit()
2632
if user is None:
2733
user = getpass.getpass("GISAID username: ")
2834
if password is None:
@@ -35,7 +41,9 @@ def download_feed(url, user, password):
3541

3642
def load_gisaid(path, minlen=29000, mindate='2019-12-01', callback=None,
3743
fields=("covv_accession_id", "covv_virus_name", "covv_lineage",
38-
"covv_collection_date", "covv_location", "sequence")):
44+
"covv_collection_date", "covv_location", "sequence"),
45+
debug=None
46+
):
3947
"""
4048
Read in GISAID feed as xz compressed JSON, applying some basic filters
4149
@@ -44,13 +52,16 @@ def load_gisaid(path, minlen=29000, mindate='2019-12-01', callback=None,
4452
:param mindate: datetime.date, earliest reasonable sample collection date
4553
:param callback: function, optional callback function
4654
:param fields: tuple, fieldnames to keep
55+
:param debug: int, if >0 then limits input JSON for debugging
4756
4857
:yield: dict, contents of each GISAID record
4958
"""
5059
mindate = fromisoformat(mindate)
5160
rejects = {'short': 0, 'baddate': 0, 'nonhuman': 0, 'nolineage': 0}
5261
with lzma.open(path, 'rb') as handle:
53-
for line in handle:
62+
for ln, line in enumerate(handle):
63+
if debug and ln > debug:
64+
break
5465
record = json.loads(line)
5566

5667
# remove unused data
@@ -132,9 +143,9 @@ def extract_features(batcher, ref_file, binpath='minimap2', nthread=3, minlen=29
132143
reflen = len(convert_fasta(handle)[0][1])
133144

134145
for fasta, batch in batcher:
135-
mm2 = minimap2(fasta, ref_file, stream=True, path=binpath, nthread=nthread,
146+
mm2 = minimap2.minimap2(fasta, ref_file, stream=True, path=binpath, nthread=nthread,
136147
minlen=minlen)
137-
result = list(encode_diffs(mm2, reflen=reflen))
148+
result = list(minimap2.encode_diffs(mm2, reflen=reflen))
138149
for row, record in zip(result, batch):
139150
# reconcile minimap2 output with GISAID record
140151
qname, diffs, missing = row
@@ -218,7 +229,8 @@ def filter_problematic(records, origin='2019-12-01', rate=0.0655, cutoff=0.005,
218229

219230
def sort_by_lineage(records, callback=None, interval=10000):
220231
"""
221-
Resolve stream into a dictionary keyed by Pangolin lineage
232+
Resolve stream into a dictionary keyed by Pangolin lineage.
233+
Note: records yielded from generator accumulate in this function.
222234
223235
:param records: generator, return value of extract_features()
224236
:param callback: optional, progress monitoring
@@ -231,14 +243,21 @@ def sort_by_lineage(records, callback=None, interval=10000):
231243
callback('aligned {} records'.format(i))
232244

233245
lineage = record['covv_lineage']
246+
diffs = record.pop('diffs') # REMOVE entry from record!
247+
if diffs is not None:
248+
diffs.sort()
249+
key = ','.join(['|'.join(map(str, diff)) for diff in diffs])
234250

235251
if str(lineage) == "None" or lineage == '':
236252
# discard uncategorized genomes, #324, #335
237253
continue
238254

239255
if lineage not in result:
240-
result.update({lineage: []})
241-
result[lineage].append(record)
256+
result.update({lineage: {}})
257+
if key not in result[lineage]:
258+
result[lineage].update({key: []})
259+
260+
result[lineage][key].append(record)
242261

243262
return result
244263

@@ -288,11 +307,11 @@ def parse_args():
288307

289308
parser.add_argument('--infile', type=str, default=None,
290309
help="input, path to xz-compressed JSON")
291-
parser.add_argument('--url', type=str, default=os.environ["GISAID_URL"],
310+
parser.add_argument('--url', type=str,
292311
help="URL to download provision file, defaults to environment variable.")
293-
parser.add_argument('--user', type=str, default=os.environ["GISAID_USER"],
312+
parser.add_argument('--user', type=str,
294313
help="GISAID username, defaults to environment variable.")
295-
parser.add_argument('--password', type=str, default=os.environ["GISAID_PSWD"],
314+
parser.add_argument('--password', type=str,
296315
help="GISAID password, defaults to environment variable.")
297316

298317
parser.add_argument('--minlen', type=int, default=29000, help='option, minimum genome length')
@@ -316,7 +335,20 @@ def parse_args():
316335
help="Path to VCF file of problematic sites in SARS-COV-2 genome. "
317336
"Source: https://github.com/W-L/ProblematicSites_SARS-CoV2")
318337

319-
return parser.parse_args()
338+
parser.add_argument("--debug", type=int, help="int, limit number of rows of input xz file to parse for debugging")
339+
340+
args = parser.parse_args()
341+
342+
if args.url is None and "GISAID_URL" in os.environ:
343+
args.url = os.environ["GISAID_URL"]
344+
if args.user is None and "GISAID_USER" in os.environ:
345+
args.user = os.environ["GISAID_USER"]
346+
# otherwise download_feed() will prompt for username
347+
if args.password is None and "GISAID_PSWD" in os.environ:
348+
args.password = os.environ["GISAID_PSWD"]
349+
# otherwise download_feed() will prompt for password
350+
351+
return args
320352

321353

322354
if __name__ == '__main__':
@@ -329,7 +361,8 @@ def parse_args():
329361
if args.infile is None:
330362
args.infile = download_feed(args.url, args.user, args.password)
331363

332-
loader = load_gisaid(args.infile, minlen=args.minlen, mindate=args.mindate)
364+
loader = load_gisaid(args.infile, minlen=args.minlen, mindate=args.mindate,
365+
debug=args.debug)
333366
batcher = batch_fasta(loader, size=args.batchsize)
334367
aligned = extract_features(batcher, ref_file=args.ref, binpath=args.binpath,
335368
nthread=args.mmthreads, minlen=args.minlen)

0 commit comments

Comments
 (0)