PoonLab
diff --git a/‎.gitignore
Lines changed: 1 addition & 0 deletions b/‎.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/‎CONTRIBUTING.md
Lines changed: 5 additions & 0 deletions b/‎CONTRIBUTING.md
Lines changed: 5 additions & 0 deletions
diff --git a/‎INSTALL.md
Lines changed: 2 additions & 0 deletions b/‎INSTALL.md
Lines changed: 2 additions & 0 deletions
diff --git a/‎batch.py
Lines changed: 7 additions & 2 deletions b/‎batch.py
Lines changed: 7 additions & 2 deletions
diff --git a/‎covizu/clustering.py
Lines changed: 8 additions & 7 deletions b/‎covizu/clustering.py
Lines changed: 8 additions & 7 deletions
diff --git a/‎covizu/minimap2.py
Lines changed: 1 addition & 1 deletion b/‎covizu/minimap2.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎covizu/treetime.py
Lines changed: 8 additions & 0 deletions b/‎covizu/treetime.py
Lines changed: 8 additions & 0 deletions
diff --git a/‎covizu/utils/batch_utils.py
Lines changed: 39 additions & 2 deletions b/‎covizu/utils/batch_utils.py
Lines changed: 39 additions & 2 deletions
diff --git a/‎covizu/utils/gisaid_utils.py
Lines changed: 46 additions & 13 deletions b/‎covizu/utils/gisaid_utils.py
Lines changed: 46 additions & 13 deletions
@@ -9,5 +9,6 @@ debug/*
 build/*
 dist/*
 venv/*
+node_modules/*
 cypress/fixtures/
 cypress/plugins/
@@ -28,6 +28,11 @@ Development of CoVizu is primarily being carried out on workstations and servers
 However, we have also run the system on desktop computers running macOS 10.13.
 Most web development is tested in Google Chrome.
 
+To set up the development environment:
+1. Navigate to the directory containing `package.json`
+2. Run the command `npm install`
+3. Run the `run-server.sh` script or `npm start`
+
 
 ## Coding style
 
 
@@ -9,6 +9,8 @@
 * [TreeTime](https://github.com/neherlab/treetime) version 0.7.5+
 * [RapidNJ](https://birc.au.dk/software/rapidnj/)
 * [git](https://git-scm.com/)
+* [Node.js](https://nodejs.org/en/download/)
+* [npm](https://docs.npmjs.com/about-npm-versions)
 
 If running locally (without dedicated GISAID feed):
 * [Pangolin](https://github.com/cov-lineages/pangolin/)
 
@@ -169,13 +169,18 @@ def process_feed(args, callback=None):
     # write data stats
     dbstat_file = os.path.join(args.outdir, 'dbstats.{}.json'.format(timestamp))
     with open(dbstat_file, 'w') as handle:
-        nseqs = sum([len(rows) for rows in by_lineage.values()])
+        # total number of sequences
+        nseqs = 0
+        for records in by_lineage.values():
+            for variant in records.values():
+                nseqs += len(variant)  # number of samples
         val = {
             'lastupdate': timestamp.split('T')[0],
             'noseqs': nseqs,
             'lineages': {}
         }
-        for lineage, samples in by_lineage.items():
+        for lineage, records in by_lineage.items():
+            samples = unpack_records(records)
             ndiffs = [len(x['diffs']) for x in samples]
             val['lineages'][lineage] = {
                 'nsamples': len(samples),
 
@@ -21,7 +21,7 @@ def recode_features(records, callback=None, limit=10000):
     Recode feature vectors with integer indices based on set union.
     Pass results to bootstrap() to reconstruct trees by neighbor-joining method.
 
-    :param records:  list, dict for each record
+    :param records:  dict, samples keyed by unique mutation set
     :param callback:  optional, function for progress monitoring
     :param limit:  int, maximum number of variants to prevent memory allocation crashes
     :return:  dict, key-value pairs of all features indexed by integers
@@ -30,16 +30,17 @@ def recode_features(records, callback=None, limit=10000):
     """
     # compress genomes with identical feature vectors
     fvecs = {}
-    for record in records:
-        label = '|'.join([record['covv_virus_name'], record['covv_accession_id'],
-                          record['covv_collection_date']])
-        key = tuple([tuple(x) for x in record['diffs']])
+    for muts, variant in records.items():
+        key = tuple([tuple(x.split('|')) for x in muts.split(',')])
         if key not in fvecs:
             fvecs.update({key: []})
-        fvecs[key].append(label)
+        for sample in variant:
+            label = "{covv_virus_name}|{covv_accession_id}|{covv_collection_date}".format(**sample)
+            fvecs[key].append(label)
 
     # limit to N most recently-sampled feature vectors
-    intermed = [(max([l.split('|')[-1] for l in label]), key) for key, label in fvecs.items()]
+    intermed = [(max([label.split('|')[-1] for label in labels]), key)
+                for key, labels in fvecs.items()]
     intermed.sort(reverse=True)
 
     # generate union of all features
 
@@ -6,7 +6,7 @@
 import json
 
 import covizu
-from covizu.utils import gisaid_utils
+import covizu.utils.gisaid_utils
 
 
 def apply_cigar(seq, rpos, cigar):
 
@@ -191,6 +191,8 @@ def retrieve_genomes(by_lineage, known_seqs, ref_file, earliest=True, callback=N
 
     # retrieve unaligned genomes from database
     for lineage, records in by_lineage.items():
+        records = covizu.utils.batch_utils.unpack_records(records)
+
         # filter records for lineage-defining genomes
         curated = filter(
             lambda r: r['covv_virus_name'].replace('hCoV-19/', '').replace(' ', '_')  # issue #313
@@ -284,6 +286,12 @@ def parse_args():
     cb.callback("Identifying lineage representative genomes")
     fasta = retrieve_genomes(by_lineage, known_seqs=lineages, ref_file=args.ref, earliest=args.earliest,
                              callback=cb.callback)
+    outfile = open("iss385.fasta", 'w')
+    for header, seq in fasta.items():
+        outfile.write(f">{header}\n{seq}\n")
+    outfile.close()
+
+    sys.exit()
 
     cb.callback("Reconstructing tree with {}".format(args.ft2bin))
     nwk = fasttree(fasta, binpath=args.ft2bin)
 
@@ -5,6 +5,38 @@
 import json
 
 
+def unpack_records(records):
+    """
+    Flatten the variants of a single lineage into a list of genome records.
+
+    by_lineage is a nested dict with the inner dicts keyed by serialized
+    mutation sets (diffs): each mutation is written as "typ|pos|alt" and
+    the mutations are joined by commas.  This function reconstitutes the
+    mutations as a list of tuples, restores the 'diffs' entry for each
+    record, and returns a flat list of dicts.
+
+    Notes:
+    - Each sample dict is modified IN PLACE (its 'diffs' entry is set), so
+      the dicts stored in `records` are changed as well.
+    - All samples of a variant share the SAME diffs list object; callers
+      should treat it as read-only.
+    - An empty key (a variant with no differences) yields an empty diffs
+      list instead of raising ValueError.
+
+    Used in:
+    - treetime:retrieve_genomes()
+    - get_mutations()
+    - batch.py, command line interface
+    See issue: https://github.com/PoonLab/covizu/issues/387
+
+    :param records:  dict, sets of genomes under a single lineage classification,
+                     each set keyed by their shared set of mutations (diffs)
+    :return:  list, dicts each representing a genome and its metadata
+    """
+    unpacked = []
+    for key, variant in records.items():
+        # reconstitute the mutations defining this variant
+        diffs = []
+        for mutation in key.split(','):
+            if not mutation:
+                # ''.split(',') returns [''], so a variant without any
+                # mutations would otherwise fail the 3-way unpack below
+                continue
+            typ, pos, alt = mutation.split('|')
+            if typ == '-':
+                alt = int(alt)  # number of nucleotides in indel
+            diffs.append((typ, int(pos), alt))
+
+        for sample in variant:
+            sample['diffs'] = diffs  # shared by every sample of this variant
+            unpacked.append(sample)
+    return unpacked
+
+
 def build_timetree(by_lineage, args, callback=None):
     """ Generate time-scaled tree of Pangolin lineages """
 
@@ -215,7 +247,9 @@ def get_mutations(by_lineage):
     :return:  dict, common mutations by lineage
     """
     result = {}
-    for lineage, samples in by_lineage.items():
+    for lineage, records in by_lineage.items():
+        samples = unpack_records(records)
+
         # enumerate features
         counts = {}
         for sample in samples:
@@ -224,8 +258,11 @@ def get_mutations(by_lineage):
                 if feat not in counts:
                     counts.update({feat: 0})
                 counts[feat] += 1
+
         # filter for mutations that occur in at least half of samples
-        common = dict([(feat, count/len(samples)) for feat, count in counts.items() if count/len(samples) >= 0.5])
+        common = dict([(feat, count/len(samples)) for feat, count in counts.items()
+                       if count/len(samples) >= 0.5])
         result.update({lineage: common})
+
     return result
 
@@ -9,10 +9,13 @@
 import getpass
 
 import covizu
-from covizu.minimap2 import minimap2, encode_diffs
+from covizu import minimap2
+#from covizu.minimap2 import minimap2, encode_diffs
 from covizu.utils.seq_utils import *
 from covizu.utils.progress_utils import Callback
 
+import gc
+
 
 def download_feed(url, user, password):
     """
@@ -23,6 +26,9 @@ def download_feed(url, user, password):
     :param password:  str, access credentials - if None, query user
     :return:  str, path to time-stamped download file
     """
+    if url is None:
+        print("Error: no URL specified in download_feed()")
+        sys.exit()
     if user is None:
         user = getpass.getpass("GISAID username: ")
     if password is None:
@@ -35,7 +41,9 @@ def download_feed(url, user, password):
 
 def load_gisaid(path, minlen=29000, mindate='2019-12-01', callback=None,
                 fields=("covv_accession_id", "covv_virus_name", "covv_lineage",
-                        "covv_collection_date", "covv_location", "sequence")):
+                        "covv_collection_date", "covv_location", "sequence"),
+                debug=None
+):
     """
     Read in GISAID feed as xz compressed JSON, applying some basic filters
 
@@ -44,13 +52,16 @@ def load_gisaid(path, minlen=29000, mindate='2019-12-01', callback=None,
     :param mindate:  datetime.date, earliest reasonable sample collection date
     :param callback:  function, optional callback function
     :param fields:  tuple, fieldnames to keep
+    :param debug:  int, if >0 then limits input JSON for debugging
 
     :yield:  dict, contents of each GISAID record
     """
     mindate = fromisoformat(mindate)
     rejects = {'short': 0, 'baddate': 0, 'nonhuman': 0, 'nolineage': 0}
     with lzma.open(path, 'rb') as handle:
-        for line in handle:
+        for ln, line in enumerate(handle):
+            if debug and ln > debug:
+                break
             record = json.loads(line)
 
             # remove unused data
@@ -132,9 +143,9 @@ def extract_features(batcher, ref_file, binpath='minimap2', nthread=3, minlen=29
         reflen = len(convert_fasta(handle)[0][1])
 
     for fasta, batch in batcher:
-        mm2 = minimap2(fasta, ref_file, stream=True, path=binpath, nthread=nthread,
+        mm2 = minimap2.minimap2(fasta, ref_file, stream=True, path=binpath, nthread=nthread,
                        minlen=minlen)
-        result = list(encode_diffs(mm2, reflen=reflen))
+        result = list(minimap2.encode_diffs(mm2, reflen=reflen))
         for row, record in zip(result, batch):
             # reconcile minimap2 output with GISAID record
             qname, diffs, missing = row
@@ -218,7 +229,8 @@ def filter_problematic(records, origin='2019-12-01', rate=0.0655, cutoff=0.005,
 
 def sort_by_lineage(records, callback=None, interval=10000):
     """
-    Resolve stream into a dictionary keyed by Pangolin lineage
+    Resolve stream into a dictionary keyed by Pangolin lineage.
+    Note: records yielded from generator accumulate in this function.
 
     :param records:  generator, return value of extract_features()
     :param callback:  optional, progress monitoring
@@ -231,14 +243,21 @@ def sort_by_lineage(records, callback=None, interval=10000):
             callback('aligned {} records'.format(i))
 
         lineage = record['covv_lineage']
+        diffs = record.pop('diffs')  # REMOVE entry from record!
+        if diffs is not None:
+            diffs.sort()
+        key = ','.join(['|'.join(map(str, diff)) for diff in diffs])
 
         if str(lineage) == "None" or lineage == '':
             # discard uncategorized genomes, #324, #335
             continue
 
         if lineage not in result:
-            result.update({lineage: []})
-        result[lineage].append(record)
+            result.update({lineage: {}})
+        if key not in result[lineage]:
+            result[lineage].update({key: []})
+
+        result[lineage][key].append(record)
 
     return result
 
@@ -288,11 +307,11 @@ def parse_args():
 
     parser.add_argument('--infile', type=str, default=None,
                         help="input, path to xz-compressed JSON")
-    parser.add_argument('--url', type=str, default=os.environ["GISAID_URL"],
+    parser.add_argument('--url', type=str, 
                         help="URL to download provision file, defaults to environment variable.")
-    parser.add_argument('--user', type=str, default=os.environ["GISAID_USER"],
+    parser.add_argument('--user', type=str, 
                         help="GISAID username, defaults to environment variable.")
-    parser.add_argument('--password', type=str, default=os.environ["GISAID_PSWD"],
+    parser.add_argument('--password', type=str, 
                         help="GISAID password, defaults to environment variable.")
 
     parser.add_argument('--minlen', type=int, default=29000, help='option, minimum genome length')
@@ -316,7 +335,20 @@ def parse_args():
                         help="Path to VCF file of problematic sites in SARS-COV-2 genome. "
                              "Source: https://github.com/W-L/ProblematicSites_SARS-CoV2")
 
-    return parser.parse_args()
+    parser.add_argument("--debug", type=int, help="int, limit number of rows of input xz file to parse for debugging")
+
+    args = parser.parse_args()
+
+    if args.url is None and "GISAID_URL" in os.environ:
+        args.url = os.environ["GISAID_URL"]
+    if args.user is None and "GISAID_USER" in os.environ:
+        args.user = os.environ["GISAID_USER"]
+        # otherwise download_feed() will prompt for username
+    if args.password is None and "GISAID_PSWD" in os.environ:
+        args.password = os.environ["GISAID_PSWD"]    
+        # otherwise download_feed() will prompt for password
+    
+    return args 
 
 
 if __name__ == '__main__':
@@ -329,7 +361,8 @@ def parse_args():
     if args.infile is None:
         args.infile = download_feed(args.url, args.user, args.password)
 
-    loader = load_gisaid(args.infile, minlen=args.minlen, mindate=args.mindate)
+    loader = load_gisaid(args.infile, minlen=args.minlen, mindate=args.mindate,
+                         debug=args.debug)
     batcher = batch_fasta(loader, size=args.batchsize)
     aligned = extract_features(batcher, ref_file=args.ref, binpath=args.binpath,
                                nthread=args.mmthreads, minlen=args.minlen)