Merge pull request #305 from PoonLab/dev

Dev
PoonLab · Jul 9, 2021 · 7e5614c · 7e5614c
2 parents 1a2b182 + 520ad13
commit 7e5614c
Show file tree

Hide file tree

Showing 9 changed files with 97 additions and 89 deletions.
diff --git a/batch.py b/batch.py
@@ -39,7 +39,7 @@ def parse_args():
                              'quantile of Poisson distribution (molecular clock).  Default 0.001 '
                              'corresponds to 99.9%% cutoff.')
 
-    parser.add_argument('--batchsize', type=int, default=500,
+    parser.add_argument('--batchsize', type=int, default=2000,
                         help='option, number of records to batch process with minimap2')
     parser.add_argument('--max-variants', type=int, default=5000,
                         help='option, limit number of variants per lineage (default 5000)')
@@ -49,7 +49,7 @@ def parse_args():
                         help="option, path to FASTA file with reference genome")
     parser.add_argument('--mmbin', type=str, default='minimap2',
                         help="option, path to minimap2 binary executable")
-    parser.add_argument('-mmt', "--mmthreads", type=int, default=8,
+    parser.add_argument('-mmt', "--mmthreads", type=int, default=16,
                         help="option, number of threads for minimap2.")
 
     parser.add_argument('--misstol', type=int, default=300,

diff --git a/covizu/clustering.py b/covizu/clustering.py
@@ -48,13 +48,15 @@ def recode_features(records, callback=None, limit=10000):
     union = {}
     labels = []
     indexed = []
-    for _, fvec in intermed[:limit]:
+    for count, item in enumerate(intermed):
+        fvec = item[1]
         labels.append(fvecs[fvec])
-        for feat in fvec:
-            if feat not in union:
-                union.update({feat: len(union)})
-        # recode feature vectors as integers (0-index)
-        indexed.append(set(union[feat] for feat in fvec))
+        if count < limit:
+            for feat in fvec:
+                if feat not in union:
+                    union.update({feat: len(union)})
+            # recode feature vectors as integers (0-index)
+            indexed.append(set(union[feat] for feat in fvec))
 
     return union, labels, indexed
 

diff --git a/covizu/utils/batch_utils.py b/covizu/utils/batch_utils.py
@@ -33,6 +33,7 @@ def beadplot_serial(lineage, features, args, callback=None):
         intermed = [label.split('|')[::-1] for label in labels[0]]
         intermed.sort()
         variant = intermed[0][1]
+        beaddict.update({'sampled_variants': len(labels)})
         beaddict['nodes'].update({variant: []})
 
         for coldate, accn, label1 in intermed:
@@ -49,6 +50,7 @@ def beadplot_serial(lineage, features, args, callback=None):
     # convert to JSON format
     beaddict = beadplot.serialize_tree(ctree)
     beaddict.update({'lineage': lineage})
+    beaddict.update({'sampled_variants': len(labels)})
     return beaddict
 
 
@@ -82,14 +84,15 @@ def make_beadplots(by_lineage, args, callback=None, t0=None, txtfile='minor_line
     :return:  list, beadplot data by lineage
     """
     # partition lineages into major and minor categories
-    major = set([lineage for lineage, features in by_lineage.items()
-                 if len(features) > args.mincount])
+    intermed = [(len(features), lineage) for lineage, features in by_lineage.items()
+                if len(features) < args.mincount]
+    intermed.sort(reverse=True)  # descending order
+    minor = dict([(lineage, None) for _, lineage in intermed if lineage is not None])
 
     # export minor lineages to text file
     with open(txtfile, 'w') as handle:
-        for lineage in by_lineage:
-            if lineage not in major and lineage is not None:
-                handle.write('{}\n'.format(lineage))
+        for lineage in minor:
+            handle.write('{}\n'.format(lineage))
 
     # launch MPI job across minor lineages
     if callback:
@@ -105,8 +108,9 @@ def make_beadplots(by_lineage, args, callback=None, t0=None, txtfile='minor_line
     subprocess.check_call(cmd)
 
     # process major lineages
-    for lineage in major:
-        features = by_lineage[lineage]
+    for lineage, features in by_lineage.items():
+        if lineage in minor:
+            continue
         if callback:
             callback('start {}, {} entries'.format(lineage, len(features)))
             # call out to MPI
@@ -115,7 +119,7 @@ def make_beadplots(by_lineage, args, callback=None, t0=None, txtfile='minor_line
                 args.bylineage, lineage,  # positional arguments <JSON file>, <str>
                 "--mode", "deep",
                 "--max-variants", str(args.max_variants),
-                "--nboot", str(args.nboot), "--outdir", "data"
+                "--nboot", str(args.nboot), "--outdir", "data", "--binpath", args.binpath
             ]
             if t0:
                 cmd.extend(["--timestamp", str(t0)])
@@ -155,6 +159,7 @@ def make_beadplots(by_lineage, args, callback=None, t0=None, txtfile='minor_line
 
             ctree = beadplot.annotate_tree(ctree, label_dict)
             beaddict = beadplot.serialize_tree(ctree)
+            beaddict.update({'sampled_variants': len(label_dict)})
 
             beaddict.update({'lineage': lineage})
         result.append(beaddict)

diff --git a/covizu/utils/gisaid_utils.py b/covizu/utils/gisaid_utils.py
@@ -213,9 +213,10 @@ def sort_by_lineage(records, callback=None):
         if callback and i % 1000 == 0:
             callback('aligned {} records'.format(i))
         lineage = record['covv_lineage']
-        if lineage not in result:
-            result.update({lineage: []})
-        result[lineage].append(record)
+        if lineage is not None:
+            if lineage not in result:
+                result.update({lineage: []})
+            result[lineage].append(record)
     return result
 
 

diff --git a/data/countries.json b/data/countries.json
@@ -210,7 +210,6 @@
     "Kashgar": "China",
     "Albania": "Europe",
     "Hebei": "China",
-    "Brasil": "South Africa",
     "Urumqi": "China",
     "Shulan": "China",
     "Crimea": "Europe",
@@ -241,4 +240,4 @@
     "Sint Eustatius": "North America",
     "Malawi": "Africa",
     "Eswatini": "Africa"
-}
+}
diff --git a/js/beadplot.js b/js/beadplot.js
@@ -493,6 +493,9 @@ function clear_selection() {
  * @param {Number} cid:  integer index of cluster to draw as beadplot
  */
 function beadplot(cid) {
+  // Update global cindex for SVG and NWK filenames
+  cindex = cid;
+
   var variants = beaddata[cid].variants,
       edgelist = beaddata[cid].edgelist,
       points = beaddata[cid].points;
@@ -872,6 +875,12 @@ function beadplot(cid) {
     var tickCount = 0.005*currentWidth;
     if (tickCount <= 4) tickCount = 4;
 
+    // Ensures that there aren't extra ticks in the time axis of the beadplot when the first and last coldates are within a few days
+    var dateDiff = d3.timeDay.count(
+                    d3.min(variants, function(d) {return d.x1}), 
+                    d3.max(variants, function(d) {return d.x2}));
+    if (dateDiff !=0 && dateDiff < tickCount) tickCount = dateDiff;
+
     // draw x-axis
     visBaxis.append("g")
         .attr("class", "treeaxis")

diff --git a/js/covizu.js b/js/covizu.js
@@ -572,12 +572,18 @@ function save_beadplot() {
 
 function export_svg() {
   var config = {filename: clusters[cindex].lineage};
-  var svg_beadplot = d3.select('#svg-cluster>svg').node();
-  d3_save_svg.save(svg_beadplot, config);
-  svg_beadplot.removeAttribute("style");
-  svg_beadplot.removeAttribute("version");
-  svg_beadplot.removeAttribute("xmlns");
-  svg_beadplot.removeAttribute("xmlns:xlink");
+
+  // Creates a duplicate of the beadplot
+  var svg_beadplot = d3.select('#svg-cluster>svg').clone(true);
+  var svg_axis = d3.select('#svg-clusteraxis>svg').clone(true);
+  svg_beadplot.select("g").attr("transform", "translate(0, 30)");
+  svg_axis.node().appendChild(svg_beadplot.selectAll("g").node())
+  svg_axis.attr("height", svg_beadplot.attr("height"))
+
+  d3_save_svg.save(svg_axis.node(), config);
+
+  svg_axis.remove();
+  svg_beadplot.remove()
 }
 
 function export_csv() {

diff --git a/js/drawtree.js b/js/drawtree.js
@@ -1,4 +1,4 @@
-var margin = {top: 10, right: 20, bottom: 10, left: 0},
+var margin = {top: 10, right: 40, bottom: 10, left: 0},
   width = 250 - margin.left - margin.right,
   height = 1200 - margin.top - margin.bottom;
 
@@ -12,10 +12,8 @@ var xValue = function(d) { return d.x; },
   xMap2 = function(d) { return xScale(d.x2); },
   xWide = function(d) { return xScale(d.x2 - d.x1)};
 
-// Scale to plot rects and axis using Dates
-var xAxisTree = d3.scaleTime().range([0, width]),
-  minRectWidth = 7,
-  firstDate, lastDate;
+// define minimum width for rect elements (required in instances where first and last coldates are the same)
+var minRectWidth = 7;
 
 var yValue = function(d) { return d.y; },
   yScale = d3.scaleLinear().range([height, 40]),  // inversion
@@ -46,19 +44,17 @@ let cTooltip = d3.select("#tooltipContainer")
  */
 function draw_cluster_box(rect) {
   var d = rect.datum();
-  var rectWidth = xAxisTree(d.last_date) - xAxisTree(d.first_date);
+  var rectWidth = xScale(date_to_xaxis(d.last_date)) - xScale(d.x2);
+
   // draw a box around the cluster rectangle
   vis.append("rect")
     .attr('class', "clickedH")
-    .attr("x", function() {
-        return xAxisTree(d.first_date) - 2
-    })
+    .attr("x", xMap2(d) - 2)
     .attr("y", yMap(d) - 2)
     .attr("width", function() {
       if (rectWidth < minRectWidth)
         return minRectWidth + 4
-      else
-        return (xAxisTree(d.last_date) - xAxisTree(d.first_date)) + 4
+      return xScale(date_to_xaxis(d.last_date)) - xScale(d.x2) + 4
     })
     .attr("height", 14)
     .attr("fill", "white")
@@ -146,29 +142,14 @@ function drawtree(df) {
     d3.min(df, yValue)-1, d3.max(df, yValue)+1
   ]);
 
-  firstDate = d3.min(df, function(d) {return d.first_date});
-  firstDate = d3.timeDay.offset(firstDate, -70)
-  lastDate = d3.max(df, function(d) {return d.last_date});
-  lastDate = d3.timeDay.offset(lastDate, 60);
-  xAxisTree.domain([firstDate, lastDate]);
-
   // draw lines
   vis.selectAll("lines")
     .data(edgeset)
     .enter().append("line")
     .attr("class", "lines")
-    .attr("x1", function(d) {
-      if (d.first_date!==undefined && d.y1 === d.y2) {
-        return xAxisTree(d.first_date);
-      }
-
-      // Moves the time scaled tree to the left
-      return xScale(d.x1) - 7.2
-    })
+    .attr("x1", xMap1)
     .attr("y1", yMap1)
-    .attr("x2", function(d) {
-      return xScale(d.x2) - 7.2
-    })
+    .attr("x2", xMap2)
     .attr("y2", yMap2)
     .attr("stroke-width", 1.5)
     .attr("stroke", "#777");
@@ -271,12 +252,24 @@ function map_clusters_to_tips(df, clusters) {
  * @returns {string}  new date in ISO format (yyyy-mm-dd)
  */
 function xaxis_to_date(x, tip) {
-  var coldate = new Date(tip.coldate);  // collection date of reference tip
-  coldate.setDate(coldate.getDate() + 365.25*(x - tip.x));
+  var coldate = new Date(tip.first_date);  // collection date of reference tip
+  coldate = d3.timeDay.offset(coldate, 365.25*(x - tip.x));
   return (coldate.toISOString().split('T')[0]);
 }
 
 
+/**
+ * Converts Date to x-coordinate of the tree scale
+ * 
+ * @param {Date} coldate
+ * @returns {float} x-coordinate value 
+ */
+function date_to_xaxis(coldate) {
+  var numDays = d3.timeDay.count(tips[0].first_date, coldate)
+  return (numDays/365.25) + tips[0].x;
+}
+
+
 function mutations_to_string(mutations) {
   let mutStr = `<b>${i18n_text.tip_mutations}:</b><br/>`;
   for (mutation of mutations) {
@@ -291,29 +284,16 @@ function mutations_to_string(mutations) {
  */
 function draw_clusters(tips) {
 
-  // numTicks + 1 ticks for the time-scaled tree
-  var numTicks = 3,
-      stepValue = Math.floor(d3.timeDay.count(firstDate, lastDate) / numTicks),
-      currStep = stepValue,
-      tickValues = [];
-
-  // Sets the values of the ticks
-  tickValues.push(firstDate);
-
-  for (i = 0 ; i < numTicks; i++) {
-    tickValues.push(d3.timeDay.offset(firstDate, currStep));
-    currStep += stepValue;
-  }
-
   // Draws the axis for the time scaled tree
   axis.append("g")
     .attr("class", "treeaxis")
     .attr("transform", "translate(0,20)")
-    .call(
-      d3.axisTop(xAxisTree)
-        .tickValues(tickValues)
-        .tickFormat(d3.timeFormat("%Y-%m-%d"))
-      );
+    .call(d3.axisTop(xScale)
+      .ticks(3)
+      .tickFormat(function(d) {
+        return xaxis_to_date(d, tips[0])
+      })
+    );
 
   function mouseover(d) {
     d3.select("[cidx=cidx-" + d.cluster_idx + "]")
@@ -349,16 +329,13 @@ function draw_clusters(tips) {
     .lower()
     .append("rect")
     //.attr("selected", false)
-    .attr("x", function(d) { 
-        return xAxisTree(d.first_date)
-    })
+    .attr("x", xMap2)
     .attr("y", yMap)
-    .attr("width", function(d) { 
-      var rectWidth = xAxisTree(d.last_date) - xAxisTree(d.first_date);
+    .attr("width", function(d) {
+      var rectWidth = xScale(date_to_xaxis(d.last_date)) - xScale(d.x2);
       if (rectWidth < minRectWidth)
-        return minRectWidth
-      else
-        return rectWidth 
+        return minRectWidth;
+      return rectWidth;
     })
     .attr("height", 10)
     .attr("class", "default")
@@ -408,10 +385,10 @@ function draw_clusters(tips) {
       .attr("cursor", "default")
       .attr("id", function(d) { return "cidx-" + d.cluster_idx; })
       .attr("x", function(d) {
-        if (xAxisTree(d.last_date) - xAxisTree(d.first_date) < minRectWidth)
-          return(xAxisTree(d.first_date) + minRectWidth + 3);
-
-        return(xAxisTree(d.last_date) + 3);
+        var rectWidth = xScale(date_to_xaxis(d.last_date)) - xScale(d.x2);
+        if (rectWidth < minRectWidth)
+          return xScale(d.x2) + minRectWidth + 3;
+        return xScale(d.x2) + rectWidth + 3;
       })
       .attr("y", function(d) {
         return(yScale(d.y-0.15));

diff --git a/local.py b/local.py
@@ -9,6 +9,7 @@
 from covizu.utils import seq_utils, gisaid_utils
 from covizu.utils.progress_utils import Callback
 from covizu.utils.batch_utils import *
+from covizu.utils.seq_utils import SC2Locator
 
 
 def parse_args():
@@ -191,6 +192,13 @@ def process_local(args, callback=None):
     outfile.write(json.dumps(result))  # serialize results to JSON
     outfile.close()
 
+    # get mutation info
+    locator = SC2Locator()
+    mutations = {}
+    for lineage, features in get_mutations(by_lineage).items():
+        annots = [locator.parse_mutation(f) for f in features]
+        mutations.update({lineage: [a for a in annots if a is not None]})
+
     # write data stats
     dbstat_file = os.path.join(args.outdir, 'dbstats.{}.json'.format(timestamp))
     with open(dbstat_file, 'w') as handle:
@@ -207,7 +215,8 @@ def process_local(args, callback=None):
                 'lastcoldate': max(x['covv_collection_date'] for x in samples),
                 'residual': residuals[lineage],
                 'max_ndiffs': max(ndiffs),
-                'mean_ndiffs': sum(ndiffs)/len(ndiffs)
+                'mean_ndiffs': sum(ndiffs)/len(ndiffs),
+                'mutations': mutations[lineage]
             }
         json.dump(val, handle)