+*/
\ No newline at end of file
From 519006b4e4b2487889f10fa262cd2aedf110536f Mon Sep 17 00:00:00 2001
From: DLiarakos <93622613+DLiarakos@users.noreply.github.com>
Date: Mon, 20 May 2024 12:55:29 -0700
Subject: [PATCH 3/4] Added backend workers
Added two files, nodeMapper.js and extract.js, which are used in generating the output for the SNP distance search. nodeMapper makes a map of the entire tree in order to perform downwards traversal, and extract does the actual traversal and calculation of SNP distance.
---
taxonium_component/src/utils/extract.js | 119 +++++++++++++++
taxonium_component/src/utils/nodeMapper.js | 162 +++++++++++++++++++++
2 files changed, 281 insertions(+)
create mode 100644 taxonium_component/src/utils/extract.js
create mode 100644 taxonium_component/src/utils/nodeMapper.js
diff --git a/taxonium_component/src/utils/extract.js b/taxonium_component/src/utils/extract.js
new file mode 100644
index 00000000..528f88dd
--- /dev/null
+++ b/taxonium_component/src/utils/extract.js
@@ -0,0 +1,119 @@
+import processJsonLines from './nodeMapper.js';
+
+/*
+getParsimonySamples function outline:
+inputs:
+ sampleID, which is the explicit node or sample name, not internal ID
+ maxParsimony, which is the SNP distance threshold of interest
+ Nested functions:
+ processJsonLines: main worker of the backend, which reads in the jsonl file and constructs a map of all internal nodes, children, and mutations, and checks if the sample exists in the tree
+ findNodesWithinDistance: a helper function to find all nodes within a certain distance of a given node
+ traverses up and down the tree, adding nodes to a results array if they are within the distance threshold
+ returns the results array
+ traverseUp: helper function to traverse up the tree, adding nodes to the results array if they are within the distance threshold
+ traverseDown: helper function to traverse down the tree, adding nodes to the results array if they are within the distance threshold
+outputs: should output a simple list/array-like of internal IDs and their SNP distances from the queried node, some flags that determine whether or not a valid search was performed, as well as a map of each nodes name with genbank accession and pangolin lineage
+with this list as a result, we can then query the backend for more information about each node, like name, mutations, etc, if needed
+ this above includes if specificMut is passed through, which would allow for filtering based on whether or not a node has a specific mutation,
+ but the filtering is done after all SNPs within distance are found, to reduce processing time if flag isnt specified
+once the list is obtained, snpComponent formats the list for output into Taxonium(Big step)
+*/
+async function getParsimonySamples(sampleID, maxParsimony) {
+ return processJsonLines("https://cov2tree.nyc3.cdn.digitaloceanspaces.com/latest_public.jsonl.gz",sampleID).then(myResult => {//answersArray=[nodes, foundSample, foundSampleID, foundParentID, foundSNPCount, isBranch]
+ if (myResult==="Error parsing JSON"){//if error parsing JSON, return error
+ return "Error parsing JSON";
+ }
+ var nodeMap=myResult[0]//index of all internal nodes and children
+ // Main function to find all nodes within a certain distance of a given node
+ function findNodesWithinDistance(node, distanceThreshold) {
+ // Helper function to traverse up (towards the parent)
+ function traverseUp(node, currentDistance) {
+ var parent_id=nodeMap[node].parent_id
+ var snpCount=nodeMap[node].snpCount
+ if (parent_id===node || currentDistance > distanceThreshold) {//if root node(root has itself as parent), or if threshold is reached,
+ //console.log("reached root node or threshold, returning at distance "+currentDistance+" from node "+node+" with parent "+parent_id+" and snpCount "+snpCount)
+ return;//end traversal
+ }
+ //console.log("traversing up, new node is "+parent_id+" with distance "+(currentDistance + snpCount))
+ if (!visited.has(parent_id)) {// Check if this node has already been visited to avoid infinite loops
+ visited.add(parent_id);
+ traverseDown(parent_id, currentDistance + snpCount);//Traverse down from the parent
+ traverseUp(parent_id, currentDistance + snpCount);// Traverse further up
+ }
+ }
+ // Helper function to traverse down (towards the children)
+ function traverseDown(node, currentDistance) {
+ if (!nodeMap[node]|| currentDistance > distanceThreshold) {return;}//if node is a leaf node, or it threshold is reached, return
+ for (const child of nodeMap[node].children) {// Traverse all children
+ let decodedChild=child.split("=")//split encoded child into internal ID and SNP distance
+ let childId=decodedChild[0]//get internal ID of child
+ let childSnpDist=parseInt(decodedChild[1])//get SNP distance of child
+ let newTotal=currentDistance+childSnpDist//add SNP distance of child to current distance
+ //console.log("traversing down, new node is "+childId+" with distance "+childSnpDist+" for new total "+newTotal)
+ if (!visited.has(childId)&&!visited.has(decodedChild[3])) {//need a switch to add childs as genbank accession or node ID, since some sample names are repeated
+ if (childId.match(/^\d+$/)){//if its just numbers, its an internal node, so we add it to visited as is
+ visited.add(childId);
+ }
+ else {visited.add(decodedChild[3]);}//if its not just numbers, its a leaf node, so we add the genbank accession to visited
+ if ((newTotal <= distanceThreshold)){ //dont add the root node, as its always going to be within SNP distance of itself
+ //console.log("adding node to results:"+childId+" with distance "+newTotal)
+ if (!nodeMap[childId]){//if its not an entry in node map, means its not an internal node, so we add it to the results
+ //console.log("adding node to results:"+decodedChild)
+ results.push([decodedChild[0], newTotal, decodedChild[2], decodedChild[3]]);
+ }
+ }
+ if (nodeMap[childId]){//if the child is an internal node, traverse down
+ traverseDown(childId,newTotal);// Traverse further down; pass ID, not node info itself
+ }
+ }
+ }
+ }
+
+ // Start of the main function
+ //boolean obtained during traversal of the whole tree; if the queried sample exists in taxonium, this will flag as true
+ if (!myResult[1]) {//if boolean is falsey
+ console.log("Node not found in the tree");//its not a valid node, return error statement
+ return "Node not found in the tree";
+ }
+
+ let visited = new Set(); // To keep track of visited nodes
+ let results = []; // To store nodes within the distance threshold
+ visited.add(myResult[2]); //add ID of queried sample to visited
+ if (myResult[5]){//if the node is an internal node
+ traverseDown(myResult[2], 0);//start traversal from the internal node, we have a neutral distance of 0
+ traverseUp(myResult[2], 0);
+ }
+ else{
+ traverseDown(myResult[3], myResult[4]);
+ traverseUp(myResult[3], myResult[4]);
+
+ }
+ //internal ID of the queried sample
+ // Traverse as far down as possible first, then go up, and traverse down again ignoring visited nodes
+ return results;
+ }
+
+ let goodSamples = findNodesWithinDistance(sampleID, maxParsimony)
+ nodeMap=null;
+ return goodSamples
+ })
+ .catch(error => {
+ // Catch any errors from processJsonLines or thrown in the then block
+ console.error('Error in getParsimonySamples:', error);
+ return "Error processing samples";
+ });
+}
+/*
+getParsimonySamples("node_960478", 5)
+ .then(result => {
+ console.log("Results:", result);
+ })
+ .catch(error => {
+ console.error("Error processing samples:", error);
+ });
+*/
+export default getParsimonySamples;
+
+/*
+NOTES:
+*/
diff --git a/taxonium_component/src/utils/nodeMapper.js b/taxonium_component/src/utils/nodeMapper.js
new file mode 100644
index 00000000..14ac8c16
--- /dev/null
+++ b/taxonium_component/src/utils/nodeMapper.js
@@ -0,0 +1,162 @@
+/*
+TODO:
+*/
+
+async function processJsonLines(url,sampleID) {
+ // Fetch the gzipped JSONL file
+ //const startTime = new Date(); // Start timing
+ const response = await fetch(url);
+
+ // Ensure the fetch was successful
+ if (!response.ok) {
+ throw new Error(`HTTP error! status: ${response.status}`);
+ }
+
+ // Stream the response through decompression and decoding
+ const decompressedStream = response.body.pipeThrough(new DecompressionStream('gzip'));
+ const textStream = decompressedStream.pipeThrough(new TextDecoderStream());
+
+ // Reader to read the stream line by line
+ const reader = textStream.getReader();
+ let remainder = '';
+ let result;
+ let nodes = {};
+ let foundSample=false;//we will be looking for a specific ID when we construct
+ let foundSampleID=""
+ let foundParentID=""
+ let foundSNPCount=0
+ let isBranch=false
+ while (!(result = await reader.read()).done) {
+ const chunk = remainder + result.value;
+ const lines = chunk.split('\n');
+ remainder = lines.pop(); // Save the last line in case it's incomplete
+ for (const line of lines) {
+ if (line) {
+ var snpCount=0;
+ try {
+ const json = JSON.parse(line);
+ if (json.config){//if line has the config file, skip it to avoid an error
+ continue;//this first line also has mutations dictionary for decoding, if we need that later
+ }
+ for (const mut of json.mutations){
+ if (mut>107435){
+ snpCount+=1;
+ }
+ }
+ if (json.name===sampleID){//check if this is the sample we will be searching for
+ foundSample=true;//if it is, we have found it
+ foundSampleID=json.node_id//store its ID so we can use it later
+ foundParentID=json.parent_id//need to get parent ID of first node as a jumping off point for internal nodes, since theyre not being stored
+ foundSNPCount=snpCount
+ if (json.name.includes("node_")) {
+ isBranch=true
+ }
+ //console.log(json)
+ }
+
+ if (json.name.includes("node_")) { // Check if the node is internal
+ var encodedChild=(String(json.node_id)+"="+String(snpCount))//encode child and snp count without further nesting, as trying to store them as separate objects causes Stringify error due to excessive nesting
+ if (!nodes[json.node_id]) {//if internal, but not added to list
+ nodes[json.node_id] = {//create new node
+ parent_id: json.parent_id,
+ snpCount: snpCount,
+ children: []
+ };
+ if (!nodes[nodes[json.node_id].parent_id]){//if the parent is not yet added to the list,
+ nodes[nodes[json.node_id].parent_id] = {// add it to the list, with null name and parent, since we wont have that info until we read in parent node
+ parent_id: null,
+ snpCount: null,
+ children: [encodedChild]//store the node ID and the number of mutations
+ };
+ }
+ else{
+ nodes[nodes[json.node_id].parent_id].children.push(encodedChild);// if the parent node has been added, add this node to its children
+ }
+ }
+ if(nodes[json.node_id] && (nodes[json.node_id].parent_id===null || nodes[json.node_id].name===null)){//if we have added this parent node previously, but finally come across in JSON
+ //console.log("Node ID being updated:"+json.name)
+ nodes[json.node_id].parent_id=json.parent_id;//fill in the parent ID
+ nodes[json.node_id].snpCount=snpCount;//fill in the snp count
+ if (!nodes[nodes[json.node_id].parent_id]){//if this node, which was added by a previous step and therefore does not flag new internal step above, has a parent that has not been added to the list
+ nodes[nodes[json.node_id].parent_id] = {// so add it
+ parent_id: null,
+ snpCount: null,
+ children: [encodedChild]//store the node ID and the number of mutations
+ };
+ }
+ else{
+ nodes[nodes[json.node_id].parent_id].children.push(encodedChild);// if the parent node has been added, add this node to its children
+ }
+ }
+ }
+ else {// if doesnt contain "node_", then its a leaf node
+ encodedChild=(String(json.name)+"="+String(snpCount)+"="+String(json.meta_pangolin_lineage)+"="+String(json.meta_genbank_accession))//encode child and snp count without further nesting, as trying to store them as separate objects causes Stringify error due to excessive nesting
+ if (!nodes[json.parent_id]) {//we dont track leaf nodes, so if parent node is not in list, add it
+ nodes[json.parent_id] = {//add line which fills in these null values when we read in the parent node
+ parent_id: null,
+ snpCount: null,
+ children: [encodedChild]
+ };
+ } else {
+
+ nodes[json.parent_id].children.push(encodedChild);//if parent node is in list, add this node to its children
+ }
+ }
+ } catch (e) {
+ console.error('Error parsing JSON:', e);
+ return "Error parsing JSON"
+ }
+ }
+ }
+ }
+
+
+ var answersArray=[nodes, foundSample, foundSampleID, foundParentID, foundSNPCount, isBranch]
+ return answersArray;
+ }
+/*
+processJsonLines('https://cov2tree.nyc3.cdn.digitaloceanspaces.com/latest_public.jsonl.gz', "node_3").then(result => {
+ let sliced = Object.fromEntries(Object.entries(result[0][0]).slice(0,3))//get first 3 entries
+ console.log("First 3 entries: ",sliced)
+ //saveObjectToJson(result[0], 'C:/Users/david/my-app/src/InternalNodeMap.json');
+})
+.catch(error => {
+ console.error("Error processing samples:", error);
+});
+function saveObjectToJson(dataObject, outputPath) {
+ const fs = require('fs');
+ const JSONStream = require('JSONStream');
+ return new Promise((resolve, reject) => {
+ const writeStream = fs.createWriteStream(outputPath);
+ const stringifyStream = JSONStream.stringifyObject();
+ stringifyStream.pipe(writeStream);
+
+ writeStream.on('finish', () => {
+ console.log('JSON file has been written successfully.');
+ resolve();
+ });
+
+ writeStream.on('error', (error) => {
+ console.error('Stream write error:', error);
+ reject(error);
+ });
+
+ stringifyStream.on('error', (error) => {
+ console.error('JSON stringify error:', error);
+ reject(error);
+ });
+
+ for (const key in dataObject) {
+ stringifyStream.write([key, dataObject[key]]);
+ }
+ stringifyStream.end();
+ });
+}
+*/
+export default processJsonLines;
+
+// Usage example
+//at ~2gb of ram, 4.2ghz with 6 cores, a little under 60sec when reading from url
+//time to write to file is more extensive, but ideally not a factor if its happening in the backend
+//time to query backend for single node: ~0.6s
+//time to add snp dist when reading is negligible
\ No newline at end of file
From 06ca0706f230b276feced2d0b463908aa5d5e274 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
<66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 20 May 2024 20:19:29 +0000
Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---
.../src/components/SearchPanel.jsx | 35 +-
.../src/components/snpModal.jsx | 604 +++++++++++-------
taxonium_component/src/utils/extract.js | 260 ++++----
taxonium_component/src/utils/nodeMapper.js | 356 ++++++-----
4 files changed, 721 insertions(+), 534 deletions(-)
diff --git a/taxonium_component/src/components/SearchPanel.jsx b/taxonium_component/src/components/SearchPanel.jsx
index 5cb8836c..10adb9cf 100644
--- a/taxonium_component/src/components/SearchPanel.jsx
+++ b/taxonium_component/src/components/SearchPanel.jsx
@@ -82,7 +82,7 @@ function SearchPanel({
setShowSNPButton(true);
}
}, []);
-
+
const handleDownloadJson = () => {
if (selectedDetails.nodeDetails) {
const node_id = selectedDetails.nodeDetails.node_id;
@@ -278,7 +278,6 @@ function SearchPanel({
<>
Displaying {formatNumber(config.num_tips)}{" "}
{config.tipPluralNoun ? config.tipPluralNoun : "sequences"}
-
{config.source && ` from ${config.source}`}
>
)}
@@ -429,22 +428,22 @@ function SearchPanel({
Add a new search
- {showSNPButton && (
-
- )}
- {showSNPButton && (
-
- )}
+ {showSNPButton && (
+
+ )}
+ {showSNPButton && (
+
+ )}
{selectedDetails.nodeDetails && (
diff --git a/taxonium_component/src/components/snpModal.jsx b/taxonium_component/src/components/snpModal.jsx
index ef8c43b2..9a867264 100644
--- a/taxonium_component/src/components/snpModal.jsx
+++ b/taxonium_component/src/components/snpModal.jsx
@@ -1,235 +1,369 @@
-import Modal from "react-modal";
-import { useState} from "react";
-import getParsimonySamples from "../utils/extract.js";
-
-/*
-Testing Search: node_960478, 5
- -should return 97 results, exluding internal nodes
-Germany/IMS-10245-CVDP-57CCA4FC-E286-4E50-AB0F-727CDF76B7BF/2022, 4
-should return 0
-TODO:
-Additional Features to maybe include:
-Maybe make each element in the drop down a clickable element to select the node in the SearchPanel
-a csv download option for the output
-*/
-async function getSNPneighbors(nodeId, integerValue, callback) {
- // Dummy backend function
- try {
- let results= await getParsimonySamples(nodeId, integerValue);
- callback(null, results);
- } catch (err) {
- callback(err, null);
- }
-}
-
-const SNPOutputModal = ({
- snpOutputModalOpen,
- setSnpOutputModalOpen,
-}) => {
- const [nodeId, setNodeId] = useState("");
- const [integerValue, setIntegerValue] = useState("");
- const [fullOutput, setFullOutput] = useState([]);
- const [displayOutput, setDisplayOutput] = useState([]);
- const [remainingCount, setRemainingCount] = useState(0);
- const [loading, setLoading] = useState(false);
- const [JsonError, setJsonError] = useState(false);
- const [MissingNoError, setMissingError] = useState(false);
- const [ParsimonyError, setParsimonyError] = useState(false);
- const [emptyReturn, setEmpty] = useState(false);
-
- const handleSearch = () => {
- if (nodeId && integerValue) {
- setLoading(true);
- setFullOutput([]);
- setDisplayOutput([]);
- setRemainingCount(0);
- setJsonError(false);
- setMissingError(false);
- setParsimonyError(false);
- setEmpty(false);
- const startTime = new Date();
- getSNPneighbors(nodeId, integerValue, (err, res) => {
- if (err) {
- console.log(err);
- }
- else if(res==="Error parsing JSON") {
- setJsonError(true);
- }
- else if(res==="Node not found in the tree") {
- setMissingError(true);
- }
- else if(res==="Error parsing JSON") {
- setParsimonyError(true);
- }
- else if (res.length === 0) {
- setEmpty(true);
- }
- else {
- setFullOutput(res);
- setDisplayOutput(res.slice(0, 100));
- setRemainingCount(res.length > 100 ? res.length - 100 : 0);
- const endTime = new Date(); // End timing
- const timeDiff = endTime - startTime; // Time in milliseconds
- console.log(`Time taken: ${timeDiff} ms`);
- }
- setLoading(false);
- });
- }
- };
- const handleCloseModal = () => {
- // Reset all state variables on modal close
- setSnpOutputModalOpen(false);
- setNodeId("");
- setIntegerValue("");
- setFullOutput([]);
- setDisplayOutput([]);
- setRemainingCount(0);
- setLoading(false);
- setJsonError(false);
- setMissingError(false);
- setParsimonyError(false);
- setEmpty(false);
- };
- const convertToTSV = (data) => {
- const header = "Sample Name\tDistance\tPANGO Lineage\tGenbank Accession\n";
- const rows = data.map(row =>
- `${row[0]}\t${row[1]}\t${row[2]}\t${row[3]}`
- ).join('\n');
- return header + rows;
- };
- const downloadTSV = (data) => {
- const tsvString = convertToTSV(data);
- const blob = new Blob([tsvString], { type: 'text/tab-separated-values' });
- const href = URL.createObjectURL(blob);
- const link = document.createElement('a');
- link.href = href;
- link.download = `${nodeId}_SNP${integerValue}.tsv`;
- document.body.appendChild(link);
- link.click();
- document.body.removeChild(link);
- URL.revokeObjectURL(href);
- };
- return (
-
-
-
SNP Distance Search
-
-
-
-
-
-
-
-
-
- {loading &&
Loading data. This may take a minute...
}
- {!loading && JsonError &&
Error parsing JSON.
}
- {!loading && MissingNoError &&
Node not found in the tree.
}
- {!loading && ParsimonyError &&
Error traversing tree.
}
- {!loading && emptyReturn &&
No results found; check your sample ID or change SNP distance.
+*/
diff --git a/taxonium_component/src/utils/extract.js b/taxonium_component/src/utils/extract.js
index 528f88dd..a235701d 100644
--- a/taxonium_component/src/utils/extract.js
+++ b/taxonium_component/src/utils/extract.js
@@ -1,119 +1,141 @@
-import processJsonLines from './nodeMapper.js';
-
-/*
-getParsimonySamples function outline:
-inputs:
- sampleID, which is the explicit node or sample name, not internal ID
- maxParsimony, which is the SNP distance threshold of interest
- Nested functions:
- processJsonLines: main worker of the backend, which reads in the jsonl file and constructs a map of all internal nodes, children, and mutations, and checks if the sample exists in the tree
- findNodesWithinDistance: a helper function to find all nodes within a certain distance of a given node
- traverses up and down the tree, adding nodes to a results array if they are within the distance threshold
- returns the results array
- traverseUp: helper function to traverse up the tree, adding nodes to the results array if they are within the distance threshold
- traverseDown: helper function to traverse down the tree, adding nodes to the results array if they are within the distance threshold
-outputs: should output a simple list/array-like of internal IDs and their SNP distances from the queried node, some flags that determine whether or not a valid search was performed, as well as a map of each nodes name with genbank accession and pangolin lineage
-with this list as a result, we can then query the backend for more information about each node, like name, mutations, etc, if needed
- this above includes if specificMut is passed through, which would allow for filtering based on whether or not a node has a specific mutation,
- but the filtering is done after all SNPs within distance are found, to reduce processing time if flag isnt specified
-once the list is obtained, snpComponent formats the list for output into Taxonium(Big step)
-*/
-async function getParsimonySamples(sampleID, maxParsimony) {
- return processJsonLines("https://cov2tree.nyc3.cdn.digitaloceanspaces.com/latest_public.jsonl.gz",sampleID).then(myResult => {//answersArray=[nodes, foundSample, foundSampleID, foundParentID, foundSNPCount, isBranch]
- if (myResult==="Error parsing JSON"){//if error parsing JSON, return error
- return "Error parsing JSON";
- }
- var nodeMap=myResult[0]//index of all internal nodes and children
- // Main function to find all nodes within a certain distance of a given node
- function findNodesWithinDistance(node, distanceThreshold) {
- // Helper function to traverse up (towards the parent)
- function traverseUp(node, currentDistance) {
- var parent_id=nodeMap[node].parent_id
- var snpCount=nodeMap[node].snpCount
- if (parent_id===node || currentDistance > distanceThreshold) {//if root node(root has itself as parent), or if threshold is reached,
- //console.log("reached root node or threshold, returning at distance "+currentDistance+" from node "+node+" with parent "+parent_id+" and snpCount "+snpCount)
- return;//end traversal
- }
- //console.log("traversing up, new node is "+parent_id+" with distance "+(currentDistance + snpCount))
- if (!visited.has(parent_id)) {// Check if this node has already been visited to avoid infinite loops
- visited.add(parent_id);
- traverseDown(parent_id, currentDistance + snpCount);//Traverse down from the parent
- traverseUp(parent_id, currentDistance + snpCount);// Traverse further up
- }
- }
- // Helper function to traverse down (towards the children)
- function traverseDown(node, currentDistance) {
- if (!nodeMap[node]|| currentDistance > distanceThreshold) {return;}//if node is a leaf node, or it threshold is reached, return
- for (const child of nodeMap[node].children) {// Traverse all children
- let decodedChild=child.split("=")//split encoded child into internal ID and SNP distance
- let childId=decodedChild[0]//get internal ID of child
- let childSnpDist=parseInt(decodedChild[1])//get SNP distance of child
- let newTotal=currentDistance+childSnpDist//add SNP distance of child to current distance
- //console.log("traversing down, new node is "+childId+" with distance "+childSnpDist+" for new total "+newTotal)
- if (!visited.has(childId)&&!visited.has(decodedChild[3])) {//need a switch to add childs as genbank accession or node ID, since some sample names are repeated
- if (childId.match(/^\d+$/)){//if its just numbers, its an internal node, so we add it to visited as is
- visited.add(childId);
- }
- else {visited.add(decodedChild[3]);}//if its not just numbers, its a leaf node, so we add the genbank accession to visited
- if ((newTotal <= distanceThreshold)){ //dont add the root node, as its always going to be within SNP distance of itself
- //console.log("adding node to results:"+childId+" with distance "+newTotal)
- if (!nodeMap[childId]){//if its not an entry in node map, means its not an internal node, so we add it to the results
- //console.log("adding node to results:"+decodedChild)
- results.push([decodedChild[0], newTotal, decodedChild[2], decodedChild[3]]);
- }
- }
- if (nodeMap[childId]){//if the child is an internal node, traverse down
- traverseDown(childId,newTotal);// Traverse further down; pass ID, not node info itself
- }
- }
- }
- }
-
- // Start of the main function
- //boolean obtained during traversal of the whole tree; if the queried sample exists in taxonium, this will flag as true
- if (!myResult[1]) {//if boolean is falsey
- console.log("Node not found in the tree");//its not a valid node, return error statement
- return "Node not found in the tree";
- }
-
- let visited = new Set(); // To keep track of visited nodes
- let results = []; // To store nodes within the distance threshold
- visited.add(myResult[2]); //add ID of queried sample to visited
- if (myResult[5]){//if the node is an internal node
- traverseDown(myResult[2], 0);//start traversal from the internal node, we have a neutral distance of 0
- traverseUp(myResult[2], 0);
- }
- else{
- traverseDown(myResult[3], myResult[4]);
- traverseUp(myResult[3], myResult[4]);
-
- }
- //internal ID of the queried sample
- // Traverse as far down as possible first, then go up, and traverse down again ignoring visited nodes
- return results;
- }
-
- let goodSamples = findNodesWithinDistance(sampleID, maxParsimony)
- nodeMap=null;
- return goodSamples
- })
- .catch(error => {
- // Catch any errors from processJsonLines or thrown in the then block
- console.error('Error in getParsimonySamples:', error);
- return "Error processing samples";
- });
-}
-/*
-getParsimonySamples("node_960478", 5)
- .then(result => {
- console.log("Results:", result);
- })
- .catch(error => {
- console.error("Error processing samples:", error);
- });
-*/
-export default getParsimonySamples;
-
-/*
-NOTES:
-*/
+import processJsonLines from "./nodeMapper.js";
+
+/*
+getParsimonySamples function outline:
+inputs:
+ sampleID, which is the explicit node or sample name, not internal ID
+ maxParsimony, which is the SNP distance threshold of interest
+ Nested functions:
+ processJsonLines: main worker of the backend, which reads in the jsonl file and constructs a map of all internal nodes, children, and mutations, and checks if the sample exists in the tree
+ findNodesWithinDistance: a helper function to find all nodes within a certain distance of a given node
+ traverses up and down the tree, adding nodes to a results array if they are within the distance threshold
+ returns the results array
+ traverseUp: helper function to walk up the tree towards the root; it does not add results itself, but calls traverseDown from each unvisited ancestor until the distance threshold is exceeded
+ traverseDown: helper function to traverse down the tree, adding nodes to the results array if they are within the distance threshold
+outputs: an array with one [sample name, SNP distance from the queried node, pangolin lineage, genbank accession] entry for every leaf within the threshold, or an error string ("Error parsing JSON", "Node not found in the tree", or "Error processing samples") when a valid search could not be performed
+with this list as a result, we can then query the backend for more information about each node, like name, mutations, etc, if needed
+ this above includes if specificMut is passed through, which would allow for filtering based on whether or not a node has a specific mutation,
+ but the filtering is done after all SNPs within distance are found, to reduce processing time if flag isnt specified
+once the list is obtained, snpComponent formats the list for output into Taxonium(Big step)
+*/
+async function getParsimonySamples(sampleID, maxParsimony) {
+ return processJsonLines(
+ "https://cov2tree.nyc3.cdn.digitaloceanspaces.com/latest_public.jsonl.gz",
+ sampleID
+ )
+ .then((myResult) => {
+ //answersArray=[nodes, foundSample, foundSampleID, foundParentID, foundSNPCount, isBranch]
+ if (myResult === "Error parsing JSON") {
+ //if error parsing JSON, return error
+ return "Error parsing JSON";
+ }
+ var nodeMap = myResult[0]; //index of all internal nodes and children
+ // Main function to find all nodes within a certain distance of a given node
+ function findNodesWithinDistance(node, distanceThreshold) {
+ // Helper function to traverse up (towards the parent)
+ function traverseUp(node, currentDistance) {
+ var parent_id = nodeMap[node].parent_id;
+ var snpCount = nodeMap[node].snpCount;
+ if (parent_id === node || currentDistance > distanceThreshold) {
+ //if root node(root has itself as parent), or if threshold is reached,
+ //console.log("reached root node or threshold, returning at distance "+currentDistance+" from node "+node+" with parent "+parent_id+" and snpCount "+snpCount)
+ return; //end traversal
+ }
+ //console.log("traversing up, new node is "+parent_id+" with distance "+(currentDistance + snpCount))
+ if (!visited.has(parent_id)) {
+ // Check if this node has already been visited to avoid infinite loops
+ visited.add(parent_id);
+ traverseDown(parent_id, currentDistance + snpCount); //Traverse down from the parent
+ traverseUp(parent_id, currentDistance + snpCount); // Traverse further up
+ }
+ }
+ // Helper function to traverse down (towards the children)
+ function traverseDown(node, currentDistance) {
+ if (!nodeMap[node] || currentDistance > distanceThreshold) {
+ return;
+ } //if node is a leaf node, or it threshold is reached, return
+ for (const child of nodeMap[node].children) {
+ // Traverse all children
+ let decodedChild = child.split("="); //split encoded child into internal ID and SNP distance
+ let childId = decodedChild[0]; //get internal ID of child
+ let childSnpDist = parseInt(decodedChild[1]); //get SNP distance of child
+ let newTotal = currentDistance + childSnpDist; //add SNP distance of child to current distance
+ //console.log("traversing down, new node is "+childId+" with distance "+childSnpDist+" for new total "+newTotal)
+ if (!visited.has(childId) && !visited.has(decodedChild[3])) {
+ //need a switch to add childs as genbank accession or node ID, since some sample names are repeated
+ if (childId.match(/^\d+$/)) {
+ //if its just numbers, its an internal node, so we add it to visited as is
+ visited.add(childId);
+ } else {
+ visited.add(decodedChild[3]);
+ } //if its not just numbers, its a leaf node, so we add the genbank accession to visited
+ if (newTotal <= distanceThreshold) {
+ //dont add the root node, as its always going to be within SNP distance of itself
+ //console.log("adding node to results:"+childId+" with distance "+newTotal)
+ if (!nodeMap[childId]) {
+ //if its not an entry in node map, means its not an internal node, so we add it to the results
+ //console.log("adding node to results:"+decodedChild)
+ results.push([
+ decodedChild[0],
+ newTotal,
+ decodedChild[2],
+ decodedChild[3],
+ ]);
+ }
+ }
+ if (nodeMap[childId]) {
+ //if the child is an internal node, traverse down
+ traverseDown(childId, newTotal); // Traverse further down; pass ID, not node info itself
+ }
+ }
+ }
+ }
+
+ // Start of the main function
+ //boolean obtained during traversal of the whole tree; if the queried sample exists in taxonium, this will flag as true
+ if (!myResult[1]) {
+ //if boolean is falsey
+ console.log("Node not found in the tree"); //its not a valid node, return error statement
+ return "Node not found in the tree";
+ }
+
+ let visited = new Set(); // To keep track of visited nodes
+ let results = []; // To store nodes within the distance threshold
+ visited.add(myResult[2]); //add ID of queried sample to visited
+ if (myResult[5]) {
+ //if the node is an internal node
+ traverseDown(myResult[2], 0); //start traversal from the internal node, we have a neutral distance of 0
+ traverseUp(myResult[2], 0);
+ } else {
+ traverseDown(myResult[3], myResult[4]);
+ traverseUp(myResult[3], myResult[4]);
+ }
+ //internal ID of the queried sample
+ // Traverse as far down as possible first, then go up, and traverse down again ignoring visited nodes
+ return results;
+ }
+
+ let goodSamples = findNodesWithinDistance(sampleID, maxParsimony);
+ nodeMap = null;
+ return goodSamples;
+ })
+ .catch((error) => {
+ // Catch any errors from processJsonLines or thrown in the then block
+ console.error("Error in getParsimonySamples:", error);
+ return "Error processing samples";
+ });
+}
+/*
+getParsimonySamples("node_960478", 5)
+ .then(result => {
+ console.log("Results:", result);
+ })
+ .catch(error => {
+ console.error("Error processing samples:", error);
+ });
+*/
+export default getParsimonySamples;
+
+/*
+NOTES:
+*/
diff --git a/taxonium_component/src/utils/nodeMapper.js b/taxonium_component/src/utils/nodeMapper.js
index 14ac8c16..3eb660af 100644
--- a/taxonium_component/src/utils/nodeMapper.js
+++ b/taxonium_component/src/utils/nodeMapper.js
@@ -1,162 +1,194 @@
-/*
-TODO:
-*/
-
-async function processJsonLines(url,sampleID) {
- // Fetch the gzipped JSONL file
- //const startTime = new Date(); // Start timing
- const response = await fetch(url);
-
- // Ensure the fetch was successful
- if (!response.ok) {
- throw new Error(`HTTP error! status: ${response.status}`);
- }
-
- // Stream the response through decompression and decoding
- const decompressedStream = response.body.pipeThrough(new DecompressionStream('gzip'));
- const textStream = decompressedStream.pipeThrough(new TextDecoderStream());
-
- // Reader to read the stream line by line
- const reader = textStream.getReader();
- let remainder = '';
- let result;
- let nodes = {};
- let foundSample=false;//we will be looking for a specific ID when we construct
- let foundSampleID=""
- let foundParentID=""
- let foundSNPCount=0
- let isBranch=false
- while (!(result = await reader.read()).done) {
- const chunk = remainder + result.value;
- const lines = chunk.split('\n');
- remainder = lines.pop(); // Save the last line in case it's incomplete
- for (const line of lines) {
- if (line) {
- var snpCount=0;
- try {
- const json = JSON.parse(line);
- if (json.config){//if line has the config file, skip it to avoid an error
- continue;//this first line also has mutations dictionary for decoding, if we need that later
- }
- for (const mut of json.mutations){
- if (mut>107435){
- snpCount+=1;
- }
- }
- if (json.name===sampleID){//check if this is the sample we will be searching for
- foundSample=true;//if it is, we have found it
- foundSampleID=json.node_id//store its ID so we can use it later
- foundParentID=json.parent_id//need to get parent ID of first node as a jumping off point for internal nodes, since theyre not being stored
- foundSNPCount=snpCount
- if (json.name.includes("node_")) {
- isBranch=true
- }
- //console.log(json)
- }
-
- if (json.name.includes("node_")) { // Check if the node is internal
- var encodedChild=(String(json.node_id)+"="+String(snpCount))//encode child and snp count without further nesting, as trying to store them as separate objects causes Stringify error due to excessive nesting
- if (!nodes[json.node_id]) {//if internal, but not added to list
- nodes[json.node_id] = {//create new node
- parent_id: json.parent_id,
- snpCount: snpCount,
- children: []
- };
- if (!nodes[nodes[json.node_id].parent_id]){//if the parent is not yet added to the list,
- nodes[nodes[json.node_id].parent_id] = {// add it to the list, with null name and parent, since we wont have that info until we read in parent node
- parent_id: null,
- snpCount: null,
- children: [encodedChild]//store the node ID and the number of mutations
- };
- }
- else{
- nodes[nodes[json.node_id].parent_id].children.push(encodedChild);// if the parent node has been added, add this node to its children
- }
- }
- if(nodes[json.node_id] && (nodes[json.node_id].parent_id===null || nodes[json.node_id].name===null)){//if we have added this parent node previously, but finally come across in JSON
- //console.log("Node ID being updated:"+json.name)
- nodes[json.node_id].parent_id=json.parent_id;//fill in the parent ID
- nodes[json.node_id].snpCount=snpCount;//fill in the snp count
- if (!nodes[nodes[json.node_id].parent_id]){//if this node, which was added by a previous step and therefore does not flag new internal step above, has a parent that has not been added to the list
- nodes[nodes[json.node_id].parent_id] = {// so add it
- parent_id: null,
- snpCount: null,
- children: [encodedChild]//store the node ID and the number of mutations
- };
- }
- else{
- nodes[nodes[json.node_id].parent_id].children.push(encodedChild);// if the parent node has been added, add this node to its children
- }
- }
- }
- else {// if doesnt contain "node_", then its a leaf node
- encodedChild=(String(json.name)+"="+String(snpCount)+"="+String(json.meta_pangolin_lineage)+"="+String(json.meta_genbank_accession))//encode child and snp count without further nesting, as trying to store them as separate objects causes Stringify error due to excessive nesting
- if (!nodes[json.parent_id]) {//we dont track leaf nodes, so if parent node is not in list, add it
- nodes[json.parent_id] = {//add line which fills in these null values when we read in the parent node
- parent_id: null,
- snpCount: null,
- children: [encodedChild]
- };
- } else {
-
- nodes[json.parent_id].children.push(encodedChild);//if parent node is in list, add this node to its children
- }
- }
- } catch (e) {
- console.error('Error parsing JSON:', e);
- return "Error parsing JSON"
- }
- }
- }
- }
-
-
- var answersArray=[nodes, foundSample, foundSampleID, foundParentID, foundSNPCount, isBranch]
- return answersArray;
- }
-/*
-processJsonLines('https://cov2tree.nyc3.cdn.digitaloceanspaces.com/latest_public.jsonl.gz', "node_3").then(result => {
- let sliced = Object.fromEntries(Object.entries(result[0][0]).slice(0,3))//get first 3 entries
- console.log("First 3 entries: ",sliced)
- //saveObjectToJson(result[0], 'C:/Users/david/my-app/src/InternalNodeMap.json');
-})
-.catch(error => {
- console.error("Error processing samples:", error);
-});
-function saveObjectToJson(dataObject, outputPath) {
- const fs = require('fs');
- const JSONStream = require('JSONStream');
- return new Promise((resolve, reject) => {
- const writeStream = fs.createWriteStream(outputPath);
- const stringifyStream = JSONStream.stringifyObject();
- stringifyStream.pipe(writeStream);
-
- writeStream.on('finish', () => {
- console.log('JSON file has been written successfully.');
- resolve();
- });
-
- writeStream.on('error', (error) => {
- console.error('Stream write error:', error);
- reject(error);
- });
-
- stringifyStream.on('error', (error) => {
- console.error('JSON stringify error:', error);
- reject(error);
- });
-
- for (const key in dataObject) {
- stringifyStream.write([key, dataObject[key]]);
- }
- stringifyStream.end();
- });
-}
-*/
-export default processJsonLines;
-
-// Usage example
-//at ~2gb of ram, 4.2ghz with 6 cores, a little under 60sec when reading from url
-//time to write to file is more extensive, but ideally not a factor if its happening in the backend
-//time to query backend for single node: ~0.6s
-//time to add snp dist when reading is negligible
\ No newline at end of file
+/*
+TODO:
+*/
+
+async function processJsonLines(url, sampleID) {
+ // Streams the gzipped JSONL file at url, building a map of nodes whose name contains node_ (treated as internal) while looking for the line whose name equals sampleID.
+ // Resolves to [nodes, foundSample, foundSampleID, foundParentID, foundSNPCount, isBranch]; throws on a non-OK HTTP status; resolves to an error string instead if handling any line throws.
+ const response = await fetch(url);
+
+ // Ensure the fetch was successful
+ if (!response.ok) {
+ throw new Error(`HTTP error! status: ${response.status}`);
+ }
+
+ // Stream the response through decompression and decoding
+ const decompressedStream = response.body.pipeThrough(
+ new DecompressionStream("gzip")
+ );
+ const textStream = decompressedStream.pipeThrough(new TextDecoderStream());
+
+ // Reader yields decoded text chunks; each chunk is split into lines in the loop below
+ const reader = textStream.getReader();
+ let remainder = ""; // trailing partial line carried over to the next chunk
+ let result;
+ let nodes = {}; // keyed by node_id / parent_id values; each entry is { parent_id, snpCount, children }
+ let foundSample = false; // becomes true once a line whose name equals sampleID is read
+ let foundSampleID = ""; // node_id of the matching line
+ let foundParentID = ""; // parent_id of the matching line
+ let foundSNPCount = 0; // snpCount computed for the matching line
+ let isBranch = false; // true when the matching name contains node_
+ while (!(result = await reader.read()).done) {
+ const chunk = remainder + result.value;
+ const lines = chunk.split("\n");
+ remainder = lines.pop(); // Save the last line in case it's incomplete
+ for (const line of lines) {
+ if (line) {
+ var snpCount = 0;
+ try {
+ const json = JSON.parse(line);
+ if (json.config) {
+ //if line has the config file, skip it to avoid an error
+ continue; //this first line also has mutations dictionary for decoding, if we need that later
+ }
+ for (const mut of json.mutations) {
+ if (mut > 107435) { // NOTE(review): only mutation values above 107435 are counted; the meaning of this cutoff is not visible here - confirm and give it a name
+ snpCount += 1;
+ }
+ }
+ if (json.name === sampleID) {
+ //check if this is the sample we will be searching for
+ foundSample = true; //if it is, we have found it
+ foundSampleID = json.node_id; //store its ID so we can use it later
+ foundParentID = json.parent_id; //leaf lines get no key of their own in nodes (see the leaf branch below), so the parent ID is kept as the way back into the map
+ foundSNPCount = snpCount;
+ if (json.name.includes("node_")) {
+ isBranch = true;
+ }
+ //console.log(json)
+ }
+
+ if (json.name.includes("node_")) {
+ // Check if the node is internal
+ var encodedChild = String(json.node_id) + "=" + String(snpCount); //internal child is encoded as node_id=snpCount in one flat string; per the original author, storing these as separate nested objects caused a Stringify error due to excessive nesting
+ if (!nodes[json.node_id]) {
+ //if internal, but not added to list
+ nodes[json.node_id] = {
+ //create new node
+ parent_id: json.parent_id,
+ snpCount: snpCount,
+ children: [],
+ };
+ if (!nodes[nodes[json.node_id].parent_id]) {
+ //if the parent is not yet added to the list,
+ nodes[nodes[json.node_id].parent_id] = {
+ // add a placeholder with null parent_id and snpCount, since we wont have that info until we read in the parent node
+ parent_id: null,
+ snpCount: null,
+ children: [encodedChild], //store the node ID and the number of mutations
+ };
+ } else {
+ nodes[nodes[json.node_id].parent_id].children.push(
+ encodedChild
+ ); // if the parent node has been added, add this node to its children
+ }
+ }
+ if (
+ nodes[json.node_id] &&
+ (nodes[json.node_id].parent_id === null ||
+ nodes[json.node_id].name === null)
+ ) {
+ //this node was added earlier as a placeholder by one of its children, and its own line has now been read
+ // NOTE(review): no entry built in this function has a name property, so only the parent_id === null test can match; it also matches a node just created above when json.parent_id is null, which pushes encodedChild a second time
+ nodes[json.node_id].parent_id = json.parent_id; //fill in the parent ID
+ nodes[json.node_id].snpCount = snpCount; //fill in the snp count
+ if (!nodes[nodes[json.node_id].parent_id]) {
+ //if this node, which was added by a previous step and therefore does not flag new internal step above, has a parent that has not been added to the list
+ nodes[nodes[json.node_id].parent_id] = {
+ // so add it
+ parent_id: null,
+ snpCount: null,
+ children: [encodedChild], //store the node ID and the number of mutations
+ };
+ } else {
+ nodes[nodes[json.node_id].parent_id].children.push(
+ encodedChild
+ ); // if the parent node has been added, add this node to its children
+ }
+ }
+ } else {
+ // if doesnt contain "node_", then its a leaf node
+ encodedChild =
+ String(json.name) +
+ "=" +
+ String(snpCount) +
+ "=" +
+ String(json.meta_pangolin_lineage) +
+ "=" +
+ String(json.meta_genbank_accession); //leaf is encoded as name=snpCount=meta_pangolin_lineage=meta_genbank_accession in one flat string, for the same no-nesting reason as internal children
+ if (!nodes[json.parent_id]) {
+ //we dont track leaf nodes, so if parent node is not in list, add it
+ nodes[json.parent_id] = {
+ //placeholder entry; parent_id and snpCount are filled in by the internal-node branch when the parent line is read
+ parent_id: null,
+ snpCount: null,
+ children: [encodedChild],
+ };
+ } else {
+ nodes[json.parent_id].children.push(encodedChild); //if parent node is in list, add this node to its children
+ }
+ }
+ } catch (e) {
+ console.error("Error parsing JSON:", e);
+ return "Error parsing JSON"; // NOTE(review): the success path returns a 6-element array, so this string gives callers a different shape; the try also covers everything after JSON.parse, not just parsing
+ }
+ }
+ }
+ } // NOTE(review): any text still held in remainder when the stream ends (a final line without a trailing newline) is never parsed
+
+ var answersArray = [
+ nodes,
+ foundSample,
+ foundSampleID,
+ foundParentID,
+ foundSNPCount,
+ isBranch,
+ ];
+ return answersArray;
+}
+/*
+processJsonLines('https://cov2tree.nyc3.cdn.digitaloceanspaces.com/latest_public.jsonl.gz', "node_3").then(result => {
+ let sliced = Object.fromEntries(Object.entries(result[0][0]).slice(0,3))//get first 3 entries
+ console.log("First 3 entries: ",sliced)
+ //saveObjectToJson(result[0], 'C:/Users/david/my-app/src/InternalNodeMap.json');
+})
+.catch(error => {
+ console.error("Error processing samples:", error);
+});
+function saveObjectToJson(dataObject, outputPath) {
+ const fs = require('fs');
+ const JSONStream = require('JSONStream');
+ return new Promise((resolve, reject) => {
+ const writeStream = fs.createWriteStream(outputPath);
+ const stringifyStream = JSONStream.stringifyObject();
+ stringifyStream.pipe(writeStream);
+
+ writeStream.on('finish', () => {
+ console.log('JSON file has been written successfully.');
+ resolve();
+ });
+
+ writeStream.on('error', (error) => {
+ console.error('Stream write error:', error);
+ reject(error);
+ });
+
+ stringifyStream.on('error', (error) => {
+ console.error('JSON stringify error:', error);
+ reject(error);
+ });
+
+ for (const key in dataObject) {
+ stringifyStream.write([key, dataObject[key]]);
+ }
+ stringifyStream.end();
+ });
+}
+*/
+export default processJsonLines;
+
+// Performance notes
+// At ~2 GB of RAM on a 6-core 4.2 GHz machine, reading from the URL takes a little under 60 s.
+// Writing the map to a file takes longer, but ideally that is not a factor if it happens in the backend.
+// Time to query the backend for a single node: ~0.6 s.
+// The time added by counting SNP distance while reading is negligible.