From d9d1e3f7ca2fffd70cd7ad2ba0949e8871c73bab Mon Sep 17 00:00:00 2001 From: DLiarakos <93622613+DLiarakos@users.noreply.github.com> Date: Mon, 20 May 2024 12:38:39 -0700 Subject: [PATCH 1/4] Add files via upload --- .../src/components/SearchPanel.jsx | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/taxonium_component/src/components/SearchPanel.jsx b/taxonium_component/src/components/SearchPanel.jsx index f7eff11c..5cb8836c 100644 --- a/taxonium_component/src/components/SearchPanel.jsx +++ b/taxonium_component/src/components/SearchPanel.jsx @@ -18,6 +18,8 @@ import classNames from "classnames"; import SearchDisplayToggle from "./SearchDisplayToggle"; +import SNPOutputModal from "./snpModal"; + const prettify_x_types = { x_dist: "Distance", x_time: "Time" }; const formatNumber = (num) => { @@ -72,7 +74,15 @@ function SearchPanel({ }, [selectedDetails.nodeDetails]); const [listOutputModalOpen, setListOutputModalOpen] = useState(false); - + const [snpSearchModalOpen, setSnpSearchModalOpen] = useState(false); + const [showSNPButton, setShowSNPButton] = useState(false); + useEffect(() => { + // Check if the current URL includes "api.cov2tree.org" + if (window.location.href.includes("api.cov2tree.org")) { + setShowSNPButton(true); + } + }, []); + const handleDownloadJson = () => { if (selectedDetails.nodeDetails) { const node_id = selectedDetails.nodeDetails.node_id; @@ -268,6 +278,7 @@ function SearchPanel({ <> Displaying {formatNumber(config.num_tips)}{" "} {config.tipPluralNoun ? config.tipPluralNoun : "sequences"} + {config.source && ` from ${config.source}`} )} @@ -418,6 +429,22 @@ function SearchPanel({ Add a new search + {showSNPButton && ( + + )} + {showSNPButton && ( + + )} {selectedDetails.nodeDetails && ( From 141b5608e0b1f15bc1c3bf074bb560103748adf0 Mon Sep 17 00:00:00 2001 From: DLiarakos <93622613+DLiarakos@users.noreply.github.com> Date: Mon, 20 May 2024 12:41:24 -0700 Subject: [PATCH 2/4] Added SNP Distance Search Component Added snpModal.jsx, main component for rendering SNP Distance Search --- .../src/components/snpModal.jsx | 235 ++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 taxonium_component/src/components/snpModal.jsx diff --git a/taxonium_component/src/components/snpModal.jsx b/taxonium_component/src/components/snpModal.jsx new file mode 100644 index 00000000..ef8c43b2 --- /dev/null +++ b/taxonium_component/src/components/snpModal.jsx @@ -0,0 +1,235 @@ +import Modal from "react-modal"; +import { useState} from "react"; +import getParsimonySamples from "../utils/extract.js"; + +/* +Testing Search: node_960478, 5 + -should return 97 results, exluding internal nodes +Germany/IMS-10245-CVDP-57CCA4FC-E286-4E50-AB0F-727CDF76B7BF/2022, 4 +should return 0 +TODO: +Additional Features to maybe include: +Maybe make each element in the drop down a clickable element to select the node in the SearchPanel +a csv download option for the output +*/ +async function getSNPneighbors(nodeId, integerValue, callback) { + // Dummy backend function + try { + let results= await getParsimonySamples(nodeId, integerValue); + callback(null, results); + } catch (err) { + callback(err, null); + } +} + +const SNPOutputModal = ({ + snpOutputModalOpen, + setSnpOutputModalOpen, +}) => { + const [nodeId, setNodeId] = useState(""); + const [integerValue, setIntegerValue] = useState(""); + const [fullOutput, setFullOutput] = useState([]); + const [displayOutput, setDisplayOutput] = useState([]); + const [remainingCount, setRemainingCount] = 
useState(0); + const [loading, setLoading] = useState(false); + const [JsonError, setJsonError] = useState(false); + const [MissingNoError, setMissingError] = useState(false); + const [ParsimonyError, setParsimonyError] = useState(false); + const [emptyReturn, setEmpty] = useState(false); + + const handleSearch = () => { + if (nodeId && integerValue) { + setLoading(true); + setFullOutput([]); + setDisplayOutput([]); + setRemainingCount(0); + setJsonError(false); + setMissingError(false); + setParsimonyError(false); + setEmpty(false); + const startTime = new Date(); + getSNPneighbors(nodeId, integerValue, (err, res) => { + if (err) { + console.log(err); + } + else if(res==="Error parsing JSON") { + setJsonError(true); + } + else if(res==="Node not found in the tree") { + setMissingError(true); + } + else if(res==="Error parsing JSON") { + setParsimonyError(true); + } + else if (res.length === 0) { + setEmpty(true); + } + else { + setFullOutput(res); + setDisplayOutput(res.slice(0, 100)); + setRemainingCount(res.length > 100 ? res.length - 100 : 0); + const endTime = new Date(); // End timing + const timeDiff = endTime - startTime; // Time in milliseconds + console.log(`Time taken: ${timeDiff} ms`); + } + setLoading(false); + }); + } + }; + const handleCloseModal = () => { + // Reset all state variables on modal close + setSnpOutputModalOpen(false); + setNodeId(""); + setIntegerValue(""); + setFullOutput([]); + setDisplayOutput([]); + setRemainingCount(0); + setLoading(false); + setJsonError(false); + setMissingError(false); + setParsimonyError(false); + setEmpty(false); + }; + const convertToTSV = (data) => { + const header = "Sample Name\tDistance\tPANGO Lineage\tGenbank Accession\n"; + const rows = data.map(row => + `${row[0]}\t${row[1]}\t${row[2]}\t${row[3]}` + ).join('\n'); + return header + rows; + }; + const downloadTSV = (data) => { + const tsvString = convertToTSV(data); + const blob = new Blob([tsvString], { type: 'text/tab-separated-values' }); + const href = URL.createObjectURL(blob); + const link = document.createElement('a'); + link.href = href; + link.download = `${nodeId}_SNP${integerValue}.tsv`; + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + URL.revokeObjectURL(href); + }; + return ( + + +

SNP Distance Search

+
+
+ +
+
+ +
+ +
+ {loading &&
Loading data. This may take a minute...
} + {!loading && JsonError &&
Error parsing JSON.
} + {!loading && MissingNoError &&
Node not found in the tree.
} + {!loading && ParsimonyError &&
Error traversing tree.
} + {!loading && emptyReturn &&
No results found; check your sample ID or change SNP distance.
} + {!loading && !emptyReturn && displayOutput.length === 0 &&
No data available.
} + {!loading && displayOutput.length > 0 && ( + <> +
+ + + + + + + + + + + {displayOutput.map(([name, distance, lineage, accession], index) => ( + + + + + + + ))} + +
Sample Name | Distance | PANGO Lineage | Genbank Accession
{name} {distance} {lineage} {accession}
+ {remainingCount>0 &&
...with {remainingCount} more items
} +
+ + )} + {!loading && displayOutput.length > 0 &&} +
+ ); +}; + +export default SNPOutputModal; +/* +
+ +
+*/ \ No newline at end of file From 519006b4e4b2487889f10fa262cd2aedf110536f Mon Sep 17 00:00:00 2001 From: DLiarakos <93622613+DLiarakos@users.noreply.github.com> Date: Mon, 20 May 2024 12:55:29 -0700 Subject: [PATCH 3/4] Added backend workers Added two files, nodeMapper.js and extract.js, which are used in generating the output for the SNP distance search. nodeMapper makes a map of the entire tree in order to perform downwards traversal, and extract does the actualn traversal and calculation of SNP distance. --- taxonium_component/src/utils/extract.js | 119 +++++++++++++++ taxonium_component/src/utils/nodeMapper.js | 162 +++++++++++++++++++++ 2 files changed, 281 insertions(+) create mode 100644 taxonium_component/src/utils/extract.js create mode 100644 taxonium_component/src/utils/nodeMapper.js diff --git a/taxonium_component/src/utils/extract.js b/taxonium_component/src/utils/extract.js new file mode 100644 index 00000000..528f88dd --- /dev/null +++ b/taxonium_component/src/utils/extract.js @@ -0,0 +1,119 @@ +import processJsonLines from './nodeMapper.js'; + +/* +getParsimonySamples function outline: +inputs: + sampleID, which is the explicit node or sample name, not internal ID + maxParsimony, which is the SNP distance threshold of interest + Nested functions: + processJsonLines: main worker of the backend, which reads in the jsonl file and constructs a map of all internal nodes, children, and mutations, and checks if the sample exists in the tree + findNodesWithinDistance: a helper function to find all nodes within a certain distance of a given node + traverses up and down the tree, adding nodes to a results array if they are within the distance threshold + returns the results array + traverseUp: helper function to traverse up the tree, adding nodes to the results array if they are within the distance threshold + traverseDown: helper function to traverse down the tree, adding nodes to the results array if they are within the distance threshold +outputs: should output a simple list/array-like of internal IDs and their SNP distances from the queried node, some flags that determine whether or not a valid search was performed, as well as a map of each nodes name with genbank accession and pangolin lineage +with this list as a result, we can then query the backend for more information about each node, like name, mutations, etc, if needed + this above includes if specificMut is passed through, which would allow for filtering based on whether or not a node has a specific mutation, + but the filtering is done after all SNPs within distance are found, to reduce processing time if flag isnt specified +once the list is obtained, snpComponent formats the list for output into Taxonium(Big step) +*/ +async function getParsimonySamples(sampleID, maxParsimony) { + return processJsonLines("https://cov2tree.nyc3.cdn.digitaloceanspaces.com/latest_public.jsonl.gz",sampleID).then(myResult => {//answersArray=[nodes, foundSample, foundSampleID, foundParentID, foundSNPCount, isBranch] + if (myResult==="Error parsing JSON"){//if error parsing JSON, return error + return "Error parsing JSON"; + } + var nodeMap=myResult[0]//index of all internal nodes and children + // Main function to find all nodes within a certain distance of a given node + function findNodesWithinDistance(node, distanceThreshold) { + // Helper function to traverse up (towards the parent) + function traverseUp(node, currentDistance) { + var parent_id=nodeMap[node].parent_id + var snpCount=nodeMap[node].snpCount + if (parent_id===node || 
currentDistance > distanceThreshold) {//if root node(root has itself as parent), or if threshold is reached, + //console.log("reached root node or threshold, returning at distance "+currentDistance+" from node "+node+" with parent "+parent_id+" and snpCount "+snpCount) + return;//end traversal + } + //console.log("traversing up, new node is "+parent_id+" with distance "+(currentDistance + snpCount)) + if (!visited.has(parent_id)) {// Check if this node has already been visited to avoid infinite loops + visited.add(parent_id); + traverseDown(parent_id, currentDistance + snpCount);//Traverse down from the parent + traverseUp(parent_id, currentDistance + snpCount);// Traverse further up + } + } + // Helper function to traverse down (towards the children) + function traverseDown(node, currentDistance) { + if (!nodeMap[node]|| currentDistance > distanceThreshold) {return;}//if node is a leaf node, or it threshold is reached, return + for (const child of nodeMap[node].children) {// Traverse all children + let decodedChild=child.split("=")//split encoded child into internal ID and SNP distance + let childId=decodedChild[0]//get internal ID of child + let childSnpDist=parseInt(decodedChild[1])//get SNP distance of child + let newTotal=currentDistance+childSnpDist//add SNP distance of child to current distance + //console.log("traversing down, new node is "+childId+" with distance "+childSnpDist+" for new total "+newTotal) + if (!visited.has(childId)&&!visited.has(decodedChild[3])) {//need a switch to add childs as genbank accession or node ID, since some sample names are repeated + if (childId.match(/^\d+$/)){//if its just numbers, its an internal node, so we add it to visited as is + visited.add(childId); + } + else {visited.add(decodedChild[3]);}//if its not just numbers, its a leaf node, so we add the genbank accession to visited + if ((newTotal <= distanceThreshold)){ //dont add the root node, as its always going to be within SNP distance of itself + //console.log("adding node to results:"+childId+" with distance "+newTotal) + if (!nodeMap[childId]){//if its not an entry in node map, means its not an internal node, so we add it to the results + //console.log("adding node to results:"+decodedChild) + results.push([decodedChild[0], newTotal, decodedChild[2], decodedChild[3]]); + } + } + if (nodeMap[childId]){//if the child is an internal node, traverse down + traverseDown(childId,newTotal);// Traverse further down; pass ID, not node info itself + } + } + } + } + + // Start of the main function + //boolean obtained during traversal of the whole tree; if the queried sample exists in taxonium, this will flag as true + if (!myResult[1]) {//if boolean is falsey + console.log("Node not found in the tree");//its not a valid node, return error statement + return "Node not found in the tree"; + } + + let visited = new Set(); // To keep track of visited nodes + let results = []; // To store nodes within the distance threshold + visited.add(myResult[2]); //add ID of queried sample to visited + if (myResult[5]){//if the node is an internal node + traverseDown(myResult[2], 0);//start traversal from the internal node, we have a neutral distance of 0 + traverseUp(myResult[2], 0); + } + else{ + traverseDown(myResult[3], myResult[4]); + traverseUp(myResult[3], myResult[4]); + + } + //internal ID of the queried sample + // Traverse as far down as possible first, then go up, and traverse down again ignoring visited nodes + return results; + } + + let goodSamples = findNodesWithinDistance(sampleID, maxParsimony) + 
nodeMap=null; + return goodSamples + }) + .catch(error => { + // Catch any errors from processJsonLines or thrown in the then block + console.error('Error in getParsimonySamples:', error); + return "Error processing samples"; + }); +} +/* +getParsimonySamples("node_960478", 5) + .then(result => { + console.log("Results:", result); + }) + .catch(error => { + console.error("Error processing samples:", error); + }); +*/ +export default getParsimonySamples; + +/* +NOTES: +*/ diff --git a/taxonium_component/src/utils/nodeMapper.js b/taxonium_component/src/utils/nodeMapper.js new file mode 100644 index 00000000..14ac8c16 --- /dev/null +++ b/taxonium_component/src/utils/nodeMapper.js @@ -0,0 +1,162 @@ +/* +TODO: +*/ + +async function processJsonLines(url,sampleID) { + // Fetch the gzipped JSONL file + //const startTime = new Date(); // Start timing + const response = await fetch(url); + + // Ensure the fetch was successful + if (!response.ok) { + throw new Error(`HTTP error! status: ${response.status}`); + } + + // Stream the response through decompression and decoding + const decompressedStream = response.body.pipeThrough(new DecompressionStream('gzip')); + const textStream = decompressedStream.pipeThrough(new TextDecoderStream()); + + // Reader to read the stream line by line + const reader = textStream.getReader(); + let remainder = ''; + let result; + let nodes = {}; + let foundSample=false;//we will be looking for a specific ID when we construct + let foundSampleID="" + let foundParentID="" + let foundSNPCount=0 + let isBranch=false + while (!(result = await reader.read()).done) { + const chunk = remainder + result.value; + const lines = chunk.split('\n'); + remainder = lines.pop(); // Save the last line in case it's incomplete + for (const line of lines) { + if (line) { + var snpCount=0; + try { + const json = JSON.parse(line); + if (json.config){//if line has the config file, skip it to avoid an error + continue;//this first line also has mutations dictionary for decoding, if we need that later + } + for (const mut of json.mutations){ + if (mut>107435){ + snpCount+=1; + } + } + if (json.name===sampleID){//check if this is the sample we will be searching for + foundSample=true;//if it is, we have found it + foundSampleID=json.node_id//store its ID so we can use it later + foundParentID=json.parent_id//need to get parent ID of first node as a jumping off point for internal nodes, since theyre not being stored + foundSNPCount=snpCount + if (json.name.includes("node_")) { + isBranch=true + } + //console.log(json) + } + + if (json.name.includes("node_")) { // Check if the node is internal + var encodedChild=(String(json.node_id)+"="+String(snpCount))//encode child and snp count without further nesting, as trying to store them as separate objects causes Stringify error due to excessive nesting + if (!nodes[json.node_id]) {//if internal, but not added to list + nodes[json.node_id] = {//create new node + parent_id: json.parent_id, + snpCount: snpCount, + children: [] + }; + if (!nodes[nodes[json.node_id].parent_id]){//if the parent is not yet added to the list, + nodes[nodes[json.node_id].parent_id] = {// add it to the list, with null name and parent, since we wont have that info until we read in parent node + parent_id: null, + snpCount: null, + children: [encodedChild]//store the node ID and the number of mutations + }; + } + else{ + nodes[nodes[json.node_id].parent_id].children.push(encodedChild);// if the parent node has been added, add this node to its children + } + } + 
if(nodes[json.node_id] && (nodes[json.node_id].parent_id===null || nodes[json.node_id].name===null)){//if we have added this parent node previously, but finally come across in JSON + //console.log("Node ID being updated:"+json.name) + nodes[json.node_id].parent_id=json.parent_id;//fill in the parent ID + nodes[json.node_id].snpCount=snpCount;//fill in the snp count + if (!nodes[nodes[json.node_id].parent_id]){//if this node, which was added by a previous step and therefore does not flag new internal step above, has a parent that has not been added to the list + nodes[nodes[json.node_id].parent_id] = {// so add it + parent_id: null, + snpCount: null, + children: [encodedChild]//store the node ID and the number of mutations + }; + } + else{ + nodes[nodes[json.node_id].parent_id].children.push(encodedChild);// if the parent node has been added, add this node to its children + } + } + } + else {// if doesnt contain "node_", then its a leaf node + encodedChild=(String(json.name)+"="+String(snpCount)+"="+String(json.meta_pangolin_lineage)+"="+String(json.meta_genbank_accession))//encode child and snp count without further nesting, as trying to store them as separate objects causes Stringify error due to excessive nesting + if (!nodes[json.parent_id]) {//we dont track leaf nodes, so if parent node is not in list, add it + nodes[json.parent_id] = {//add line which fills in these null values when we read in the parent node + parent_id: null, + snpCount: null, + children: [encodedChild] + }; + } else { + + nodes[json.parent_id].children.push(encodedChild);//if parent node is in list, add this node to its children + } + } + } catch (e) { + console.error('Error parsing JSON:', e); + return "Error parsing JSON" + } + } + } + } + + + var answersArray=[nodes, foundSample, foundSampleID, foundParentID, foundSNPCount, isBranch] + return answersArray; + } +/* +processJsonLines('https://cov2tree.nyc3.cdn.digitaloceanspaces.com/latest_public.jsonl.gz', "node_3").then(result => { + let sliced = Object.fromEntries(Object.entries(result[0][0]).slice(0,3))//get first 3 entries + console.log("First 3 entries: ",sliced) + //saveObjectToJson(result[0], 'C:/Users/david/my-app/src/InternalNodeMap.json'); +}) +.catch(error => { + console.error("Error processing samples:", error); +}); +function saveObjectToJson(dataObject, outputPath) { + const fs = require('fs'); + const JSONStream = require('JSONStream'); + return new Promise((resolve, reject) => { + const writeStream = fs.createWriteStream(outputPath); + const stringifyStream = JSONStream.stringifyObject(); + stringifyStream.pipe(writeStream); + + writeStream.on('finish', () => { + console.log('JSON file has been written successfully.'); + resolve(); + }); + + writeStream.on('error', (error) => { + console.error('Stream write error:', error); + reject(error); + }); + + stringifyStream.on('error', (error) => { + console.error('JSON stringify error:', error); + reject(error); + }); + + for (const key in dataObject) { + stringifyStream.write([key, dataObject[key]]); + } + stringifyStream.end(); + }); +} +*/ +export default processJsonLines; + +// Usage example +//at ~2gb of ram, 4.2ghz with 6 cores, a little under 60sec when reading from url +//time to write to file is more extensive, but ideally not a factor if its happening in the backend +//time to query backend for single node: ~0.6s +//time to add snp dist when reading is negligible \ No newline at end of file From 06ca0706f230b276feced2d0b463908aa5d5e274 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 20 May 2024 20:19:29 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../src/components/SearchPanel.jsx | 35 +- .../src/components/snpModal.jsx | 604 +++++++++++------- taxonium_component/src/utils/extract.js | 260 ++++---- taxonium_component/src/utils/nodeMapper.js | 356 ++++++----- 4 files changed, 721 insertions(+), 534 deletions(-) diff --git a/taxonium_component/src/components/SearchPanel.jsx b/taxonium_component/src/components/SearchPanel.jsx index 5cb8836c..10adb9cf 100644 --- a/taxonium_component/src/components/SearchPanel.jsx +++ b/taxonium_component/src/components/SearchPanel.jsx @@ -82,7 +82,7 @@ function SearchPanel({ setShowSNPButton(true); } }, []); - + const handleDownloadJson = () => { if (selectedDetails.nodeDetails) { const node_id = selectedDetails.nodeDetails.node_id; @@ -278,7 +278,6 @@ function SearchPanel({ <> Displaying {formatNumber(config.num_tips)}{" "} {config.tipPluralNoun ? config.tipPluralNoun : "sequences"} - {config.source && ` from ${config.source}`} )} @@ -429,22 +428,22 @@ function SearchPanel({ Add a new search - {showSNPButton && ( - - )} - {showSNPButton && ( - - )} + {showSNPButton && ( + + )} + {showSNPButton && ( + + )} {selectedDetails.nodeDetails && ( diff --git a/taxonium_component/src/components/snpModal.jsx b/taxonium_component/src/components/snpModal.jsx index ef8c43b2..9a867264 100644 --- a/taxonium_component/src/components/snpModal.jsx +++ b/taxonium_component/src/components/snpModal.jsx @@ -1,235 +1,369 @@ -import Modal from "react-modal"; -import { useState} from "react"; -import getParsimonySamples from "../utils/extract.js"; - -/* -Testing Search: node_960478, 5 - -should return 97 results, exluding internal nodes -Germany/IMS-10245-CVDP-57CCA4FC-E286-4E50-AB0F-727CDF76B7BF/2022, 4 -should return 0 -TODO: -Additional Features to maybe include: -Maybe make each element in the drop down a clickable element to select the node in the SearchPanel -a csv download option for the output -*/ -async function getSNPneighbors(nodeId, integerValue, callback) { - // Dummy backend function - try { - let results= await getParsimonySamples(nodeId, integerValue); - callback(null, results); - } catch (err) { - callback(err, null); - } -} - -const SNPOutputModal = ({ - snpOutputModalOpen, - setSnpOutputModalOpen, -}) => { - const [nodeId, setNodeId] = useState(""); - const [integerValue, setIntegerValue] = useState(""); - const [fullOutput, setFullOutput] = useState([]); - const [displayOutput, setDisplayOutput] = useState([]); - const [remainingCount, setRemainingCount] = useState(0); - const [loading, setLoading] = useState(false); - const [JsonError, setJsonError] = useState(false); - const [MissingNoError, setMissingError] = useState(false); - const [ParsimonyError, setParsimonyError] = useState(false); - const [emptyReturn, setEmpty] = useState(false); - - const handleSearch = () => { - if (nodeId && integerValue) { - setLoading(true); - setFullOutput([]); - setDisplayOutput([]); - setRemainingCount(0); - setJsonError(false); - setMissingError(false); - setParsimonyError(false); - setEmpty(false); - const startTime = new Date(); - getSNPneighbors(nodeId, integerValue, (err, res) => { - if (err) { - console.log(err); - } - else if(res==="Error parsing JSON") { - setJsonError(true); - } - else if(res==="Node not found in the tree") { - setMissingError(true); - } - else if(res==="Error parsing JSON") { - 
setParsimonyError(true); - } - else if (res.length === 0) { - setEmpty(true); - } - else { - setFullOutput(res); - setDisplayOutput(res.slice(0, 100)); - setRemainingCount(res.length > 100 ? res.length - 100 : 0); - const endTime = new Date(); // End timing - const timeDiff = endTime - startTime; // Time in milliseconds - console.log(`Time taken: ${timeDiff} ms`); - } - setLoading(false); - }); - } - }; - const handleCloseModal = () => { - // Reset all state variables on modal close - setSnpOutputModalOpen(false); - setNodeId(""); - setIntegerValue(""); - setFullOutput([]); - setDisplayOutput([]); - setRemainingCount(0); - setLoading(false); - setJsonError(false); - setMissingError(false); - setParsimonyError(false); - setEmpty(false); - }; - const convertToTSV = (data) => { - const header = "Sample Name\tDistance\tPANGO Lineage\tGenbank Accession\n"; - const rows = data.map(row => - `${row[0]}\t${row[1]}\t${row[2]}\t${row[3]}` - ).join('\n'); - return header + rows; - }; - const downloadTSV = (data) => { - const tsvString = convertToTSV(data); - const blob = new Blob([tsvString], { type: 'text/tab-separated-values' }); - const href = URL.createObjectURL(blob); - const link = document.createElement('a'); - link.href = href; - link.download = `${nodeId}_SNP${integerValue}.tsv`; - document.body.appendChild(link); - link.click(); - document.body.removeChild(link); - URL.revokeObjectURL(href); - }; - return ( - - -

SNP Distance Search

-
-
- -
-
- -
- -
- {loading &&
Loading data. This may take a minute...
} - {!loading && JsonError &&
Error parsing JSON.
} - {!loading && MissingNoError &&
Node not found in the tree.
} - {!loading && ParsimonyError &&
Error traversing tree.
} - {!loading && emptyReturn &&
No results found; check your sample ID or change SNP distance.
} - {!loading && !emptyReturn && displayOutput.length === 0 &&
No data available.
} - {!loading && displayOutput.length > 0 && ( - <> -
- - - - - - - - - - - {displayOutput.map(([name, distance, lineage, accession], index) => ( - - - - - - - ))} - -
Sample NameDistancePANGO LineageGenbank Accession
{name}{distance}{lineage}{accession}
- {remainingCount>0 &&
...with {remainingCount} more items
} -
- - )} - {!loading && displayOutput.length > 0 &&} -
- ); -}; - -export default SNPOutputModal; -/* -
- -
-*/ \ No newline at end of file +import Modal from "react-modal"; +import { useState } from "react"; +import getParsimonySamples from "../utils/extract.js"; + +/* +Testing Search: node_960478, 5 + -should return 97 results, exluding internal nodes +Germany/IMS-10245-CVDP-57CCA4FC-E286-4E50-AB0F-727CDF76B7BF/2022, 4 +should return 0 +TODO: +Additional Features to maybe include: +Maybe make each element in the drop down a clickable element to select the node in the SearchPanel +a csv download option for the output +*/ +async function getSNPneighbors(nodeId, integerValue, callback) { + // Dummy backend function + try { + let results = await getParsimonySamples(nodeId, integerValue); + callback(null, results); + } catch (err) { + callback(err, null); + } +} + +const SNPOutputModal = ({ snpOutputModalOpen, setSnpOutputModalOpen }) => { + const [nodeId, setNodeId] = useState(""); + const [integerValue, setIntegerValue] = useState(""); + const [fullOutput, setFullOutput] = useState([]); + const [displayOutput, setDisplayOutput] = useState([]); + const [remainingCount, setRemainingCount] = useState(0); + const [loading, setLoading] = useState(false); + const [JsonError, setJsonError] = useState(false); + const [MissingNoError, setMissingError] = useState(false); + const [ParsimonyError, setParsimonyError] = useState(false); + const [emptyReturn, setEmpty] = useState(false); + + const handleSearch = () => { + if (nodeId && integerValue) { + setLoading(true); + setFullOutput([]); + setDisplayOutput([]); + setRemainingCount(0); + setJsonError(false); + setMissingError(false); + setParsimonyError(false); + setEmpty(false); + const startTime = new Date(); + getSNPneighbors(nodeId, integerValue, (err, res) => { + if (err) { + console.log(err); + } else if (res === "Error parsing JSON") { + setJsonError(true); + } else if (res === "Node not found in the tree") { + setMissingError(true); + } else if (res === "Error parsing JSON") { + setParsimonyError(true); + } else if (res.length === 0) { + setEmpty(true); + } else { + setFullOutput(res); + setDisplayOutput(res.slice(0, 100)); + setRemainingCount(res.length > 100 ? res.length - 100 : 0); + const endTime = new Date(); // End timing + const timeDiff = endTime - startTime; // Time in milliseconds + console.log(`Time taken: ${timeDiff} ms`); + } + setLoading(false); + }); + } + }; + const handleCloseModal = () => { + // Reset all state variables on modal close + setSnpOutputModalOpen(false); + setNodeId(""); + setIntegerValue(""); + setFullOutput([]); + setDisplayOutput([]); + setRemainingCount(0); + setLoading(false); + setJsonError(false); + setMissingError(false); + setParsimonyError(false); + setEmpty(false); + }; + const convertToTSV = (data) => { + const header = "Sample Name\tDistance\tPANGO Lineage\tGenbank Accession\n"; + const rows = data + .map((row) => `${row[0]}\t${row[1]}\t${row[2]}\t${row[3]}`) + .join("\n"); + return header + rows; + }; + const downloadTSV = (data) => { + const tsvString = convertToTSV(data); + const blob = new Blob([tsvString], { type: "text/tab-separated-values" }); + const href = URL.createObjectURL(blob); + const link = document.createElement("a"); + link.href = href; + link.download = `${nodeId}_SNP${integerValue}.tsv`; + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + URL.revokeObjectURL(href); + }; + return ( + + +

+ SNP Distance Search +

+
+
+ +
+
+ +
+ +
+ {loading && ( +
+ Loading data. This may take a minute... +
+ )} + {!loading && JsonError && ( +
+ Error parsing JSON. +
+ )} + {!loading && MissingNoError && ( +
+ Node not found in the tree. +
+ )} + {!loading && ParsimonyError && ( +
+ Error traversing tree. +
+ )} + {!loading && emptyReturn && ( +
+ No results found; check your sample ID or change SNP distance. +
+ )} + {!loading && !emptyReturn && displayOutput.length === 0 && ( +
+ No data available. +
+ )} + {!loading && displayOutput.length > 0 && ( + <> +
+ + + + + + + + + + + {displayOutput.map( + ([name, distance, lineage, accession], index) => ( + + + + + + + ) + )} + +
+ Sample Name + + Distance + + PANGO Lineage + + Genbank Accession +
+ {name} + + {distance} + + {lineage} + + {accession} +
+ {remainingCount > 0 && ( +
+ ...with {remainingCount} more items +
+ )} +
+ + )} + {!loading && displayOutput.length > 0 && ( + + )} +
+ ); +}; + +export default SNPOutputModal; +/* +
+ +
+*/ diff --git a/taxonium_component/src/utils/extract.js b/taxonium_component/src/utils/extract.js index 528f88dd..a235701d 100644 --- a/taxonium_component/src/utils/extract.js +++ b/taxonium_component/src/utils/extract.js @@ -1,119 +1,141 @@ -import processJsonLines from './nodeMapper.js'; - -/* -getParsimonySamples function outline: -inputs: - sampleID, which is the explicit node or sample name, not internal ID - maxParsimony, which is the SNP distance threshold of interest - Nested functions: - processJsonLines: main worker of the backend, which reads in the jsonl file and constructs a map of all internal nodes, children, and mutations, and checks if the sample exists in the tree - findNodesWithinDistance: a helper function to find all nodes within a certain distance of a given node - traverses up and down the tree, adding nodes to a results array if they are within the distance threshold - returns the results array - traverseUp: helper function to traverse up the tree, adding nodes to the results array if they are within the distance threshold - traverseDown: helper function to traverse down the tree, adding nodes to the results array if they are within the distance threshold -outputs: should output a simple list/array-like of internal IDs and their SNP distances from the queried node, some flags that determine whether or not a valid search was performed, as well as a map of each nodes name with genbank accession and pangolin lineage -with this list as a result, we can then query the backend for more information about each node, like name, mutations, etc, if needed - this above includes if specificMut is passed through, which would allow for filtering based on whether or not a node has a specific mutation, - but the filtering is done after all SNPs within distance are found, to reduce processing time if flag isnt specified -once the list is obtained, snpComponent formats the list for output into Taxonium(Big step) -*/ -async function getParsimonySamples(sampleID, maxParsimony) { - return processJsonLines("https://cov2tree.nyc3.cdn.digitaloceanspaces.com/latest_public.jsonl.gz",sampleID).then(myResult => {//answersArray=[nodes, foundSample, foundSampleID, foundParentID, foundSNPCount, isBranch] - if (myResult==="Error parsing JSON"){//if error parsing JSON, return error - return "Error parsing JSON"; - } - var nodeMap=myResult[0]//index of all internal nodes and children - // Main function to find all nodes within a certain distance of a given node - function findNodesWithinDistance(node, distanceThreshold) { - // Helper function to traverse up (towards the parent) - function traverseUp(node, currentDistance) { - var parent_id=nodeMap[node].parent_id - var snpCount=nodeMap[node].snpCount - if (parent_id===node || currentDistance > distanceThreshold) {//if root node(root has itself as parent), or if threshold is reached, - //console.log("reached root node or threshold, returning at distance "+currentDistance+" from node "+node+" with parent "+parent_id+" and snpCount "+snpCount) - return;//end traversal - } - //console.log("traversing up, new node is "+parent_id+" with distance "+(currentDistance + snpCount)) - if (!visited.has(parent_id)) {// Check if this node has already been visited to avoid infinite loops - visited.add(parent_id); - traverseDown(parent_id, currentDistance + snpCount);//Traverse down from the parent - traverseUp(parent_id, currentDistance + snpCount);// Traverse further up - } - } - // Helper function to traverse down (towards the children) - function 
traverseDown(node, currentDistance) { - if (!nodeMap[node]|| currentDistance > distanceThreshold) {return;}//if node is a leaf node, or it threshold is reached, return - for (const child of nodeMap[node].children) {// Traverse all children - let decodedChild=child.split("=")//split encoded child into internal ID and SNP distance - let childId=decodedChild[0]//get internal ID of child - let childSnpDist=parseInt(decodedChild[1])//get SNP distance of child - let newTotal=currentDistance+childSnpDist//add SNP distance of child to current distance - //console.log("traversing down, new node is "+childId+" with distance "+childSnpDist+" for new total "+newTotal) - if (!visited.has(childId)&&!visited.has(decodedChild[3])) {//need a switch to add childs as genbank accession or node ID, since some sample names are repeated - if (childId.match(/^\d+$/)){//if its just numbers, its an internal node, so we add it to visited as is - visited.add(childId); - } - else {visited.add(decodedChild[3]);}//if its not just numbers, its a leaf node, so we add the genbank accession to visited - if ((newTotal <= distanceThreshold)){ //dont add the root node, as its always going to be within SNP distance of itself - //console.log("adding node to results:"+childId+" with distance "+newTotal) - if (!nodeMap[childId]){//if its not an entry in node map, means its not an internal node, so we add it to the results - //console.log("adding node to results:"+decodedChild) - results.push([decodedChild[0], newTotal, decodedChild[2], decodedChild[3]]); - } - } - if (nodeMap[childId]){//if the child is an internal node, traverse down - traverseDown(childId,newTotal);// Traverse further down; pass ID, not node info itself - } - } - } - } - - // Start of the main function - //boolean obtained during traversal of the whole tree; if the queried sample exists in taxonium, this will flag as true - if (!myResult[1]) {//if boolean is falsey - console.log("Node not found in the tree");//its not a valid node, return error statement - return "Node not found in the tree"; - } - - let visited = new Set(); // To keep track of visited nodes - let results = []; // To store nodes within the distance threshold - visited.add(myResult[2]); //add ID of queried sample to visited - if (myResult[5]){//if the node is an internal node - traverseDown(myResult[2], 0);//start traversal from the internal node, we have a neutral distance of 0 - traverseUp(myResult[2], 0); - } - else{ - traverseDown(myResult[3], myResult[4]); - traverseUp(myResult[3], myResult[4]); - - } - //internal ID of the queried sample - // Traverse as far down as possible first, then go up, and traverse down again ignoring visited nodes - return results; - } - - let goodSamples = findNodesWithinDistance(sampleID, maxParsimony) - nodeMap=null; - return goodSamples - }) - .catch(error => { - // Catch any errors from processJsonLines or thrown in the then block - console.error('Error in getParsimonySamples:', error); - return "Error processing samples"; - }); -} -/* -getParsimonySamples("node_960478", 5) - .then(result => { - console.log("Results:", result); - }) - .catch(error => { - console.error("Error processing samples:", error); - }); -*/ -export default getParsimonySamples; - -/* -NOTES: -*/ +import processJsonLines from "./nodeMapper.js"; + +/* +getParsimonySamples function outline: +inputs: + sampleID, which is the explicit node or sample name, not internal ID + maxParsimony, which is the SNP distance threshold of interest + Nested functions: + processJsonLines: main worker of the 
backend, which reads in the jsonl file and constructs a map of all internal nodes, children, and mutations, and checks if the sample exists in the tree + findNodesWithinDistance: a helper function to find all nodes within a certain distance of a given node + traverses up and down the tree, adding nodes to a results array if they are within the distance threshold + returns the results array + traverseUp: helper function to traverse up the tree, adding nodes to the results array if they are within the distance threshold + traverseDown: helper function to traverse down the tree, adding nodes to the results array if they are within the distance threshold +outputs: should output a simple list/array-like of internal IDs and their SNP distances from the queried node, some flags that determine whether or not a valid search was performed, as well as a map of each nodes name with genbank accession and pangolin lineage +with this list as a result, we can then query the backend for more information about each node, like name, mutations, etc, if needed + this above includes if specificMut is passed through, which would allow for filtering based on whether or not a node has a specific mutation, + but the filtering is done after all SNPs within distance are found, to reduce processing time if flag isnt specified +once the list is obtained, snpComponent formats the list for output into Taxonium(Big step) +*/ +async function getParsimonySamples(sampleID, maxParsimony) { + return processJsonLines( + "https://cov2tree.nyc3.cdn.digitaloceanspaces.com/latest_public.jsonl.gz", + sampleID + ) + .then((myResult) => { + //answersArray=[nodes, foundSample, foundSampleID, foundParentID, foundSNPCount, isBranch] + if (myResult === "Error parsing JSON") { + //if error parsing JSON, return error + return "Error parsing JSON"; + } + var nodeMap = myResult[0]; //index of all internal nodes and children + // Main function to find all nodes within a certain distance of a given node + function findNodesWithinDistance(node, distanceThreshold) { + // Helper function to traverse up (towards the parent) + function traverseUp(node, currentDistance) { + var parent_id = nodeMap[node].parent_id; + var snpCount = nodeMap[node].snpCount; + if (parent_id === node || currentDistance > distanceThreshold) { + //if root node(root has itself as parent), or if threshold is reached, + //console.log("reached root node or threshold, returning at distance "+currentDistance+" from node "+node+" with parent "+parent_id+" and snpCount "+snpCount) + return; //end traversal + } + //console.log("traversing up, new node is "+parent_id+" with distance "+(currentDistance + snpCount)) + if (!visited.has(parent_id)) { + // Check if this node has already been visited to avoid infinite loops + visited.add(parent_id); + traverseDown(parent_id, currentDistance + snpCount); //Traverse down from the parent + traverseUp(parent_id, currentDistance + snpCount); // Traverse further up + } + } + // Helper function to traverse down (towards the children) + function traverseDown(node, currentDistance) { + if (!nodeMap[node] || currentDistance > distanceThreshold) { + return; + } //if node is a leaf node, or it threshold is reached, return + for (const child of nodeMap[node].children) { + // Traverse all children + let decodedChild = child.split("="); //split encoded child into internal ID and SNP distance + let childId = decodedChild[0]; //get internal ID of child + let childSnpDist = parseInt(decodedChild[1]); //get SNP distance of child + let newTotal = 
currentDistance + childSnpDist; //add SNP distance of child to current distance + //console.log("traversing down, new node is "+childId+" with distance "+childSnpDist+" for new total "+newTotal) + if (!visited.has(childId) && !visited.has(decodedChild[3])) { + //need a switch to add childs as genbank accession or node ID, since some sample names are repeated + if (childId.match(/^\d+$/)) { + //if its just numbers, its an internal node, so we add it to visited as is + visited.add(childId); + } else { + visited.add(decodedChild[3]); + } //if its not just numbers, its a leaf node, so we add the genbank accession to visited + if (newTotal <= distanceThreshold) { + //dont add the root node, as its always going to be within SNP distance of itself + //console.log("adding node to results:"+childId+" with distance "+newTotal) + if (!nodeMap[childId]) { + //if its not an entry in node map, means its not an internal node, so we add it to the results + //console.log("adding node to results:"+decodedChild) + results.push([ + decodedChild[0], + newTotal, + decodedChild[2], + decodedChild[3], + ]); + } + } + if (nodeMap[childId]) { + //if the child is an internal node, traverse down + traverseDown(childId, newTotal); // Traverse further down; pass ID, not node info itself + } + } + } + } + + // Start of the main function + //boolean obtained during traversal of the whole tree; if the queried sample exists in taxonium, this will flag as true + if (!myResult[1]) { + //if boolean is falsey + console.log("Node not found in the tree"); //its not a valid node, return error statement + return "Node not found in the tree"; + } + + let visited = new Set(); // To keep track of visited nodes + let results = []; // To store nodes within the distance threshold + visited.add(myResult[2]); //add ID of queried sample to visited + if (myResult[5]) { + //if the node is an internal node + traverseDown(myResult[2], 0); //start traversal from the internal node, we have a neutral distance of 0 + traverseUp(myResult[2], 0); + } else { + traverseDown(myResult[3], myResult[4]); + traverseUp(myResult[3], myResult[4]); + } + //internal ID of the queried sample + // Traverse as far down as possible first, then go up, and traverse down again ignoring visited nodes + return results; + } + + let goodSamples = findNodesWithinDistance(sampleID, maxParsimony); + nodeMap = null; + return goodSamples; + }) + .catch((error) => { + // Catch any errors from processJsonLines or thrown in the then block + console.error("Error in getParsimonySamples:", error); + return "Error processing samples"; + }); +} +/* +getParsimonySamples("node_960478", 5) + .then(result => { + console.log("Results:", result); + }) + .catch(error => { + console.error("Error processing samples:", error); + }); +*/ +export default getParsimonySamples; + +/* +NOTES: +*/ diff --git a/taxonium_component/src/utils/nodeMapper.js b/taxonium_component/src/utils/nodeMapper.js index 14ac8c16..3eb660af 100644 --- a/taxonium_component/src/utils/nodeMapper.js +++ b/taxonium_component/src/utils/nodeMapper.js @@ -1,162 +1,194 @@ -/* -TODO: -*/ - -async function processJsonLines(url,sampleID) { - // Fetch the gzipped JSONL file - //const startTime = new Date(); // Start timing - const response = await fetch(url); - - // Ensure the fetch was successful - if (!response.ok) { - throw new Error(`HTTP error! 
status: ${response.status}`); - } - - // Stream the response through decompression and decoding - const decompressedStream = response.body.pipeThrough(new DecompressionStream('gzip')); - const textStream = decompressedStream.pipeThrough(new TextDecoderStream()); - - // Reader to read the stream line by line - const reader = textStream.getReader(); - let remainder = ''; - let result; - let nodes = {}; - let foundSample=false;//we will be looking for a specific ID when we construct - let foundSampleID="" - let foundParentID="" - let foundSNPCount=0 - let isBranch=false - while (!(result = await reader.read()).done) { - const chunk = remainder + result.value; - const lines = chunk.split('\n'); - remainder = lines.pop(); // Save the last line in case it's incomplete - for (const line of lines) { - if (line) { - var snpCount=0; - try { - const json = JSON.parse(line); - if (json.config){//if line has the config file, skip it to avoid an error - continue;//this first line also has mutations dictionary for decoding, if we need that later - } - for (const mut of json.mutations){ - if (mut>107435){ - snpCount+=1; - } - } - if (json.name===sampleID){//check if this is the sample we will be searching for - foundSample=true;//if it is, we have found it - foundSampleID=json.node_id//store its ID so we can use it later - foundParentID=json.parent_id//need to get parent ID of first node as a jumping off point for internal nodes, since theyre not being stored - foundSNPCount=snpCount - if (json.name.includes("node_")) { - isBranch=true - } - //console.log(json) - } - - if (json.name.includes("node_")) { // Check if the node is internal - var encodedChild=(String(json.node_id)+"="+String(snpCount))//encode child and snp count without further nesting, as trying to store them as separate objects causes Stringify error due to excessive nesting - if (!nodes[json.node_id]) {//if internal, but not added to list - nodes[json.node_id] = {//create new node - parent_id: json.parent_id, - snpCount: snpCount, - children: [] - }; - if (!nodes[nodes[json.node_id].parent_id]){//if the parent is not yet added to the list, - nodes[nodes[json.node_id].parent_id] = {// add it to the list, with null name and parent, since we wont have that info until we read in parent node - parent_id: null, - snpCount: null, - children: [encodedChild]//store the node ID and the number of mutations - }; - } - else{ - nodes[nodes[json.node_id].parent_id].children.push(encodedChild);// if the parent node has been added, add this node to its children - } - } - if(nodes[json.node_id] && (nodes[json.node_id].parent_id===null || nodes[json.node_id].name===null)){//if we have added this parent node previously, but finally come across in JSON - //console.log("Node ID being updated:"+json.name) - nodes[json.node_id].parent_id=json.parent_id;//fill in the parent ID - nodes[json.node_id].snpCount=snpCount;//fill in the snp count - if (!nodes[nodes[json.node_id].parent_id]){//if this node, which was added by a previous step and therefore does not flag new internal step above, has a parent that has not been added to the list - nodes[nodes[json.node_id].parent_id] = {// so add it - parent_id: null, - snpCount: null, - children: [encodedChild]//store the node ID and the number of mutations - }; - } - else{ - nodes[nodes[json.node_id].parent_id].children.push(encodedChild);// if the parent node has been added, add this node to its children - } - } - } - else {// if doesnt contain "node_", then its a leaf node - 
encodedChild=(String(json.name)+"="+String(snpCount)+"="+String(json.meta_pangolin_lineage)+"="+String(json.meta_genbank_accession))//encode child and snp count without further nesting, as trying to store them as separate objects causes Stringify error due to excessive nesting - if (!nodes[json.parent_id]) {//we dont track leaf nodes, so if parent node is not in list, add it - nodes[json.parent_id] = {//add line which fills in these null values when we read in the parent node - parent_id: null, - snpCount: null, - children: [encodedChild] - }; - } else { - - nodes[json.parent_id].children.push(encodedChild);//if parent node is in list, add this node to its children - } - } - } catch (e) { - console.error('Error parsing JSON:', e); - return "Error parsing JSON" - } - } - } - } - - - var answersArray=[nodes, foundSample, foundSampleID, foundParentID, foundSNPCount, isBranch] - return answersArray; - } -/* -processJsonLines('https://cov2tree.nyc3.cdn.digitaloceanspaces.com/latest_public.jsonl.gz', "node_3").then(result => { - let sliced = Object.fromEntries(Object.entries(result[0][0]).slice(0,3))//get first 3 entries - console.log("First 3 entries: ",sliced) - //saveObjectToJson(result[0], 'C:/Users/david/my-app/src/InternalNodeMap.json'); -}) -.catch(error => { - console.error("Error processing samples:", error); -}); -function saveObjectToJson(dataObject, outputPath) { - const fs = require('fs'); - const JSONStream = require('JSONStream'); - return new Promise((resolve, reject) => { - const writeStream = fs.createWriteStream(outputPath); - const stringifyStream = JSONStream.stringifyObject(); - stringifyStream.pipe(writeStream); - - writeStream.on('finish', () => { - console.log('JSON file has been written successfully.'); - resolve(); - }); - - writeStream.on('error', (error) => { - console.error('Stream write error:', error); - reject(error); - }); - - stringifyStream.on('error', (error) => { - console.error('JSON stringify error:', error); - reject(error); - }); - - for (const key in dataObject) { - stringifyStream.write([key, dataObject[key]]); - } - stringifyStream.end(); - }); -} -*/ -export default processJsonLines; - -// Usage example -//at ~2gb of ram, 4.2ghz with 6 cores, a little under 60sec when reading from url -//time to write to file is more extensive, but ideally not a factor if its happening in the backend -//time to query backend for single node: ~0.6s -//time to add snp dist when reading is negligible \ No newline at end of file +/* +TODO: +*/ + +async function processJsonLines(url, sampleID) { + // Fetch the gzipped JSONL file + //const startTime = new Date(); // Start timing + const response = await fetch(url); + + // Ensure the fetch was successful + if (!response.ok) { + throw new Error(`HTTP error! 
status: ${response.status}`); + } + + // Stream the response through decompression and decoding + const decompressedStream = response.body.pipeThrough( + new DecompressionStream("gzip") + ); + const textStream = decompressedStream.pipeThrough(new TextDecoderStream()); + + // Reader to read the stream line by line + const reader = textStream.getReader(); + let remainder = ""; + let result; + let nodes = {}; + let foundSample = false; //we will be looking for a specific ID when we construct + let foundSampleID = ""; + let foundParentID = ""; + let foundSNPCount = 0; + let isBranch = false; + while (!(result = await reader.read()).done) { + const chunk = remainder + result.value; + const lines = chunk.split("\n"); + remainder = lines.pop(); // Save the last line in case it's incomplete + for (const line of lines) { + if (line) { + var snpCount = 0; + try { + const json = JSON.parse(line); + if (json.config) { + //if line has the config file, skip it to avoid an error + continue; //this first line also has mutations dictionary for decoding, if we need that later + } + for (const mut of json.mutations) { + if (mut > 107435) { + snpCount += 1; + } + } + if (json.name === sampleID) { + //check if this is the sample we will be searching for + foundSample = true; //if it is, we have found it + foundSampleID = json.node_id; //store its ID so we can use it later + foundParentID = json.parent_id; //need to get parent ID of first node as a jumping off point for internal nodes, since theyre not being stored + foundSNPCount = snpCount; + if (json.name.includes("node_")) { + isBranch = true; + } + //console.log(json) + } + + if (json.name.includes("node_")) { + // Check if the node is internal + var encodedChild = String(json.node_id) + "=" + String(snpCount); //encode child and snp count without further nesting, as trying to store them as separate objects causes Stringify error due to excessive nesting + if (!nodes[json.node_id]) { + //if internal, but not added to list + nodes[json.node_id] = { + //create new node + parent_id: json.parent_id, + snpCount: snpCount, + children: [], + }; + if (!nodes[nodes[json.node_id].parent_id]) { + //if the parent is not yet added to the list, + nodes[nodes[json.node_id].parent_id] = { + // add it to the list, with null name and parent, since we wont have that info until we read in parent node + parent_id: null, + snpCount: null, + children: [encodedChild], //store the node ID and the number of mutations + }; + } else { + nodes[nodes[json.node_id].parent_id].children.push( + encodedChild + ); // if the parent node has been added, add this node to its children + } + } + if ( + nodes[json.node_id] && + (nodes[json.node_id].parent_id === null || + nodes[json.node_id].name === null) + ) { + //if we have added this parent node previously, but finally come across in JSON + //console.log("Node ID being updated:"+json.name) + nodes[json.node_id].parent_id = json.parent_id; //fill in the parent ID + nodes[json.node_id].snpCount = snpCount; //fill in the snp count + if (!nodes[nodes[json.node_id].parent_id]) { + //if this node, which was added by a previous step and therefore does not flag new internal step above, has a parent that has not been added to the list + nodes[nodes[json.node_id].parent_id] = { + // so add it + parent_id: null, + snpCount: null, + children: [encodedChild], //store the node ID and the number of mutations + }; + } else { + nodes[nodes[json.node_id].parent_id].children.push( + encodedChild + ); // if the parent node has been added, add this node to its 
children + } + } + } else { + // if doesnt contain "node_", then its a leaf node + encodedChild = + String(json.name) + + "=" + + String(snpCount) + + "=" + + String(json.meta_pangolin_lineage) + + "=" + + String(json.meta_genbank_accession); //encode child and snp count without further nesting, as trying to store them as separate objects causes Stringify error due to excessive nesting + if (!nodes[json.parent_id]) { + //we dont track leaf nodes, so if parent node is not in list, add it + nodes[json.parent_id] = { + //add line which fills in these null values when we read in the parent node + parent_id: null, + snpCount: null, + children: [encodedChild], + }; + } else { + nodes[json.parent_id].children.push(encodedChild); //if parent node is in list, add this node to its children + } + } + } catch (e) { + console.error("Error parsing JSON:", e); + return "Error parsing JSON"; + } + } + } + } + + var answersArray = [ + nodes, + foundSample, + foundSampleID, + foundParentID, + foundSNPCount, + isBranch, + ]; + return answersArray; +} +/* +processJsonLines('https://cov2tree.nyc3.cdn.digitaloceanspaces.com/latest_public.jsonl.gz', "node_3").then(result => { + let sliced = Object.fromEntries(Object.entries(result[0][0]).slice(0,3))//get first 3 entries + console.log("First 3 entries: ",sliced) + //saveObjectToJson(result[0], 'C:/Users/david/my-app/src/InternalNodeMap.json'); +}) +.catch(error => { + console.error("Error processing samples:", error); +}); +function saveObjectToJson(dataObject, outputPath) { + const fs = require('fs'); + const JSONStream = require('JSONStream'); + return new Promise((resolve, reject) => { + const writeStream = fs.createWriteStream(outputPath); + const stringifyStream = JSONStream.stringifyObject(); + stringifyStream.pipe(writeStream); + + writeStream.on('finish', () => { + console.log('JSON file has been written successfully.'); + resolve(); + }); + + writeStream.on('error', (error) => { + console.error('Stream write error:', error); + reject(error); + }); + + stringifyStream.on('error', (error) => { + console.error('JSON stringify error:', error); + reject(error); + }); + + for (const key in dataObject) { + stringifyStream.write([key, dataObject[key]]); + } + stringifyStream.end(); + }); +} +*/ +export default processJsonLines; + +// Usage example +//at ~2gb of ram, 4.2ghz with 6 cores, a little under 60sec when reading from url +//time to write to file is more extensive, but ideally not a factor if its happening in the backend +//time to query backend for single node: ~0.6s +//time to add snp dist when reading is negligible
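For reference, a minimal sketch of the streaming approach processJsonLines takes: fetch the gzipped JSONL, decompress and decode it as it arrives, and split on newlines while carrying the trailing partial line into the next chunk. This assumes a runtime that provides fetch, DecompressionStream and TextDecoderStream (current browsers); the URL and the onLine callback are illustrative placeholders, not part of the patch.

// Hedged sketch, not the patch's code: stream a gzipped JSONL file line by line.
async function streamJsonlGz(url, onLine) {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`HTTP error! status: ${response.status}`);
  }
  // Decompress and decode the body as it arrives instead of buffering it all.
  const textStream = response.body
    .pipeThrough(new DecompressionStream("gzip"))
    .pipeThrough(new TextDecoderStream());
  const reader = textStream.getReader();
  let remainder = "";
  let chunk;
  while (!(chunk = await reader.read()).done) {
    // Keep the trailing partial line and prepend it to the next chunk.
    const lines = (remainder + chunk.value).split("\n");
    remainder = lines.pop();
    for (const line of lines) {
      if (line) onLine(JSON.parse(line));
    }
  }
  if (remainder) onLine(JSON.parse(remainder));
}

// Usage (placeholder URL): streamJsonlGz("https://example.org/tree.jsonl.gz", (obj) => console.log(obj.name));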
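And a hedged sketch of the distance search extract.js performs over the map that processJsonLines builds: walk up through parents and down through children, accumulating per-branch SNP counts, and collect leaves whose total stays within the threshold. The tiny nodeMap, sample names and accessions below are invented purely for illustration; the real code also de-duplicates leaves by GenBank accession because sample names can repeat.

// Hedged sketch of findNodesWithinDistance; nodeMap entries are { parent_id, snpCount, children },
// with children encoded as "id=snpCount" (leaves carry extra "=lineage=accession" fields).
const nodeMap = {
  1: { parent_id: 1, snpCount: 0, children: ["2=1", "sampleA=2=B.1=OX000001"] },
  2: { parent_id: 1, snpCount: 1, children: ["sampleB=1=B.1.1=OX000002"] },
};

function findNodesWithinDistance(startId, threshold) {
  const visited = new Set([String(startId)]);
  const results = [];

  function traverseDown(id, dist) {
    if (!nodeMap[id] || dist > threshold) return; // leaf reached or out of range
    for (const encoded of nodeMap[id].children) {
      const [childId, snp, lineage, accession] = encoded.split("=");
      const total = dist + parseInt(snp, 10);
      if (visited.has(childId)) continue;
      visited.add(childId);
      if (!nodeMap[childId] && total <= threshold) {
        results.push([childId, total, lineage, accession]); // leaf within range
      }
      if (nodeMap[childId]) traverseDown(childId, total); // recurse into internal nodes
    }
  }

  function traverseUp(id, dist) {
    const { parent_id, snpCount } = nodeMap[id];
    if (parent_id === id || dist > threshold) return; // root (its own parent) or out of range
    if (!visited.has(String(parent_id))) {
      visited.add(String(parent_id));
      traverseDown(parent_id, dist + snpCount); // explore the sibling subtrees
      traverseUp(parent_id, dist + snpCount); // then continue toward the root
    }
  }

  traverseDown(startId, 0);
  traverseUp(startId, 0);
  return results;
}

// e.g. findNodesWithinDistance(1, 3) collects both toy leaves at distance 2.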