diff --git a/taxonium_component/src/components/SearchPanel.jsx b/taxonium_component/src/components/SearchPanel.jsx index f7eff11c..10adb9cf 100644 --- a/taxonium_component/src/components/SearchPanel.jsx +++ b/taxonium_component/src/components/SearchPanel.jsx @@ -18,6 +18,8 @@ import classNames from "classnames"; import SearchDisplayToggle from "./SearchDisplayToggle"; +import SNPOutputModal from "./snpModal"; + const prettify_x_types = { x_dist: "Distance", x_time: "Time" }; const formatNumber = (num) => { @@ -72,6 +74,14 @@ function SearchPanel({ }, [selectedDetails.nodeDetails]); const [listOutputModalOpen, setListOutputModalOpen] = useState(false); + const [snpSearchModalOpen, setSnpSearchModalOpen] = useState(false); + const [showSNPButton, setShowSNPButton] = useState(false); + useEffect(() => { + // Check if the current URL includes "api.cov2tree.org" + if (window.location.href.includes("api.cov2tree.org")) { + setShowSNPButton(true); + } + }, []); const handleDownloadJson = () => { if (selectedDetails.nodeDetails) { @@ -418,6 +428,22 @@ function SearchPanel({ Add a new search + {showSNPButton && ( + + )} + {showSNPButton && ( + + )} {selectedDetails.nodeDetails && ( diff --git a/taxonium_component/src/components/snpModal.jsx b/taxonium_component/src/components/snpModal.jsx new file mode 100644 index 00000000..9a867264 --- /dev/null +++ b/taxonium_component/src/components/snpModal.jsx @@ -0,0 +1,369 @@ +import Modal from "react-modal"; +import { useState } from "react"; +import getParsimonySamples from "../utils/extract.js"; + +/* +Testing Search: node_960478, 5 + -should return 97 results, exluding internal nodes +Germany/IMS-10245-CVDP-57CCA4FC-E286-4E50-AB0F-727CDF76B7BF/2022, 4 +should return 0 +TODO: +Additional Features to maybe include: +Maybe make each element in the drop down a clickable element to select the node in the SearchPanel +a csv download option for the output +*/ +async function getSNPneighbors(nodeId, integerValue, callback) { + // Dummy 
backend function + try { + let results = await getParsimonySamples(nodeId, integerValue); + callback(null, results); + } catch (err) { + callback(err, null); + } +} + +const SNPOutputModal = ({ snpOutputModalOpen, setSnpOutputModalOpen }) => { + const [nodeId, setNodeId] = useState(""); + const [integerValue, setIntegerValue] = useState(""); + const [fullOutput, setFullOutput] = useState([]); + const [displayOutput, setDisplayOutput] = useState([]); + const [remainingCount, setRemainingCount] = useState(0); + const [loading, setLoading] = useState(false); + const [JsonError, setJsonError] = useState(false); + const [MissingNoError, setMissingError] = useState(false); + const [ParsimonyError, setParsimonyError] = useState(false); + const [emptyReturn, setEmpty] = useState(false); + + const handleSearch = () => { + if (nodeId && integerValue) { + setLoading(true); + setFullOutput([]); + setDisplayOutput([]); + setRemainingCount(0); + setJsonError(false); + setMissingError(false); + setParsimonyError(false); + setEmpty(false); + const startTime = new Date(); + getSNPneighbors(nodeId, integerValue, (err, res) => { + if (err) { + console.log(err); + } else if (res === "Error parsing JSON") { + setJsonError(true); + } else if (res === "Node not found in the tree") { + setMissingError(true); + } else if (res === "Error parsing JSON") { + setParsimonyError(true); + } else if (res.length === 0) { + setEmpty(true); + } else { + setFullOutput(res); + setDisplayOutput(res.slice(0, 100)); + setRemainingCount(res.length > 100 ? 
res.length - 100 : 0); + const endTime = new Date(); // End timing + const timeDiff = endTime - startTime; // Time in milliseconds + console.log(`Time taken: ${timeDiff} ms`); + } + setLoading(false); + }); + } + }; + const handleCloseModal = () => { + // Reset all state variables on modal close + setSnpOutputModalOpen(false); + setNodeId(""); + setIntegerValue(""); + setFullOutput([]); + setDisplayOutput([]); + setRemainingCount(0); + setLoading(false); + setJsonError(false); + setMissingError(false); + setParsimonyError(false); + setEmpty(false); + }; + const convertToTSV = (data) => { + const header = "Sample Name\tDistance\tPANGO Lineage\tGenbank Accession\n"; + const rows = data + .map((row) => `${row[0]}\t${row[1]}\t${row[2]}\t${row[3]}`) + .join("\n"); + return header + rows; + }; + const downloadTSV = (data) => { + const tsvString = convertToTSV(data); + const blob = new Blob([tsvString], { type: "text/tab-separated-values" }); + const href = URL.createObjectURL(blob); + const link = document.createElement("a"); + link.href = href; + link.download = `${nodeId}_SNP${integerValue}.tsv`; + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + URL.revokeObjectURL(href); + }; + return ( + + +

+ SNP Distance Search +

+
+
+ +
+
+ +
+ +
+ {loading && ( +
+ Loading data. This may take a minute... +
+ )} + {!loading && JsonError && ( +
+ Error parsing JSON. +
+ )} + {!loading && MissingNoError && ( +
+ Node not found in the tree. +
+ )} + {!loading && ParsimonyError && ( +
+ Error traversing tree. +
+ )} + {!loading && emptyReturn && ( +
+ No results found; check your sample ID or change SNP distance. +
+ )} + {!loading && !emptyReturn && displayOutput.length === 0 && ( +
+ No data available. +
+ )} + {!loading && displayOutput.length > 0 && ( + <> +
+ + + + + + + + + + + {displayOutput.map( + ([name, distance, lineage, accession], index) => ( + + + + + + + ) + )} + +
+ Sample Name + + Distance + + PANGO Lineage + + Genbank Accession +
+ {name} + + {distance} + + {lineage} + + {accession} +
+ {remainingCount > 0 && ( +
+ ...with {remainingCount} more items +
+ )} +
+ + )} + {!loading && displayOutput.length > 0 && ( + + )} +
+ ); +}; + +export default SNPOutputModal; +/* +
+ +
/**
 * Find every sample (leaf) within a given SNP distance of a queried node.
 *
 * The tree index is produced by `processJsonLines`, which returns
 * `[nodeMap, foundSample, foundSampleID, foundParentID, foundSNPCount, isBranch]`.
 * `nodeMap[id]` is `{ parent_id, snpCount, children }`, where each child is an
 * encoded string: `"<node_id>=<snps>"` for an internal node and
 * `"<name>=<snps>=<lineage>=<accession>"` for a leaf.
 *
 * @param {string} sampleID - Name of the sample or internal node to search from.
 * @param {number|string} maxParsimony - Maximum SNP distance (inclusive). Numeric
 *   strings are accepted because the UI passes the raw input value.
 * @param {string} [treeUrl] - Location of the gzipped JSONL tree; defaults to the
 *   public cov2tree build.
 * @returns {Promise<Array<[string, number, string, string]>|string>} Rows of
 *   `[name, distance, lineage, accession]`, or one of the status strings
 *   "Error parsing JSON", "Node not found in the tree", "Error processing samples".
 */
async function getParsimonySamples(
  sampleID,
  maxParsimony,
  treeUrl = "https://cov2tree.nyc3.cdn.digitaloceanspaces.com/latest_public.jsonl.gz"
) {
  try {
    const treeIndex = await processJsonLines(treeUrl, sampleID);
    if (treeIndex === "Error parsing JSON") {
      return "Error parsing JSON";
    }

    const [
      nodeMap,
      foundSample,
      foundSampleID,
      foundParentID,
      foundSNPCount,
      isBranch,
    ] = treeIndex;

    if (!foundSample) {
      console.log("Node not found in the tree");
      return "Node not found in the tree";
    }

    // Coerce once so every comparison below is numeric. A non-numeric
    // threshold can never match anything, so skip the whole-tree walk.
    const threshold = Number(maxParsimony);
    if (Number.isNaN(threshold)) {
      return [];
    }

    const results = [];
    // Internal node IDs whose children have been (or are being) scanned.
    // Always stored as strings: IDs arrive as numbers from `parent_id` but as
    // strings from the encoded child entries, and a Set does not coerce.
    const visited = new Set();

    // When the query is a leaf, locate its own entry under its parent so it is
    // not reported as a neighbour of itself. Names can repeat, so match on
    // name + branch length and take the last match, mirroring how the index
    // keeps the last record seen for a name.
    let skipParentKey = null;
    let skipIndex = -1;
    if (!isBranch) {
      skipParentKey = String(foundParentID);
      const ownPrefix = `${sampleID}=${foundSNPCount}=`;
      const parentEntry = nodeMap[foundParentID];
      const siblings = parentEntry ? parentEntry.children : [];
      for (let i = siblings.length - 1; i >= 0; i -= 1) {
        if (siblings[i].startsWith(ownPrefix)) {
          skipIndex = i;
          break;
        }
      }
    }

    // Walk towards the leaves, collecting samples within the threshold.
    const descend = (nodeId, distance) => {
      const entry = nodeMap[nodeId];
      if (!entry || distance > threshold) {
        return;
      }
      const isQueryParent = String(nodeId) === skipParentKey;
      entry.children.forEach((encoded, index) => {
        if (isQueryParent && index === skipIndex) {
          return; // the queried sample itself
        }
        const parts = encoded.split("=");
        const total = distance + Number.parseInt(parts[1], 10);
        if (parts.length === 2) {
          // Internal node: "<node_id>=<snps>".
          const childId = parts[0];
          if (visited.has(childId)) {
            return;
          }
          visited.add(childId);
          descend(childId, total);
        } else if (total <= threshold) {
          // Leaf. Every internal node is scanned at most once, so each leaf is
          // seen at most once and needs no de-duplication of its own.
          results.push([parts[0], total, parts[2], parts[3]]);
        }
      });
    };

    // Walk towards the root, descending into each ancestor's other subtrees.
    const climb = (nodeId, distance) => {
      let current = nodeId;
      let travelled = distance;
      while (travelled <= threshold) {
        const entry = nodeMap[current];
        if (!entry) {
          break;
        }
        const parentId = entry.parent_id;
        // The root is its own parent; a null parent means the record for this
        // node was never read.
        if (parentId == null || String(parentId) === String(current)) {
          break;
        }
        const parentKey = String(parentId);
        if (visited.has(parentKey)) {
          break;
        }
        visited.add(parentKey);
        travelled += entry.snpCount;
        descend(parentId, travelled);
        current = parentId;
      }
    };

    if (isBranch) {
      // Internal query: the node itself is the origin, at distance 0.
      visited.add(String(foundSampleID));
      descend(foundSampleID, 0);
      climb(foundSampleID, 0);
    } else {
      // Leaf query: start from the parent, already one branch length away.
      visited.add(String(foundParentID));
      descend(foundParentID, foundSNPCount);
      climb(foundParentID, foundSNPCount);
    }

    return results;
  } catch (error) {
    // Covers fetch/stream failures from processJsonLines and anything thrown
    // during traversal.
    console.error("Error in getParsimonySamples:", error);
    return "Error processing samples";
  }
}
/**
 * Stream a gzipped JSONL tree and index it by internal node.
 *
 * Each non-config line is expected to carry `name`, `node_id`, `parent_id` and
 * `mutations`; leaves additionally contribute `meta_pangolin_lineage` and
 * `meta_genbank_accession`. A record whose name contains "node_" is treated as
 * an internal node; everything else is a leaf. Leaves are not stored as map
 * entries, only as encoded strings in their parent's `children`:
 *   internal child: "<node_id>=<snpCount>"
 *   leaf child:     "<name>=<snpCount>=<lineage>=<accession>"
 *
 * @param {string} url - Location of the gzipped JSONL file.
 * @param {string} sampleID - Name of the record to look for while indexing.
 * @returns {Promise<Array|string>} `[nodes, foundSample, foundSampleID,
 *   foundParentID, foundSNPCount, isBranch]`, or the string
 *   "Error parsing JSON" when a line cannot be parsed or processed.
 * @throws {Error} When the HTTP response is not OK. Stream and decompression
 *   failures also reject.
 */
async function processJsonLines(url, sampleID) {
  // Only mutation indices strictly above this value count towards a branch's
  // SNP total. NOTE(review): presumably lower indices are non-nucleotide
  // entries in this dataset's mutation table; confirm before reusing the
  // function with a different tree.
  const SNP_MUTATION_INDEX_CUTOFF = 107435;
  const PARSE_ERROR = "Error parsing JSON";

  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`HTTP error! status: ${response.status}`);
  }

  // Decompress and decode on the fly so the whole file is never held at once.
  const reader = response.body
    .pipeThrough(new DecompressionStream("gzip"))
    .pipeThrough(new TextDecoderStream())
    .getReader();

  const nodes = {};
  let foundSample = false;
  let foundSampleID = "";
  let foundParentID = "";
  let foundSNPCount = 0;
  let isBranch = false;

  // Attach an encoded child to its parent, creating a placeholder entry when
  // the parent's own record has not been read yet.
  const registerChild = (parentId, encodedChild) => {
    if (nodes[parentId]) {
      nodes[parentId].children.push(encodedChild);
    } else {
      nodes[parentId] = {
        parent_id: null,
        snpCount: null,
        children: [encodedChild],
      };
    }
  };

  // Fold one JSONL record into the index. Throws on malformed input.
  const processLine = (line) => {
    const json = JSON.parse(line);
    if (json.config) {
      return; // header line, not a tree node
    }

    let snpCount = 0;
    for (const mut of json.mutations) {
      if (mut > SNP_MUTATION_INDEX_CUTOFF) {
        snpCount += 1;
      }
    }

    const isInternal = json.name.includes("node_");

    if (json.name === sampleID) {
      foundSample = true;
      foundSampleID = json.node_id;
      foundParentID = json.parent_id;
      foundSNPCount = snpCount;
      if (isInternal) {
        isBranch = true;
      }
    }

    if (!isInternal) {
      registerChild(
        json.parent_id,
        `${json.name}=${snpCount}=${json.meta_pangolin_lineage}=${json.meta_genbank_accession}`
      );
      return;
    }

    const existing = nodes[json.node_id];
    if (!existing) {
      nodes[json.node_id] = {
        parent_id: json.parent_id,
        snpCount,
        children: [],
      };
    } else if (existing.parent_id === null) {
      // Placeholder created earlier by one of this node's children.
      existing.parent_id = json.parent_id;
      existing.snpCount = snpCount;
    } else {
      return; // record for this node was already indexed
    }

    // The root lists itself as its parent; registering it as its own child
    // would put a cycle in the index.
    if (json.parent_id !== json.node_id) {
      registerChild(json.parent_id, `${json.node_id}=${snpCount}`);
    }
  };

  const tryProcessLine = (line) => {
    try {
      processLine(line);
      return true;
    } catch (e) {
      console.error("Error parsing JSON:", e);
      return false;
    }
  };

  let remainder = "";
  let result;
  while (!(result = await reader.read()).done) {
    const lines = (remainder + result.value).split("\n");
    remainder = lines.pop(); // last piece may be an incomplete line
    for (const line of lines) {
      if (line && !tryProcessLine(line)) {
        // Best-effort: stop the download instead of leaving the stream locked
        // and buffering. Nothing useful can be done if cancelling fails.
        reader.cancel().catch(() => {});
        return PARSE_ERROR;
      }
    }
  }

  // A file without a trailing newline leaves its final record here.
  if (remainder.trim() && !tryProcessLine(remainder)) {
    return PARSE_ERROR;
  }

  return [
    nodes,
    foundSample,
    foundSampleID,
    foundParentID,
    foundSNPCount,
    isBranch,
  ];
}