Add top k and histogram examples

andrewdalpino · Oct 15, 2024 · 389066a · 389066a
1 parent 8be99e4
commit 389066a
Show file tree

Hide file tree

Showing 6 changed files with 426 additions and 12 deletions.
diff --git a/examples/covid-19-virus.fasta b/examples/covid-19-virus.fasta
diff --git a/examples/histogram.py b/examples/histogram.py
@@ -0,0 +1,22 @@
+from dna_hash import DNAHash, tokenizers
+
+from Bio import SeqIO
+
+import matplotlib.pyplot as plt
+
+hash_table = DNAHash(max_false_positive_rate=0.001)  # per-token counter; the kwarg sets the max false positive rate
+# Tokenizer: Kmer(6) wrapped in Canonical. NOTE(review): presumably yields canonical 6-mers; confirm in dna_hash.tokenizers.
+tokenizer = tokenizers.Canonical(tokenizers.Kmer(6))
+# Tokenize every record of the FASTA file and count each token. The path is relative to the current working directory.
+with open('covid-19-virus.fasta', 'r') as file:
+    for record in SeqIO.parse(file, 'fasta'):
+        for token in tokenizer.tokenize(str(record.seq)):
+            hash_table.increment(token)
+# histogram(20) returns the (counts, bin edges) pair from np.histogram over the stored counts, using 20 bins.
+counts, bins = hash_table.histogram(20)
+# Draw the binned counts as a step plot, label it, and display the figure.
+plt.stairs(counts, bins)
+plt.title('Histogram of SARS-CoV-2 Genome')
+plt.xlabel('Counts')
+plt.ylabel('Frequency')
+plt.show()
diff --git a/examples/top_k.py b/examples/top_k.py
@@ -0,0 +1,19 @@
+from dna_hash import DNAHash, tokenizers
+
+from Bio import SeqIO
+
+hash_table = DNAHash(max_false_positive_rate=0.001)  # per-token counter; the kwarg sets the max false positive rate
+# Tokenizer: Kmer(6) wrapped in Canonical. NOTE(review): presumably yields canonical 6-mers; confirm in dna_hash.tokenizers.
+tokenizer = tokenizers.Canonical(tokenizers.Kmer(6))
+# Tokenize every record of the FASTA file and count each token. The path is relative to the current working directory.
+with open('covid-19-virus.fasta', 'r') as file:
+    for record in SeqIO.parse(file, 'fasta'):
+        for token in tokenizer.tokenize(str(record.seq)):
+            hash_table.increment(token)
+# Print the (sequence, count) pairs yielded by top(25). NOTE(review): presumably highest counts first; confirm in DNAHash.top.
+for sequence, count in hash_table.top(25):
+    print(f'{sequence}: {count}')
+# Report the summary statistics read from the table's attributes.
+print(f'Total sequences: {hash_table.num_sequences}')
+print(f'# of unique sequences: {hash_table.num_unique_sequences}')
+print(f'# of singletons: {hash_table.num_singletons}')
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,7 +19,7 @@ readme = "README.md"
 license = {text = "MIT"}
 
 [project.optional-dependencies]
-dev = ["mypy"]
+dev = ["mypy", "biopython", "matplotlib", "PyQt6"]
 test = ["mypy"]
 
 [project.urls]

diff --git a/src/dna_hash/dna_hash.py b/src/dna_hash/dna_hash.py
@@ -1,5 +1,5 @@
 import math
-from typing import Iterator
+from typing import Iterator, Tuple
 import sys
 
 import numpy as np
@@ -163,11 +163,9 @@ def top(self, k: int = 10) -> Iterator:
 
             yield (sequence, count)
 
-    def histogram(self, bins: int = 10) -> NDArray:
+    def histogram(self, bins: int = 10) -> Tuple[NDArray, NDArray]:
         """Return a histogram of sequences bucketed by their counts."""
-        histogram, edges = np.histogram(list(self.counts.values()), bins=bins)
-
-        return histogram
+        return np.histogram(list(self.counts.values()), bins=bins)
 
     def __setitem__(self, sequence: str, count: int) -> None:
         self.insert(sequence, count)

diff --git a/tests/test_dna_hash.py b/tests/test_dna_hash.py
@@ -3,14 +3,14 @@
 
 import dna_hash
 
-BASES = ['A', 'C', 'T', 'G']
-
 class TestDNAHash(unittest.TestCase):
-    @staticmethod
-    def random_read(k: int) -> str:
-        return ''.join(BASES[random.randint(0, 3)] for i in range(0, k))
+    BASES = ['A', 'C', 'T', 'G']
+
+    @classmethod
+    def random_read(cls, k: int) -> str:
+        return ''.join(cls.BASES[random.randint(0, 3)] for i in range(0, k))
 
-    def test_basic(self):
+    def test_increment(self):
         hash_table = dna_hash.DNAHash()
 
         self.assertEqual(hash_table.num_singletons, 0)