Skip to content

Commit

Permalink
Add top k and histogram examples
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewdalpino committed Oct 15, 2024
1 parent 8be99e4 commit 389066a
Show file tree
Hide file tree
Showing 6 changed files with 426 additions and 12 deletions.
375 changes: 375 additions & 0 deletions examples/covid-19-virus.fasta

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions examples/histogram.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from dna_hash import DNAHash, tokenizers

from Bio import SeqIO

import matplotlib.pyplot as plt

hash_table = DNAHash(max_false_positive_rate=0.001)

tokenizer = tokenizers.Canonical(tokenizers.Kmer(6))

with open('covid-19-virus.fasta', 'r') as file:
for record in SeqIO.parse(file, 'fasta'):
for token in tokenizer.tokenize(str(record.seq)):
hash_table.increment(token)

counts, bins = hash_table.histogram(20)

plt.stairs(counts, bins)
plt.title('Histogram of SARS-CoV-2 Genome')
plt.xlabel('Counts')
plt.ylabel('Frequency')
plt.show()
19 changes: 19 additions & 0 deletions examples/top_k.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from dna_hash import DNAHash, tokenizers

from Bio import SeqIO

hash_table = DNAHash(max_false_positive_rate=0.001)

tokenizer = tokenizers.Canonical(tokenizers.Kmer(6))

with open('covid-19-virus.fasta', 'r') as file:
for record in SeqIO.parse(file, 'fasta'):
for token in tokenizer.tokenize(str(record.seq)):
hash_table.increment(token)

for sequence, count in hash_table.top(25):
print(f'{sequence}: {count}')

print(f'Total sequences: {hash_table.num_sequences}')
print(f'# of unique sequences: {hash_table.num_unique_sequences}')
print(f'# of singletons: {hash_table.num_singletons}')
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ readme = "README.md"
license = {text = "MIT"}

[project.optional-dependencies]
dev = ["mypy"]
dev = ["mypy", "biopython", "matplotlib", "PyQt6"]
test = ["mypy"]

[project.urls]
Expand Down
8 changes: 3 additions & 5 deletions src/dna_hash/dna_hash.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import math
from typing import Iterator
from typing import Iterator, Tuple
import sys

import numpy as np
Expand Down Expand Up @@ -163,11 +163,9 @@ def top(self, k: int = 10) -> Iterator:

yield (sequence, count)

def histogram(self, bins: int = 10) -> NDArray:
def histogram(self, bins: int = 10) -> Tuple[NDArray, NDArray]:
"""Return a histogram of sequences bucketed by their counts."""
histogram, edges = np.histogram(list(self.counts.values()), bins=bins)

return histogram
return np.histogram(list(self.counts.values()), bins=bins)

def __setitem__(self, sequence: str, count: int) -> None:
self.insert(sequence, count)
Expand Down
12 changes: 6 additions & 6 deletions tests/test_dna_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@

import dna_hash

BASES = ['A', 'C', 'T', 'G']

class TestDNAHash(unittest.TestCase):
@staticmethod
def random_read(k: int) -> str:
return ''.join(BASES[random.randint(0, 3)] for i in range(0, k))
BASES = ['A', 'C', 'T', 'G']

@classmethod
def random_read(cls, k: int) -> str:
return ''.join(cls.BASES[random.randint(0, 3)] for i in range(0, k))

def test_basic(self):
def test_increment(self):
hash_table = dna_hash.DNAHash()

self.assertEqual(hash_table.num_singletons, 0)
Expand Down

0 comments on commit 389066a

Please sign in to comment.