Skip to content

Commit

Permalink
Fix invalid base skipping
Browse files (browse the repository at this point in the history)
  • Loading branch information
andrewdalpino committed Oct 14, 2024
1 parent 59071de commit 86eec41
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions src/dna_hash/tokenizers.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -5,8 +5,10 @@
INVALID_BASE_REGEX = r'[^ACTG]'

class Tokenizer(ABC):
"""Base tokenizer class"""

@abstractmethod
def tokenize(self, sequence: str):
def tokenize(self, sequence: str) -> Generator[str, None, None]:
pass

class Kmer(Tokenizer):
Expand All @@ -22,7 +24,9 @@ def __init__(self, k: int) -> None:

def tokenize(self, sequence: str) -> Generator[str, None, None]:
"""Tokenize the sequence."""
for i in range(0, len(sequence) - self.k):
i = 0

while i < len(sequence) - self.k:
token = sequence[i:i + self.k]

invalid_token = self.invalid_base.search(token)
Expand All @@ -38,6 +42,8 @@ def tokenize(self, sequence: str) -> Generator[str, None, None]:

yield token

i += 1

class Canonical(Tokenizer):
"""Tokenize sequences in their canonical form."""

Expand Down

0 comments on commit 86eec41

Please sign in to comment.