4
4
5
5
import itertools as it
6
6
from typing import Iterable
7
+ from typing import Optional
7
8
from typing import Sequence
8
9
9
10
from gcgc .exceptions import GCGCAlphabetLetterEncodingException
@@ -15,16 +16,24 @@ class EncodingAlphabet:
15
16
PADDING : str = "|"
16
17
START : str = ">"
17
18
END : str = "<"
19
+ MASK : str = "#"
18
20
19
21
# Convince linting that EncodingAlphabet will have a letters attribute.
20
22
letters : str
21
23
22
def __init__(
    self,
    kmer_size: int = 1,
    start_token: bool = True,
    end_token: bool = True,
    masked: bool = False,
):
    """Record the alphabet configuration and build its lookup tables.

    Args:
        kmer_size: Length of the k-mers, kept on ``self.kmer_size``.
        start_token: Flag kept on ``self.start`` (presumably toggles the
            START token, mirroring ``end_token`` -- confirm in ``tokens``).
        end_token: Flag kept on ``self.end``; when true the END token is
            part of the alphabet's tokens.
        masked: Flag kept on ``self.masked``; when true the MASK token is
            part of the alphabet's tokens.
    """
    # The flags must be in place before kmers_and_tokens is read below.
    self.start, self.end = start_token, end_token
    self.kmer_size = kmer_size
    self.masked = masked

    # Forward table: each entry of kmers_and_tokens -> its position.
    symbol_to_code = {}
    for code, symbol in enumerate(self.kmers_and_tokens):
        symbol_to_code[symbol] = code
    self.encoding_index = symbol_to_code

    # Reverse table, derived from the forward one so the two always agree.
    self.decoding_index = dict(zip(symbol_to_code.values(), symbol_to_code.keys()))
@@ -42,6 +51,8 @@ def tokens(self):
42
51
append_string .append (self .START )
43
52
if self .end :
44
53
append_string .append (self .END )
54
+ if self .masked :
55
+ append_string .append (self .MASK )
45
56
46
57
return "" .join (append_string )
47
58
@@ -64,6 +75,11 @@ def encoded_start(self):
64
75
"""Get the integer for the start character."""
65
76
return self .encode_token (self .START )
66
77
78
@property
def encoded_mask(self):
    """Return the integer code assigned to the MASK token."""
    mask_symbol = self.MASK
    return self.encode_token(mask_symbol)
82
+
67
83
@property
68
84
def encoded_end (self ):
69
85
"""Get the integer for the end character."""
@@ -98,23 +114,31 @@ def _kmer_one(self, seq):
98
114
except KeyError :
99
115
raise GCGCAlphabetLetterEncodingException (f"{ kmer } not in { self .encoding_index } " )
100
116
101
- def _kmer_n (self , seq : str ) -> Sequence [int ]:
117
+ def _kmer_n (self , seq : str , kmer_step_size : int ) -> Sequence [int ]:
102
118
try :
103
119
encoded = []
104
120
105
121
seq_len = len (seq )
106
122
iterations = seq_len - self .kmer_size + 1
107
123
108
- for i in range (0 , iterations ):
124
+ for i in range (0 , iterations , kmer_step_size ):
109
125
kmer = seq [i : i + self .kmer_size ]
110
126
encoded .append (self .encoding_index [kmer ])
111
127
return encoded
112
128
113
129
except KeyError :
114
130
raise GCGCAlphabetLetterEncodingException (f"{ kmer } not in { self .encoding_index } " )
115
131
116
- def integer_encode (self , seq : str ) -> Sequence [int ]:
117
- """Integer encode the sequence."""
132
+ def integer_encode (self , seq : str , kmer_step_size : Optional [int ] = None ) -> Sequence [int ]:
133
+ """Integer encode the sequence.
134
+
135
+ Args:
136
+ seq: The sequence to encode.
137
+ kmer_step_size: The size of the kmer step; if None, uses self.kmer_size.
138
+
139
+ Returns:
140
+ The list of integers that represent the sequence.
141
+ """
118
142
119
143
stripped_seq = "" .join (s for s in seq if s not in {self .START , self .END , self .PADDING })
120
144
seq_len = len (stripped_seq )
@@ -127,10 +151,11 @@ def integer_encode(self, seq: str) -> Sequence[int]:
127
151
if self .kmer_size == 1 :
128
152
encoded_seq = self ._kmer_one (stripped_seq )
129
153
else :
130
- encoded_seq = self ._kmer_n (stripped_seq )
154
+ passed_kmer_step_size = kmer_step_size if kmer_step_size is not None else self .kmer_size
155
+ encoded_seq = self ._kmer_n (stripped_seq , passed_kmer_step_size )
131
156
132
- if seq [ 0 ] == self .START :
133
- encoded_seq = [self .encoding_index [ self . START ] ] + encoded_seq
157
+ if seq . startswith ( self .START ) :
158
+ encoded_seq = [self .encoded_start ] + encoded_seq
134
159
135
160
non_seq_ending = "" .join (s for s in seq if s in {self .END , self .PADDING })
136
161
if non_seq_ending :
0 commit comments