Skip to content
This repository was archived by the owner on Dec 21, 2022. It is now read-only.

Commit d49c060

Browse files
committed
Merge branch 'release/0.9.0'
2 parents 088504b + 251124a commit d49c060

File tree

17 files changed

+209
-79
lines changed

17 files changed

+209
-79
lines changed

.yeyo.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
version: 0.8.0
1+
version: 0.9.0
22
tag_template: v{{ yeyo_version }}
33
commit_template: v{{ yeyo_version }}
44
files:

CHANGELOG.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,23 @@
22

33
## Development
44

5+
### Added
6+
7+
- Parser now outputs the length of the tensor not including padding. This is
8+
useful for packing and length based iteration.
9+
- Generating masked output from the parse_record method is now available.
10+
- Alphabet can include an optional mask token.
11+
12+
### Changed
13+
14+
- Can now specify how large a kmer step size to use when supplying a kmer
15+
value.
16+
- Renames EncodedSeq.integer_encoded to EncodedSeq.get_integer_encoding, which
17+
takes a kmer_step_size to specify how large a step to take when encoding.
18+
- Add parsed_seq_len to the SequenceParser object to control how much padding to
19+
apply to the end of the integer encoded sequence. This is useful since a batch
20+
of tensors is expected to have the same size.
21+
522
## 0.8.0 (2019-07-04)
623

724
### Fixed

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,7 @@ test_integration:
6767
.PHONY: test_unit
6868
test_unit:
6969
pytest --cov-report term-missing --cov=gcgc -m 'not integration'
70+
71+
.PHONY: fmt
72+
fmt:
73+
black .

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.8.0
1+
0.9.0

docs/index.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,4 @@ $ pip install gcgc[torch]
3939

4040
## Documentation Version
4141

42-
The documentation you're reading was build for version: `0.8.0`.
42+
The documentation you're reading was built for version: `0.9.0`.

gcgc/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22
# All Rights Reserved
33
"""Top-level GCGC module."""
44

5-
__version__ = "0.8.0"
5+
__version__ = "0.9.0"

gcgc/alphabet/base.py

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import itertools as it
66
from typing import Iterable
7+
from typing import Optional
78
from typing import Sequence
89

910
from gcgc.exceptions import GCGCAlphabetLetterEncodingException
@@ -15,16 +16,24 @@ class EncodingAlphabet:
1516
PADDING: str = "|"
1617
START: str = ">"
1718
END: str = "<"
19+
MASK: str = "#"
1820

1921
# Convince linting that EncodingAlphabet will have a letters attribute.
2022
letters: str
2123

22-
def __init__(self, kmer_size: int = 1, start_token: bool = True, end_token: bool = True):
24+
def __init__(
25+
self,
26+
kmer_size: int = 1,
27+
start_token: bool = True,
28+
end_token: bool = True,
29+
masked: bool = False,
30+
):
2331
"""Create the EncodingAlphabet object."""
2432

2533
self.start = start_token
2634
self.end = end_token
2735
self.kmer_size = kmer_size
36+
self.masked = masked
2837

2938
self.encoding_index = {letter: idx for idx, letter in enumerate(self.kmers_and_tokens)}
3039
self.decoding_index = {idx: letter for letter, idx in self.encoding_index.items()}
@@ -42,6 +51,8 @@ def tokens(self):
4251
append_string.append(self.START)
4352
if self.end:
4453
append_string.append(self.END)
54+
if self.masked:
55+
append_string.append(self.MASK)
4556

4657
return "".join(append_string)
4758

@@ -64,6 +75,11 @@ def encoded_start(self):
6475
"""Get the integer for the start character."""
6576
return self.encode_token(self.START)
6677

78+
@property
79+
def encoded_mask(self):
80+
"""Get the integer for the mask character."""
81+
return self.encode_token(self.MASK)
82+
6783
@property
6884
def encoded_end(self):
6985
"""Get the integer for the end character."""
@@ -98,23 +114,31 @@ def _kmer_one(self, seq):
98114
except KeyError:
99115
raise GCGCAlphabetLetterEncodingException(f"{kmer} not in {self.encoding_index}")
100116

101-
def _kmer_n(self, seq: str) -> Sequence[int]:
117+
def _kmer_n(self, seq: str, kmer_step_size: int) -> Sequence[int]:
102118
try:
103119
encoded = []
104120

105121
seq_len = len(seq)
106122
iterations = seq_len - self.kmer_size + 1
107123

108-
for i in range(0, iterations):
124+
for i in range(0, iterations, kmer_step_size):
109125
kmer = seq[i : i + self.kmer_size]
110126
encoded.append(self.encoding_index[kmer])
111127
return encoded
112128

113129
except KeyError:
114130
raise GCGCAlphabetLetterEncodingException(f"{kmer} not in {self.encoding_index}")
115131

116-
def integer_encode(self, seq: str) -> Sequence[int]:
117-
"""Integer encode the sequence."""
132+
def integer_encode(self, seq: str, kmer_step_size: Optional[int] = None) -> Sequence[int]:
133+
"""Integer encode the sequence.
134+
135+
Args:
136+
seq: The sequence to encode.
137+
kmer_step_size: The size of the kmer step; if None, uses self.kmer_size.
138+
139+
Returns:
140+
The list of integers that represent the sequence.
141+
"""
118142

119143
stripped_seq = "".join(s for s in seq if s not in {self.START, self.END, self.PADDING})
120144
seq_len = len(stripped_seq)
@@ -127,10 +151,11 @@ def integer_encode(self, seq: str) -> Sequence[int]:
127151
if self.kmer_size == 1:
128152
encoded_seq = self._kmer_one(stripped_seq)
129153
else:
130-
encoded_seq = self._kmer_n(stripped_seq)
154+
passed_kmer_step_size = kmer_step_size if kmer_step_size is not None else self.kmer_size
155+
encoded_seq = self._kmer_n(stripped_seq, passed_kmer_step_size)
131156

132-
if seq[0] == self.START:
133-
encoded_seq = [self.encoding_index[self.START]] + encoded_seq
157+
if seq.startswith(self.START):
158+
encoded_seq = [self.encoded_start] + encoded_seq
134159

135160
non_seq_ending = "".join(s for s in seq if s in {self.END, self.PADDING})
136161
if non_seq_ending:

gcgc/encoded_seq/encoded_seq.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,9 @@
22
# All Rights Reserved
33
"""Contains the EncodedSeq object."""
44

5-
from typing import Iterable, Sequence, Union
5+
from typing import Iterable, Union, Optional
66

77
from Bio.Seq import Seq
8-
import numpy as np
98

109
from gcgc.alphabet.base import EncodingAlphabet
1110
from gcgc.alphabet.utils import biopython_alphabet_to_gcgc_alphabet
@@ -117,10 +116,9 @@ def shift(self, offset: int) -> "EncodedSeq":
117116

118117
raise ValueError(f"Unsure how to handle {offset}.")
119118

120-
@property
121-
def integer_encoded(self):
119+
def get_integer_encoding(self, kmer_step_size: Optional[int] = None):
122120
"""Return the underlying sequence in its integer representation."""
123-
return self.alphabet.integer_encode(self)
121+
return self.alphabet.integer_encode(self, kmer_step_size)
124122

125123
@classmethod
126124
def from_integer_encoded_seq(

gcgc/ml/pytorch_utils/data.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""Objects and methods for dealing with PyTorch data."""
44

55
from pathlib import Path
6+
from typing import Optional
67
from typing import Sequence
78

89
from Bio import File
@@ -19,11 +20,17 @@
1920
class GenomicDataset(torch.utils.data.Dataset):
2021
"""GenomicDataset can be used to load sequence information into a format amenable to PyTorch."""
2122

22-
def __init__(self, file_index: File._SQLiteManySeqFilesDict, parser: TorchSequenceParser):
23+
def __init__(
24+
self,
25+
file_index: File._SQLiteManySeqFilesDict,
26+
parser: TorchSequenceParser,
27+
parsed_seq_len: Optional[int] = None,
28+
):
2329
"""Initialize the GenomicDataset object."""
2430

2531
self._file_index = file_index
2632
self._parser = parser
33+
self._parsed_seq_len = parsed_seq_len
2734

2835
super().__init__()
2936

@@ -34,10 +41,14 @@ def from_path(
3441
parser: TorchSequenceParser,
3542
file_format: str = "fasta",
3643
alphabet: EncodingAlphabet = ExtendedIUPACDNAEncoding(),
44+
index_db: str = ":memory:",
45+
parsed_seq_len: Optional[int] = None,
3746
) -> "GenomicDataset":
3847
"""Init from a single file. This is a convenience method that delegates to from_paths."""
3948

40-
return cls.from_paths([path], parser, file_format, alphabet)
49+
return cls.from_paths(
50+
[path], parser, file_format, alphabet, index_db, parsed_seq_len=parsed_seq_len
51+
)
4152

4253
@classmethod
4354
def from_paths(
@@ -47,14 +58,15 @@ def from_paths(
4758
file_format: str = "fasta",
4859
alphabet: EncodingAlphabet = ExtendedIUPACDNAEncoding(),
4960
index_db: str = ":memory:",
61+
parsed_seq_len: Optional[int] = None,
5062
**kwargs,
5163
) -> "GenomicDataset":
5264
"""Initialize the GenomicDataset from a pathlib.Path sequence."""
5365

5466
file_index = SeqIO.index_db(
5567
index_db, [str(p) for p in path_sequence], file_format, alphabet=alphabet, **kwargs
5668
)
57-
return cls(file_index, parser)
69+
return cls(file_index, parser, parsed_seq_len)
5870

5971
def __len__(self) -> int:
6072
"""Return the length of the dataset."""
@@ -69,4 +81,4 @@ def __getitem__(self, i: int):
6981
file_name = Path(self._file_index._filenames[file_number])
7082

7183
r = GCGCRecord(path=file_name, seq_record=self._file_index[key])
72-
return self._parser.parse_record(r)
84+
return self._parser.parse_record(r, self._parsed_seq_len)

gcgc/ml/pytorch_utils/parser.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""PyTorch specific parser."""
44

55
from typing import Dict
6+
from typing import Optional
67

78
import torch
89

@@ -13,11 +14,15 @@
1314
class TorchSequenceParser(SequenceParser):
1415
"""A PyTorch Sequence Parser."""
1516

16-
def parse_record(self, gcgc_record: GCGCRecord) -> Dict:
17+
def parse_record(self, gcgc_record: GCGCRecord, parsed_seq_len: Optional[int] = None) -> Dict:
1718
"""Convert the incoming SeqRecord to a dictionary of features."""
1819

19-
parsed_features = super().parse_record(gcgc_record)
20+
parsed_features = super().parse_record(gcgc_record, parsed_seq_len)
2021
parsed_features["seq_tensor"] = torch.LongTensor(parsed_features["seq_tensor"])
22+
parsed_features["seq_tensor_masked"] = torch.LongTensor(
23+
parsed_features["seq_tensor_masked"]
24+
)
25+
parsed_features["seq_len"] = torch.tensor(parsed_features["seq_len"])
2126

2227
if self.has_offset:
2328
parsed_features["offset_seq_tensor"] = torch.LongTensor(

0 commit comments

Comments
 (0)