Skip to content

Commit

Permalink
add initial files
Browse files Browse the repository at this point in the history
Signed-off-by: Peter St. John <[email protected]>
  • Loading branch information
pstjohn committed Jan 22, 2025
1 parent cbee135 commit 5d404f4
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@
# limitations under the License.


from pytest import fixture, mark, raises
import transformers
from nemo.lightning.io import IOMixin


def test_todo() -> None:
    """Placeholder test that fails loudly until real tests are implemented."""
    message = (
        f"Implement tests! Make use of {fixture} for data, {raises} to check for "
        f"exceptional cases, and {mark} as needed"
    )
    raise ValueError(message)
class BioNeMoAMPLIFYTokenizer(transformers.PreTrainedTokenizerFast, IOMixin):  # noqa D101
    def __init__(self):
        """A wrapper to make AutoTokenizer serializable for the AMPLIFY tokenizer."""
        # Pull the pretrained AMPLIFY tokenizer from the HF hub, then copy its state
        # onto this instance so it behaves identically while being IOMixin-serializable.
        other = transformers.AutoTokenizer.from_pretrained("chandar-lab/AMPLIFY_350M", use_fast=True)
        # dict.update already copies key/value pairs; no intermediate dict(...) needed.
        self.__dict__.update(other.__dict__)
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,24 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pytest
import torch
from nemo.lightning import io

from bionemo.amplify.tokenizer import BioNeMoAMPLIFYTokenizer


@pytest.fixture
def tokenizer():
    """Provide a freshly constructed AMPLIFY tokenizer for each test."""
    yield BioNeMoAMPLIFYTokenizer()


def test_tokenizer_serialization(tokenizer, tmp_path):
    """Round-trip the tokenizer through NeMo's IO serialization and check it still encodes correctly."""
    # BioNeMoAMPLIFYTokenizer takes no __init__ arguments, so there are no yaml attrs to dump.
    tokenizer.io_dump(tmp_path / "tokenizer", yaml_attrs=[])
    deserialized_tokenizer = io.load(tmp_path / "tokenizer", tokenizer.__class__)

    our_tokens = deserialized_tokenizer.encode("K A <mask> I S Q", add_special_tokens=False)
    # Expected AMPLIFY vocabulary ids for the sequence above — TODO confirm against the
    # chandar-lab/AMPLIFY_350M tokenizer config if the upstream vocab ever changes.
    expected_tokens = torch.tensor([15, 5, 32, 12, 8, 16])
    torch.testing.assert_close(torch.tensor(our_tokens), expected_tokens)

0 comments on commit 5d404f4

Please sign in to comment.