
Commit c585cdb

Merge remote-tracking branch 'origin/savitha/llama3-recipes-dataloader-add-tokenizer' into savitha/llama3-recipes-dataloader-add-dataset
2 parents: 3562f19 + 9e3e470

2 files changed (+100 lines, -170 lines)
Lines changed: 51 additions & 170 deletions
@@ -1,8 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-Apache2
 
-"""
-Script to create the HuggingFace PreTrainedTokenizerFast for nucleotide sequences.
+"""Script to create the HuggingFace PreTrainedTokenizerFast for nucleotide sequences.
 
 This script creates a tokenizer that:
 1. Maps each character to its ord() value (ASCII encoding)
@@ -14,13 +13,11 @@
 
 import logging
 import os
-import tempfile
 
-import torch
 from tokenizers import Tokenizer, processors
 from tokenizers.models import WordLevel
 from tokenizers.pre_tokenizers import Split
-from transformers import AutoTokenizer, DataCollatorForLanguageModeling, PreTrainedTokenizerFast
+from transformers import PreTrainedTokenizerFast
 
 
 logging.basicConfig(level=logging.INFO)
@@ -30,199 +27,83 @@
 def create_nucleotide_tokenizer(
     eos_id: int = 0,
     pad_id: int = 1,
-    bos_id: int = None,
-    unk_id: int = 2,
+    bos_id: int = 2,
+    unk_id: int = 3,
 ) -> PreTrainedTokenizerFast:
-    """
-    Create a PreTrainedTokenizerFast for nucleotide sequences.
-
-    Follows NeMo ByteTokenizer convention:
-    - eos_id = 0
-    - pad_id = 1
-    - bos_id = None (optional, for causal LM we'll use 2)
-
+    """Create a PreTrainedTokenizerFast for nucleotide sequences.
+
+    Uses special token IDs for causal language modeling:
+    - BOS = 2 (beginning of sequence)
+    - EOS = 0 (end of sequence)
+    - PAD = 1 (padding)
+    - UNK = 3 (unknown)
+
     Args:
-        eos_id: End-of-sequence token ID (NeMo convention: 0)
-        pad_id: Padding token ID (NeMo convention: 1)
-        bos_id: Beginning-of-sequence token ID (None in NeMo, but we use 2 for causal LM)
-        unk_id: Unknown token ID (2 or 3)
-
+        eos_id: End-of-sequence token ID (default: 0)
+        pad_id: Padding token ID (default: 1)
+        bos_id: Beginning-of-sequence token ID (default: 2)
+        unk_id: Unknown token ID (default: 3)
+
     Returns:
         PreTrainedTokenizerFast ready to use and save
     """
-    # Define special tokens with NeMo convention
+    # Define special tokens
     special_tokens = {
+        "<BOS>": bos_id,
         "<EOS>": eos_id,
         "<PAD>": pad_id,
        "<UNK>": unk_id,
     }
-
-    if bos_id is not None:
-        special_tokens["<BOS>"] = bos_id
-
+
     # Build vocab: Map each ASCII character to its ord() value
-    # IMPORTANT: Exclude chr(0-3) to reserve those IDs for special tokens
-    vocab = {**special_tokens}
-    reserved_ids = {eos_id, pad_id, unk_id}
-    if bos_id is not None:
-        reserved_ids.add(bos_id)
-
-    for i in range(256):
-        if i not in reserved_ids:
-            char = chr(i)
-            vocab[char] = i
-
+    # IMPORTANT: Exclude reserved IDs for special tokens
+    reserved_ids = set(special_tokens.values())
+    vocab = {chr(i): i for i in range(256) if i not in reserved_ids}
+    vocab = {**vocab, **special_tokens}
+
     # Create Rust tokenizer backend with WordLevel model
     tokenizer = Tokenizer(WordLevel(vocab, unk_token="<UNK>"))
-
+
     # Configure pre-tokenizer: Split into individual characters
     tokenizer.pre_tokenizer = Split(pattern="", behavior="isolated")
-
+
     # Configure post-processor: Add BOS/EOS tokens automatically
-    if bos_id is not None:
-        tokenizer.post_processor = processors.TemplateProcessing(
-            single="<BOS> $A <EOS>",
-            pair="<BOS> $A <EOS> <BOS> $B <EOS>",
-            special_tokens=[
-                ("<BOS>", bos_id),
-                ("<EOS>", eos_id),
-            ],
-        )
-        bos_token = "<BOS>"
-    else:
-        tokenizer.post_processor = processors.TemplateProcessing(
-            single="$A <EOS>",
-            pair="$A <EOS> $B <EOS>",
-            special_tokens=[
-                ("<EOS>", eos_id),
-            ],
-        )
-        bos_token = None
-
+    tokenizer.post_processor = processors.TemplateProcessing(
+        single="<BOS> $A <EOS>",
+        pair="<BOS> $A <EOS> <BOS> $B <EOS>",
+        special_tokens=[
+            ("<BOS>", bos_id),
+            ("<EOS>", eos_id),
+        ],
+    )
+
     # Wrap in HuggingFace PreTrainedTokenizerFast
     hf_tokenizer = PreTrainedTokenizerFast(
         tokenizer_object=tokenizer,
         unk_token="<UNK>",
         pad_token="<PAD>",
         eos_token="<EOS>",
-        bos_token=bos_token,
+        bos_token="<BOS>",
     )
-
+
     return hf_tokenizer
 
 
 def main():
-    """Create and test the nucleotide tokenizer."""
-    logger.info("="*80)
-    logger.info("Creating HuggingFace PreTrainedTokenizerFast for Nucleotides")
-    logger.info("="*80)
-
-    # Create tokenizer with NeMo convention (with BOS for causal LM)
-    tokenizer = create_nucleotide_tokenizer(
-        eos_id=0,
-        pad_id=1,
-        bos_id=2,
-        unk_id=3,
-    )
-
-    logger.info("Tokenizer created")
-    logger.info(f" Vocab size: {tokenizer.vocab_size}")
-    logger.info(" Special tokens:")
-    logger.info(f" PAD: {tokenizer.pad_token} = {tokenizer.pad_token_id}")
-    logger.info(f" EOS: {tokenizer.eos_token} = {tokenizer.eos_token_id}")
-    logger.info(f" BOS: {tokenizer.bos_token} = {tokenizer.bos_token_id}")
-    logger.info(f" UNK: {tokenizer.unk_token} = {tokenizer.unk_token_id}")
-
-    # Test encoding/decoding
-    logger.info("\n" + "-"*80)
-    logger.info("Test 1: Encoding/Decoding")
-    logger.info("-"*80)
-
-    sequence = "ATCGATCG"
-    encoded = tokenizer.encode(sequence, add_special_tokens=True)
-    decoded = tokenizer.decode(encoded, skip_special_tokens=True)
-
-    logger.info(f"Original: '{sequence}'")
-    logger.info(f"Encoded: {encoded}")
-    logger.info(f"Expected: [2(BOS), 65(A), 84(T), 67(C), 71(G), 65(A), 84(T), 67(C), 71(G), 0(EOS)]")
-    logger.info(f"Decoded: '{decoded}'")
-    logger.info(f"Roundtrip successful: {sequence == decoded}")
-
-    # Test padding
-    logger.info("\n" + "-"*80)
-    logger.info("Test 2: Padding")
-    logger.info("-"*80)
-
-    batch = tokenizer(
-        ["ATCG", "ATCGATCGATCG"],
-        padding=True,
-        return_tensors="pt"
-    )
-
-    logger.info(f"Batch keys: {list(batch.keys())}")
-    logger.info(f"Input IDs shape: {batch['input_ids'].shape}")
-    logger.info(f"Input IDs:\n{batch['input_ids']}")
-    logger.info(f"Attention mask:\n{batch['attention_mask']}")
-    logger.info("Padding verified")
-
-    # Test with DataCollator
-    logger.info("\n" + "-"*80)
-    logger.info("Test 3: DataCollatorForLanguageModeling (mlm=False)")
-    logger.info("-"*80)
-
-    collator = DataCollatorForLanguageModeling(
-        tokenizer=tokenizer,
-        mlm=False,
-    )
-
-    examples = [
-        {"input_ids": batch["input_ids"][0]},
-        {"input_ids": batch["input_ids"][1]},
-    ]
-
-    collated = collator(examples)
-    logger.info(f"Collated keys: {list(collated.keys())}")
-    logger.info(f"Labels shape: {collated['labels'].shape}")
-    logger.info(f"Labels (first 20): {collated['labels'][0][:20].tolist()}")
-    logger.info("DataCollator integration verified")
-
-    # Test save/load
-    logger.info("\n" + "-"*80)
-    logger.info("Test 4: Save/Load with AutoTokenizer")
-    logger.info("-"*80)
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        save_path = os.path.join(tmpdir, "nucleotide_tokenizer")
-
-        # Save
-        tokenizer.save_pretrained(save_path)
-        logger.info(f"Saved to: {save_path}")
-        logger.info("Files created:")
-        for f in os.listdir(save_path):
-            logger.info(f" - {f}")
-
-        # Load with AutoTokenizer
-        loaded = AutoTokenizer.from_pretrained(save_path)
-        logger.info("Loaded with AutoTokenizer.from_pretrained()")
-
-        # Verify it works
-        test_seq = "ATCG"
-        test_enc = loaded.encode(test_seq, add_special_tokens=True)
-        test_dec = loaded.decode(test_enc, skip_special_tokens=True)
-        logger.info(f"Test: '{test_seq}' -> {test_enc} -> '{test_dec}'")
-        logger.info("Loaded tokenizer verified")
-
-    logger.info("\n" + "="*80)
-    logger.info("ALL TESTS PASSED")
-    logger.info("="*80)
-    logger.info("\nIntegration workflow:")
-    logger.info(" 1. Create tokenizer: tokenizer = create_nucleotide_tokenizer()")
-    logger.info(" 2. Save to directory: tokenizer.save_pretrained('./nucleotide_fast_tokenizer')")
-    logger.info(" 3. Load in training: from llama3.tokenizer import load_nucleotide_tokenizer")
-    logger.info(" 4. Use with DataCollatorForLanguageModeling for batch collation")
+    """Create and save the nucleotide tokenizer."""
+    logger.info("Creating nucleotide tokenizer")
 
+    # Create tokenizer with default settings (BOS=2, EOS=0, PAD=1, UNK=3)
+    tokenizer = create_nucleotide_tokenizer()
 
-if __name__ == "__main__":
-    main()
+    logger.info(f"Vocab size: {tokenizer.vocab_size}")
+    logger.info(f"Special tokens: BOS={tokenizer.bos_token_id}, EOS={tokenizer.eos_token_id}, PAD={tokenizer.pad_token_id}, UNK={tokenizer.unk_token_id}")
+
+    # Save to default location
+    save_path = os.path.join(os.path.dirname(__file__), "nucleotide_fast_tokenizer")
+    tokenizer.save_pretrained(save_path)
+    logger.info(f"Tokenizer saved to: {save_path}")
 
 
+if __name__ == "__main__":
+    main()
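For readers skimming the diff, the following is a minimal usage sketch of the revised create_nucleotide_tokenizer; it is not part of the commit. The file path for this script is not captured above, so the function is simply assumed to be in scope, and the expected token IDs come from the encoding check the removed main() used to log.

# Minimal sketch, assuming create_nucleotide_tokenizer from the script above is importable.
tokenizer = create_nucleotide_tokenizer()  # defaults: BOS=2, EOS=0, PAD=1, UNK=3

# Vocab covers all 256 byte values: 252 raw characters plus the 4 special tokens.
assert tokenizer.vocab_size == 256

# Each base maps to its ASCII code; the post-processor wraps the sequence in <BOS>/<EOS>.
encoded = tokenizer.encode("ATCG", add_special_tokens=True)
assert encoded == [2, 65, 84, 67, 71, 0]

# WordLevel decoding joins tokens with spaces, so strip them for a roundtrip check.
decoded = tokenizer.decode(encoded, skip_special_tokens=True)
assert decoded.replace(" ", "") == "ATCG"

# Persist for later loading with AutoTokenizer.from_pretrained(), as the new main() does.
tokenizer.save_pretrained("./nucleotide_fast_tokenizer")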

bionemo-recipes/models/llama3/tests/test_tokenizer.py

Lines changed: 49 additions & 0 deletions
@@ -59,6 +59,16 @@ def test_tokenizer_encode_without_special_tokens(tokenizer):
     assert encoded == expected
 
 
+def test_tokenizer_roundtrip_encode_decode(tokenizer):
+    """Test that encoding and decoding produces the original sequence."""
+    sequence = "ATCGATCG"
+    encoded = tokenizer.encode(sequence, add_special_tokens=True)
+    decoded = tokenizer.decode(encoded, skip_special_tokens=True)
+
+    # Decoded may have spaces between tokens, so compare without spaces
+    assert sequence == decoded.replace(" ", "")
+
+
 def test_tokenizer_nucleotide_mappings(tokenizer):
     """Test each nucleotide maps to its ASCII value."""
     # A=65, T=84, C=67, G=71
@@ -212,3 +222,42 @@ def test_short_sequences_dont_overflow(tokenizer):
     assert len(result["input_ids"][0]) == 402
 
 
+def test_bos_eos_in_overlapping_windows(tokenizer):
+    """Test that BOS/EOS tokens are added to every overlapping window.
+
+    Verifies that when using return_overflowing_tokens with add_special_tokens=True,
+    each window gets its own BOS and EOS tokens, treating each as an independent sequence.
+    This matches the behavior needed for causal language modeling training.
+    """
+    # Use a short genomic sequence that will produce exactly 2 overlapping windows
+    # With max_length=7 and stride=4, sequence of 8bp should give 2 windows
+    sequence = "ATCGATCG"  # 8bp
+
+    result = tokenizer(
+        sequence,
+        max_length=7,  # BOS + 5 content + EOS = 7 tokens total
+        stride=4,  # Overlap of 4 tokens between windows
+        truncation=True,
+        return_overflowing_tokens=True,
+        add_special_tokens=True,
+    )
+
+    # Should produce exactly 2 windows
+    num_windows = len(result["input_ids"])
+    assert num_windows >= 2, f"Should produce at least 2 overlapping windows, got {num_windows}"
+
+    first_window = result["input_ids"][0]
+    second_window = result["input_ids"][1]
+
+    # Verify both windows have BOS at start and EOS at end
+    assert first_window[0] == tokenizer.bos_token_id
+    assert first_window[-1] == tokenizer.eos_token_id
+    assert second_window[0] == tokenizer.bos_token_id
+    assert second_window[-1] == tokenizer.eos_token_id
+
+    # Verify windows are actually overlapping by checking they share some content
+    first_content = set(first_window[1:-1])
+    second_content = set(second_window[1:-1])
+    assert len(first_content & second_content) > 0
+
+
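The new overlapping-window test pairs naturally with causal-LM batch collation downstream. Below is a hedged sketch of that step, not part of the commit: it reuses the windowing arguments from the test above and mirrors the DataCollatorForLanguageModeling (mlm=False) usage that this commit removed from the script's main(). It assumes create_nucleotide_tokenizer is importable and torch is installed.

# Sketch only: overlapping windows fed into a causal-LM collator.
from transformers import DataCollatorForLanguageModeling

tokenizer = create_nucleotide_tokenizer()  # assumed importable from the script in this commit

windows = tokenizer(
    "ATCGATCG",
    max_length=7,  # <BOS> + 5 bases + <EOS>
    stride=4,  # windows overlap by 4 tokens
    truncation=True,
    return_overflowing_tokens=True,
    add_special_tokens=True,
)

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
batch = collator([{"input_ids": ids} for ids in windows["input_ids"]])
# With mlm=False, batch["labels"] copies batch["input_ids"], with <PAD> positions set to -100.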
