|
1 | 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
2 | 2 | # SPDX-License-Identifier: LicenseRef-Apache2 |
3 | 3 |
|
4 | | -""" |
5 | | -Script to create the HuggingFace PreTrainedTokenizerFast for nucleotide sequences. |
| 4 | +"""Script to create the HuggingFace PreTrainedTokenizerFast for nucleotide sequences. |
6 | 5 |
|
7 | 6 | This script creates a tokenizer that: |
8 | 7 | 1. Maps each character to its ord() value (ASCII encoding) |
|
14 | 13 |
|
15 | 14 | import logging |
16 | 15 | import os |
17 | | -import tempfile |
18 | 16 |
|
19 | | -import torch |
20 | 17 | from tokenizers import Tokenizer, processors |
21 | 18 | from tokenizers.models import WordLevel |
22 | 19 | from tokenizers.pre_tokenizers import Split |
23 | | -from transformers import AutoTokenizer, DataCollatorForLanguageModeling, PreTrainedTokenizerFast |
| 20 | +from transformers import PreTrainedTokenizerFast |
24 | 21 |
|
25 | 22 |
|
26 | 23 | logging.basicConfig(level=logging.INFO) |
|
30 | 27 | def create_nucleotide_tokenizer( |
31 | 28 | eos_id: int = 0, |
32 | 29 | pad_id: int = 1, |
33 | | - bos_id: int = None, |
34 | | - unk_id: int = 2, |
| 30 | + bos_id: int = 2, |
| 31 | + unk_id: int = 3, |
35 | 32 | ) -> PreTrainedTokenizerFast: |
36 | | - """ |
37 | | - Create a PreTrainedTokenizerFast for nucleotide sequences. |
38 | | - |
39 | | - Follows NeMo ByteTokenizer convention: |
40 | | - - eos_id = 0 |
41 | | - - pad_id = 1 |
42 | | - - bos_id = None (optional, for causal LM we'll use 2) |
43 | | - |
| 33 | + """Create a PreTrainedTokenizerFast for nucleotide sequences. |
| 34 | +
|
| 35 | + Uses special token IDs for causal language modeling: |
| 36 | + - BOS = 2 (beginning of sequence) |
| 37 | + - EOS = 0 (end of sequence) |
| 38 | + - PAD = 1 (padding) |
| 39 | + - UNK = 3 (unknown) |
| 40 | +
|
44 | 41 | Args: |
45 | | - eos_id: End-of-sequence token ID (NeMo convention: 0) |
46 | | - pad_id: Padding token ID (NeMo convention: 1) |
47 | | - bos_id: Beginning-of-sequence token ID (None in NeMo, but we use 2 for causal LM) |
48 | | - unk_id: Unknown token ID (2 or 3) |
49 | | - |
| 42 | + eos_id: End-of-sequence token ID (default: 0) |
| 43 | + pad_id: Padding token ID (default: 1) |
| 44 | + bos_id: Beginning-of-sequence token ID (default: 2) |
| 45 | + unk_id: Unknown token ID (default: 3) |
| 46 | +
|
50 | 47 | Returns: |
51 | 48 | PreTrainedTokenizerFast ready to use and save |
52 | 49 | """ |
53 | | - # Define special tokens with NeMo convention |
| 50 | + # Define special tokens |
54 | 51 | special_tokens = { |
| 52 | + "<BOS>": bos_id, |
55 | 53 | "<EOS>": eos_id, |
56 | 54 | "<PAD>": pad_id, |
57 | 55 | "<UNK>": unk_id, |
58 | 56 | } |
59 | | - |
60 | | - if bos_id is not None: |
61 | | - special_tokens["<BOS>"] = bos_id |
62 | | - |
| 57 | + |
63 | 58 | # Build vocab: Map each ASCII character to its ord() value |
64 | | - # IMPORTANT: Exclude chr(0-3) to reserve those IDs for special tokens |
65 | | - vocab = {**special_tokens} |
66 | | - reserved_ids = {eos_id, pad_id, unk_id} |
67 | | - if bos_id is not None: |
68 | | - reserved_ids.add(bos_id) |
69 | | - |
70 | | - for i in range(256): |
71 | | - if i not in reserved_ids: |
72 | | - char = chr(i) |
73 | | - vocab[char] = i |
74 | | - |
| 59 | + # IMPORTANT: skip characters whose ord() value collides with an ID reserved for a special token
| 60 | + reserved_ids = set(special_tokens.values()) |
| 61 | + vocab = {chr(i): i for i in range(256) if i not in reserved_ids} |
| 62 | + vocab = {**vocab, **special_tokens} |
| 63 | + |
75 | 64 | # Create Rust tokenizer backend with WordLevel model |
76 | 65 | tokenizer = Tokenizer(WordLevel(vocab, unk_token="<UNK>")) |
77 | | - |
| 66 | + |
78 | 67 | # Configure pre-tokenizer: Split into individual characters |
79 | 68 | tokenizer.pre_tokenizer = Split(pattern="", behavior="isolated") |
80 | | - |
| 69 | + |
81 | 70 | # Configure post-processor: Add BOS/EOS tokens automatically |
82 | | - if bos_id is not None: |
83 | | - tokenizer.post_processor = processors.TemplateProcessing( |
84 | | - single="<BOS> $A <EOS>", |
85 | | - pair="<BOS> $A <EOS> <BOS> $B <EOS>", |
86 | | - special_tokens=[ |
87 | | - ("<BOS>", bos_id), |
88 | | - ("<EOS>", eos_id), |
89 | | - ], |
90 | | - ) |
91 | | - bos_token = "<BOS>" |
92 | | - else: |
93 | | - tokenizer.post_processor = processors.TemplateProcessing( |
94 | | - single="$A <EOS>", |
95 | | - pair="$A <EOS> $B <EOS>", |
96 | | - special_tokens=[ |
97 | | - ("<EOS>", eos_id), |
98 | | - ], |
99 | | - ) |
100 | | - bos_token = None |
101 | | - |
| 71 | + tokenizer.post_processor = processors.TemplateProcessing( |
| 72 | + single="<BOS> $A <EOS>", |
| 73 | + pair="<BOS> $A <EOS> <BOS> $B <EOS>", |
| 74 | + special_tokens=[ |
| 75 | + ("<BOS>", bos_id), |
| 76 | + ("<EOS>", eos_id), |
| 77 | + ], |
| 78 | + ) |
| 79 | + |
102 | 80 | # Wrap in HuggingFace PreTrainedTokenizerFast |
103 | 81 | hf_tokenizer = PreTrainedTokenizerFast( |
104 | 82 | tokenizer_object=tokenizer, |
105 | 83 | unk_token="<UNK>", |
106 | 84 | pad_token="<PAD>", |
107 | 85 | eos_token="<EOS>", |
108 | | - bos_token=bos_token, |
| 86 | + bos_token="<BOS>", |
109 | 87 | ) |
110 | | - |
| 88 | + |
111 | 89 | return hf_tokenizer |
112 | 90 |
|
113 | 91 |
|
114 | 92 | def main(): |
115 | | - """Create and test the nucleotide tokenizer.""" |
116 | | - logger.info("="*80) |
117 | | - logger.info("Creating HuggingFace PreTrainedTokenizerFast for Nucleotides") |
118 | | - logger.info("="*80) |
119 | | - |
120 | | - # Create tokenizer with NeMo convention (with BOS for causal LM) |
121 | | - tokenizer = create_nucleotide_tokenizer( |
122 | | - eos_id=0, |
123 | | - pad_id=1, |
124 | | - bos_id=2, |
125 | | - unk_id=3, |
126 | | - ) |
127 | | - |
128 | | - logger.info("Tokenizer created") |
129 | | - logger.info(f" Vocab size: {tokenizer.vocab_size}") |
130 | | - logger.info(" Special tokens:") |
131 | | - logger.info(f" PAD: {tokenizer.pad_token} = {tokenizer.pad_token_id}") |
132 | | - logger.info(f" EOS: {tokenizer.eos_token} = {tokenizer.eos_token_id}") |
133 | | - logger.info(f" BOS: {tokenizer.bos_token} = {tokenizer.bos_token_id}") |
134 | | - logger.info(f" UNK: {tokenizer.unk_token} = {tokenizer.unk_token_id}") |
135 | | - |
136 | | - # Test encoding/decoding |
137 | | - logger.info("\n" + "-"*80) |
138 | | - logger.info("Test 1: Encoding/Decoding") |
139 | | - logger.info("-"*80) |
140 | | - |
141 | | - sequence = "ATCGATCG" |
142 | | - encoded = tokenizer.encode(sequence, add_special_tokens=True) |
143 | | - decoded = tokenizer.decode(encoded, skip_special_tokens=True) |
144 | | - |
145 | | - logger.info(f"Original: '{sequence}'") |
146 | | - logger.info(f"Encoded: {encoded}") |
147 | | - logger.info(f"Expected: [2(BOS), 65(A), 84(T), 67(C), 71(G), 65(A), 84(T), 67(C), 71(G), 0(EOS)]") |
148 | | - logger.info(f"Decoded: '{decoded}'") |
149 | | - logger.info(f"Roundtrip successful: {sequence == decoded}") |
150 | | - |
151 | | - # Test padding |
152 | | - logger.info("\n" + "-"*80) |
153 | | - logger.info("Test 2: Padding") |
154 | | - logger.info("-"*80) |
155 | | - |
156 | | - batch = tokenizer( |
157 | | - ["ATCG", "ATCGATCGATCG"], |
158 | | - padding=True, |
159 | | - return_tensors="pt" |
160 | | - ) |
161 | | - |
162 | | - logger.info(f"Batch keys: {list(batch.keys())}") |
163 | | - logger.info(f"Input IDs shape: {batch['input_ids'].shape}") |
164 | | - logger.info(f"Input IDs:\n{batch['input_ids']}") |
165 | | - logger.info(f"Attention mask:\n{batch['attention_mask']}") |
166 | | - logger.info("Padding verified") |
167 | | - |
168 | | - # Test with DataCollator |
169 | | - logger.info("\n" + "-"*80) |
170 | | - logger.info("Test 3: DataCollatorForLanguageModeling (mlm=False)") |
171 | | - logger.info("-"*80) |
172 | | - |
173 | | - collator = DataCollatorForLanguageModeling( |
174 | | - tokenizer=tokenizer, |
175 | | - mlm=False, |
176 | | - ) |
177 | | - |
178 | | - examples = [ |
179 | | - {"input_ids": batch["input_ids"][0]}, |
180 | | - {"input_ids": batch["input_ids"][1]}, |
181 | | - ] |
182 | | - |
183 | | - collated = collator(examples) |
184 | | - logger.info(f"Collated keys: {list(collated.keys())}") |
185 | | - logger.info(f"Labels shape: {collated['labels'].shape}") |
186 | | - logger.info(f"Labels (first 20): {collated['labels'][0][:20].tolist()}") |
187 | | - logger.info("DataCollator integration verified") |
188 | | - |
189 | | - # Test save/load |
190 | | - logger.info("\n" + "-"*80) |
191 | | - logger.info("Test 4: Save/Load with AutoTokenizer") |
192 | | - logger.info("-"*80) |
193 | | - |
194 | | - with tempfile.TemporaryDirectory() as tmpdir: |
195 | | - save_path = os.path.join(tmpdir, "nucleotide_tokenizer") |
196 | | - |
197 | | - # Save |
198 | | - tokenizer.save_pretrained(save_path) |
199 | | - logger.info(f"Saved to: {save_path}") |
200 | | - logger.info("Files created:") |
201 | | - for f in os.listdir(save_path): |
202 | | - logger.info(f" - {f}") |
203 | | - |
204 | | - # Load with AutoTokenizer |
205 | | - loaded = AutoTokenizer.from_pretrained(save_path) |
206 | | - logger.info("Loaded with AutoTokenizer.from_pretrained()") |
207 | | - |
208 | | - # Verify it works |
209 | | - test_seq = "ATCG" |
210 | | - test_enc = loaded.encode(test_seq, add_special_tokens=True) |
211 | | - test_dec = loaded.decode(test_enc, skip_special_tokens=True) |
212 | | - logger.info(f"Test: '{test_seq}' -> {test_enc} -> '{test_dec}'") |
213 | | - logger.info("Loaded tokenizer verified") |
214 | | - |
215 | | - logger.info("\n" + "="*80) |
216 | | - logger.info("ALL TESTS PASSED") |
217 | | - logger.info("="*80) |
218 | | - logger.info("\nIntegration workflow:") |
219 | | - logger.info(" 1. Create tokenizer: tokenizer = create_nucleotide_tokenizer()") |
220 | | - logger.info(" 2. Save to directory: tokenizer.save_pretrained('./nucleotide_fast_tokenizer')") |
221 | | - logger.info(" 3. Load in training: from llama3.tokenizer import load_nucleotide_tokenizer") |
222 | | - logger.info(" 4. Use with DataCollatorForLanguageModeling for batch collation") |
| 93 | + """Create and save the nucleotide tokenizer.""" |
| 94 | + logger.info("Creating nucleotide tokenizer") |
223 | 95 |
|
| 96 | + # Create tokenizer with default settings (BOS=2, EOS=0, PAD=1, UNK=3) |
| 97 | + tokenizer = create_nucleotide_tokenizer() |
224 | 98 |
|
225 | | -if __name__ == "__main__": |
226 | | - main() |
| 99 | + logger.info(f"Vocab size: {tokenizer.vocab_size}") |
| 100 | + logger.info(f"Special tokens: BOS={tokenizer.bos_token_id}, EOS={tokenizer.eos_token_id}, PAD={tokenizer.pad_token_id}, UNK={tokenizer.unk_token_id}") |
| 101 | + |
| 102 | + # Save to default location |
| 103 | + save_path = os.path.join(os.path.dirname(__file__), "nucleotide_fast_tokenizer") |
| 104 | + tokenizer.save_pretrained(save_path) |
| 105 | + logger.info(f"Tokenizer saved to: {save_path}") |
227 | 106 |
|
228 | 107 |
|
| 108 | +if __name__ == "__main__": |
| 109 | + main() |
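
For quick reference (not part of the commit), a minimal usage sketch of the resulting artifact. It assumes main() has already been run so that the nucleotide_fast_tokenizer directory exists next to the script, and it loads that directory with AutoTokenizer, as the removed test code did; the path and variable names here are illustrative:

    from transformers import AutoTokenizer

    # Load the directory written by main() (path is an assumption; adjust to where it was saved)
    tok = AutoTokenizer.from_pretrained("./nucleotide_fast_tokenizer")

    # Each base maps to its ord() value; the post-processor wraps the sequence as <BOS> ... <EOS>
    ids = tok.encode("ATCG", add_special_tokens=True)
    print(ids)  # expected: [2, 65, 84, 67, 71, 0]  ->  <BOS> A T C G <EOS>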