
Commit c585cdb

Merge remote-tracking branch 'origin/savitha/llama3-recipes-dataloader-add-tokenizer' into savitha/llama3-recipes-dataloader-add-dataset
2 parents: 3562f19 + 9e3e470

2 files changed (+100 lines, -170 lines)
Lines changed: 51 additions & 170 deletions
@@ -1,8 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-Apache2
 
-"""
-Script to create the HuggingFace PreTrainedTokenizerFast for nucleotide sequences.
+"""Script to create the HuggingFace PreTrainedTokenizerFast for nucleotide sequences.
 
 This script creates a tokenizer that:
 1. Maps each character to its ord() value (ASCII encoding)
@@ -14,13 +13,11 @@
 
 import logging
 import os
-import tempfile
 
-import torch
 from tokenizers import Tokenizer, processors
 from tokenizers.models import WordLevel
 from tokenizers.pre_tokenizers import Split
-from transformers import AutoTokenizer, DataCollatorForLanguageModeling, PreTrainedTokenizerFast
+from transformers import PreTrainedTokenizerFast
 
 
 logging.basicConfig(level=logging.INFO)
@@ -30,199 +27,83 @@
 def create_nucleotide_tokenizer(
     eos_id: int = 0,
     pad_id: int = 1,
-    bos_id: int = None,
-    unk_id: int = 2,
+    bos_id: int = 2,
+    unk_id: int = 3,
 ) -> PreTrainedTokenizerFast:
-    """
-    Create a PreTrainedTokenizerFast for nucleotide sequences.
-
-    Follows NeMo ByteTokenizer convention:
-    - eos_id = 0
-    - pad_id = 1
-    - bos_id = None (optional, for causal LM we'll use 2)
-
+    """Create a PreTrainedTokenizerFast for nucleotide sequences.
+
+    Uses special token IDs for causal language modeling:
+    - BOS = 2 (beginning of sequence)
+    - EOS = 0 (end of sequence)
+    - PAD = 1 (padding)
+    - UNK = 3 (unknown)
+
     Args:
-        eos_id: End-of-sequence token ID (NeMo convention: 0)
-        pad_id: Padding token ID (NeMo convention: 1)
-        bos_id: Beginning-of-sequence token ID (None in NeMo, but we use 2 for causal LM)
-        unk_id: Unknown token ID (2 or 3)
-
+        eos_id: End-of-sequence token ID (default: 0)
+        pad_id: Padding token ID (default: 1)
+        bos_id: Beginning-of-sequence token ID (default: 2)
+        unk_id: Unknown token ID (default: 3)
+
     Returns:
         PreTrainedTokenizerFast ready to use and save
     """
-    # Define special tokens with NeMo convention
+    # Define special tokens
     special_tokens = {
+        "<BOS>": bos_id,
         "<EOS>": eos_id,
         "<PAD>": pad_id,
        "<UNK>": unk_id,
     }
-
-    if bos_id is not None:
-        special_tokens["<BOS>"] = bos_id
-
+
     # Build vocab: Map each ASCII character to its ord() value
-    # IMPORTANT: Exclude chr(0-3) to reserve those IDs for special tokens
-    vocab = {**special_tokens}
-    reserved_ids = {eos_id, pad_id, unk_id}
-    if bos_id is not None:
-        reserved_ids.add(bos_id)
-
-    for i in range(256):
-        if i not in reserved_ids:
-            char = chr(i)
-            vocab[char] = i
-
+    # IMPORTANT: Exclude reserved IDs for special tokens
+    reserved_ids = set(special_tokens.values())
+    vocab = {chr(i): i for i in range(256) if i not in reserved_ids}
+    vocab = {**vocab, **special_tokens}
+
     # Create Rust tokenizer backend with WordLevel model
     tokenizer = Tokenizer(WordLevel(vocab, unk_token="<UNK>"))
-
+
     # Configure pre-tokenizer: Split into individual characters
     tokenizer.pre_tokenizer = Split(pattern="", behavior="isolated")
-
+
     # Configure post-processor: Add BOS/EOS tokens automatically
-    if bos_id is not None:
-        tokenizer.post_processor = processors.TemplateProcessing(
-            single="<BOS> $A <EOS>",
-            pair="<BOS> $A <EOS> <BOS> $B <EOS>",
-            special_tokens=[
-                ("<BOS>", bos_id),
-                ("<EOS>", eos_id),
-            ],
-        )
-        bos_token = "<BOS>"
-    else:
-        tokenizer.post_processor = processors.TemplateProcessing(
-            single="$A <EOS>",
-            pair="$A <EOS> $B <EOS>",
-            special_tokens=[
-                ("<EOS>", eos_id),
-            ],
-        )
-        bos_token = None
-
+    tokenizer.post_processor = processors.TemplateProcessing(
+        single="<BOS> $A <EOS>",
+        pair="<BOS> $A <EOS> <BOS> $B <EOS>",
+        special_tokens=[
+            ("<BOS>", bos_id),
+            ("<EOS>", eos_id),
+        ],
+    )
+
     # Wrap in HuggingFace PreTrainedTokenizerFast
     hf_tokenizer = PreTrainedTokenizerFast(
         tokenizer_object=tokenizer,
         unk_token="<UNK>",
         pad_token="<PAD>",
         eos_token="<EOS>",
-        bos_token=bos_token,
+        bos_token="<BOS>",
     )
-
+
     return hf_tokenizer
 
 
 def main():
-    """Create and test the nucleotide tokenizer."""
-    logger.info("="*80)
-    logger.info("Creating HuggingFace PreTrainedTokenizerFast for Nucleotides")
-    logger.info("="*80)
-
-    # Create tokenizer with NeMo convention (with BOS for causal LM)
-    tokenizer = create_nucleotide_tokenizer(
-        eos_id=0,
-        pad_id=1,
-        bos_id=2,
-        unk_id=3,
-    )
-
-    logger.info("Tokenizer created")
-    logger.info(f" Vocab size: {tokenizer.vocab_size}")
-    logger.info(" Special tokens:")
-    logger.info(f" PAD: {tokenizer.pad_token} = {tokenizer.pad_token_id}")
-    logger.info(f" EOS: {tokenizer.eos_token} = {tokenizer.eos_token_id}")
-    logger.info(f" BOS: {tokenizer.bos_token} = {tokenizer.bos_token_id}")
-    logger.info(f" UNK: {tokenizer.unk_token} = {tokenizer.unk_token_id}")
-
-    # Test encoding/decoding
-    logger.info("\n" + "-"*80)
-    logger.info("Test 1: Encoding/Decoding")
-    logger.info("-"*80)
-
-    sequence = "ATCGATCG"
-    encoded = tokenizer.encode(sequence, add_special_tokens=True)
-    decoded = tokenizer.decode(encoded, skip_special_tokens=True)
-
-    logger.info(f"Original: '{sequence}'")
-    logger.info(f"Encoded: {encoded}")
-    logger.info(f"Expected: [2(BOS), 65(A), 84(T), 67(C), 71(G), 65(A), 84(T), 67(C), 71(G), 0(EOS)]")
-    logger.info(f"Decoded: '{decoded}'")
-    logger.info(f"Roundtrip successful: {sequence == decoded}")
-
-    # Test padding
-    logger.info("\n" + "-"*80)
-    logger.info("Test 2: Padding")
-    logger.info("-"*80)
-
-    batch = tokenizer(
-        ["ATCG", "ATCGATCGATCG"],
-        padding=True,
-        return_tensors="pt"
-    )
-
-    logger.info(f"Batch keys: {list(batch.keys())}")
-    logger.info(f"Input IDs shape: {batch['input_ids'].shape}")
-    logger.info(f"Input IDs:\n{batch['input_ids']}")
-    logger.info(f"Attention mask:\n{batch['attention_mask']}")
-    logger.info("Padding verified")
-
-    # Test with DataCollator
-    logger.info("\n" + "-"*80)
-    logger.info("Test 3: DataCollatorForLanguageModeling (mlm=False)")
-    logger.info("-"*80)
-
-    collator = DataCollatorForLanguageModeling(
-        tokenizer=tokenizer,
-        mlm=False,
-    )
-
-    examples = [
-        {"input_ids": batch["input_ids"][0]},
-        {"input_ids": batch["input_ids"][1]},
-    ]
-
-    collated = collator(examples)
-    logger.info(f"Collated keys: {list(collated.keys())}")
-    logger.info(f"Labels shape: {collated['labels'].shape}")
-    logger.info(f"Labels (first 20): {collated['labels'][0][:20].tolist()}")
-    logger.info("DataCollator integration verified")
-
-    # Test save/load
-    logger.info("\n" + "-"*80)
-    logger.info("Test 4: Save/Load with AutoTokenizer")
-    logger.info("-"*80)
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        save_path = os.path.join(tmpdir, "nucleotide_tokenizer")
-
-        # Save
-        tokenizer.save_pretrained(save_path)
-        logger.info(f"Saved to: {save_path}")
-        logger.info("Files created:")
-        for f in os.listdir(save_path):
-            logger.info(f" - {f}")
-
-        # Load with AutoTokenizer
-        loaded = AutoTokenizer.from_pretrained(save_path)
-        logger.info("Loaded with AutoTokenizer.from_pretrained()")
-
-        # Verify it works
-        test_seq = "ATCG"
-        test_enc = loaded.encode(test_seq, add_special_tokens=True)
-        test_dec = loaded.decode(test_enc, skip_special_tokens=True)
-        logger.info(f"Test: '{test_seq}' -> {test_enc} -> '{test_dec}'")
-        logger.info("Loaded tokenizer verified")
-
-    logger.info("\n" + "="*80)
-    logger.info("ALL TESTS PASSED")
-    logger.info("="*80)
-    logger.info("\nIntegration workflow:")
-    logger.info(" 1. Create tokenizer: tokenizer = create_nucleotide_tokenizer()")
-    logger.info(" 2. Save to directory: tokenizer.save_pretrained('./nucleotide_fast_tokenizer')")
-    logger.info(" 3. Load in training: from llama3.tokenizer import load_nucleotide_tokenizer")
-    logger.info(" 4. Use with DataCollatorForLanguageModeling for batch collation")
+    """Create and save the nucleotide tokenizer."""
+    logger.info("Creating nucleotide tokenizer")
 
+    # Create tokenizer with default settings (BOS=2, EOS=0, PAD=1, UNK=3)
+    tokenizer = create_nucleotide_tokenizer()
 
-if __name__ == "__main__":
-    main()
+    logger.info(f"Vocab size: {tokenizer.vocab_size}")
+    logger.info(f"Special tokens: BOS={tokenizer.bos_token_id}, EOS={tokenizer.eos_token_id}, PAD={tokenizer.pad_token_id}, UNK={tokenizer.unk_token_id}")
+
+    # Save to default location
+    save_path = os.path.join(os.path.dirname(__file__), "nucleotide_fast_tokenizer")
+    tokenizer.save_pretrained(save_path)
+    logger.info(f"Tokenizer saved to: {save_path}")
 
 
+if __name__ == "__main__":
+    main()
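For readers skimming the diff, the following is a minimal usage sketch of the revised create_nucleotide_tokenizer; it is not part of the commit. The file path for this script is not captured above, so the function is simply assumed to be in scope, and the expected token IDs come from the encoding check the removed main() used to log.

# Minimal sketch, assuming create_nucleotide_tokenizer from the script above is importable.
tokenizer = create_nucleotide_tokenizer()  # defaults: BOS=2, EOS=0, PAD=1, UNK=3

# Vocab covers all 256 byte values: 252 raw characters plus the 4 special tokens.
assert tokenizer.vocab_size == 256

# Each base maps to its ASCII code; the post-processor wraps the sequence in <BOS>/<EOS>.
encoded = tokenizer.encode("ATCG", add_special_tokens=True)
assert encoded == [2, 65, 84, 67, 71, 0]

# WordLevel decoding joins tokens with spaces, so strip them for a roundtrip check.
decoded = tokenizer.decode(encoded, skip_special_tokens=True)
assert decoded.replace(" ", "") == "ATCG"

# Persist for later loading with AutoTokenizer.from_pretrained(), as the new main() does.
tokenizer.save_pretrained("./nucleotide_fast_tokenizer")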

bionemo-recipes/models/llama3/tests/test_tokenizer.py

Lines changed: 49 additions & 0 deletions
@@ -59,6 +59,16 @@ def test_tokenizer_encode_without_special_tokens(tokenizer):
     assert encoded == expected
 
 
+def test_tokenizer_roundtrip_encode_decode(tokenizer):
+    """Test that encoding and decoding produces the original sequence."""
+    sequence = "ATCGATCG"
+    encoded = tokenizer.encode(sequence, add_special_tokens=True)
+    decoded = tokenizer.decode(encoded, skip_special_tokens=True)
+
+    # Decoded may have spaces between tokens, so compare without spaces
+    assert sequence == decoded.replace(" ", "")
+
+
 def test_tokenizer_nucleotide_mappings(tokenizer):
     """Test each nucleotide maps to its ASCII value."""
     # A=65, T=84, C=67, G=71
@@ -212,3 +222,42 @@ def test_short_sequences_dont_overflow(tokenizer):
     assert len(result["input_ids"][0]) == 402
 
 
+def test_bos_eos_in_overlapping_windows(tokenizer):
+    """Test that BOS/EOS tokens are added to every overlapping window.
+
+    Verifies that when using return_overflowing_tokens with add_special_tokens=True,
+    each window gets its own BOS and EOS tokens, treating each as an independent sequence.
+    This matches the behavior needed for causal language modeling training.
+    """
+    # Use a short genomic sequence that will produce exactly 2 overlapping windows
+    # With max_length=7 and stride=4, sequence of 8bp should give 2 windows
+    sequence = "ATCGATCG"  # 8bp
+
+    result = tokenizer(
+        sequence,
+        max_length=7,  # BOS + 5 content + EOS = 7 tokens total
+        stride=4,  # Overlap of 4 tokens between windows
+        truncation=True,
+        return_overflowing_tokens=True,
+        add_special_tokens=True,
+    )
+
+    # Should produce exactly 2 windows
+    num_windows = len(result["input_ids"])
+    assert num_windows >= 2, f"Should produce at least 2 overlapping windows, got {num_windows}"
+
+    first_window = result["input_ids"][0]
+    second_window = result["input_ids"][1]
+
+    # Verify both windows have BOS at start and EOS at end
+    assert first_window[0] == tokenizer.bos_token_id
+    assert first_window[-1] == tokenizer.eos_token_id
+    assert second_window[0] == tokenizer.bos_token_id
+    assert second_window[-1] == tokenizer.eos_token_id
+
+    # Verify windows are actually overlapping by checking they share some content
+    first_content = set(first_window[1:-1])
+    second_content = set(second_window[1:-1])
+    assert len(first_content & second_content) > 0
+
+
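The new overlapping-window test pairs naturally with causal-LM batch collation downstream. Below is a hedged sketch of that step, not part of the commit: it reuses the windowing arguments from the test above and mirrors the DataCollatorForLanguageModeling (mlm=False) usage that this commit removed from the script's main(). It assumes create_nucleotide_tokenizer is importable and torch is installed.

# Sketch only: overlapping windows fed into a causal-LM collator.
from transformers import DataCollatorForLanguageModeling

tokenizer = create_nucleotide_tokenizer()  # assumed importable from the script in this commit

windows = tokenizer(
    "ATCGATCG",
    max_length=7,  # <BOS> + 5 bases + <EOS>
    stride=4,  # windows overlap by 4 tokens
    truncation=True,
    return_overflowing_tokens=True,
    add_special_tokens=True,
)

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
batch = collator([{"input_ids": ids} for ids in windows["input_ids"]])
# With mlm=False, batch["labels"] copies batch["input_ids"], with <PAD> positions set to -100.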
