# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-Apache2
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from pathlib import Path

import datasets
import datasets.distributed
from torch.utils.data import DistributedSampler
from torchdata.stateful_dataloader import StatefulDataLoader
from transformers import AutoTokenizer
from transformers.data.data_collator import DataCollatorForLanguageModeling

from distributed_config import DistributedConfig


logger = logging.getLogger(__name__)


def create_tokenized_dataset(
    distributed_config: DistributedConfig,
    tokenizer_path: str,
    load_dataset_kwargs: dict,
    max_seq_length: int = 8192,
    stride: int = 200,
    buffer_size: int = 500_000,
    use_lazy_tokenization: bool = True,
):
    """Create a tokenized dataset with windowing.

    Args:
        distributed_config: The distributed configuration.
        tokenizer_path: Path to the nucleotide tokenizer directory.
        load_dataset_kwargs: Keyword arguments to pass to `load_dataset`.
        max_seq_length: The maximum length of sequences (window size).
        stride: The stride for windowing (overlap = stride tokens).
        buffer_size: The buffer size for shuffle.
        use_lazy_tokenization: Whether to tokenize non-streaming datasets lazily via `Dataset.with_transform`.

    Returns:
        Tuple of (tokenized_dataset, tokenizer).
    """
    logger.info(f"Loading dataset with kwargs: {load_dataset_kwargs}")
    dataset = datasets.load_dataset(**load_dataset_kwargs)
    logger.info(f"Loaded dataset: {dataset}")

    # Handle DatasetDict (extract "train" split if present)
    if isinstance(dataset, (datasets.DatasetDict, datasets.IterableDatasetDict)):
        if "train" in dataset:
            dataset = dataset["train"]
        else:
            raise ValueError(
                f"Dataset has splits {list(dataset.keys())} but no 'train' split found. "
                "Please specify split='train' in load_dataset_kwargs or ensure your dataset has a 'train' split."
            )

    # Normalize column names - rename 'nt_sequence' to 'sequence' if present.
    # Only do this when column names are known up front (streaming datasets may
    # report column_names as None).
    if hasattr(dataset, "column_names") and dataset.column_names is not None:
        if "nt_sequence" in dataset.column_names and "sequence" not in dataset.column_names:
            logger.info("Renaming column 'nt_sequence' to 'sequence' for consistency")
            dataset = dataset.rename_column("nt_sequence", "sequence")

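    # For streaming (IterableDataset) inputs, shard the stream across ranks here;
    # map-style datasets are sharded later by DistributedSampler instead. The shuffle
    # below is an approximate, buffer-based shuffle over `buffer_size` examples.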
    if isinstance(dataset, datasets.IterableDataset):
        dataset = datasets.distributed.split_dataset_by_node(
            dataset,
            rank=distributed_config.rank,
            world_size=distributed_config.world_size,
        )
        dataset = dataset.shuffle(seed=42, buffer_size=buffer_size)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    def tokenize_with_windowing(examples):
        """Tokenize nucleotide sequences with windowing (one-to-many mapping)."""
        # Tokenize with windowing using return_overflowing_tokens
        result = tokenizer(
            examples["sequence"],
            max_length=max_seq_length,
            stride=stride,
            truncation=True,
            return_overflowing_tokens=True,
            add_special_tokens=True,
        )
        return result
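
    # Illustrative example (numbers are hypothetical, not from this dataset): with
    # max_seq_length=8192 and stride=200, return_overflowing_tokens splits a single
    # 16,000-token sequence into overlapping windows of up to 8192 tokens, each new
    # window repeating the last `stride` tokens of the previous one, so one input row
    # can map to several tokenized rows.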

    if isinstance(dataset, datasets.Dataset) and use_lazy_tokenization:
        # dataset.map on a non-streaming dataset would eagerly tokenize everything and
        # cache the result; with_transform tokenizes lazily at access time instead.
        tokenized_dataset = dataset.with_transform(tokenize_with_windowing)
    else:
        tokenized_dataset = dataset.map(
            tokenize_with_windowing,
            batched=True,
            remove_columns=dataset.column_names,
        )

    return tokenized_dataset, tokenizer


def create_bshd_dataloader(
    distributed_config: DistributedConfig,
    tokenizer_path: str,
    load_dataset_kwargs: dict,
    micro_batch_size: int,
    num_workers: int = 0,
    max_seq_length: int = 8192,
    stride: int = 200,
    seed: int = 42,
    buffer_size: int = 500_000,
    use_lazy_tokenization: bool = True,
):
    """Create a BSHD dataloader for genomic sequences using CLM (causal language modeling).

    Args:
        distributed_config: The distributed configuration.
        tokenizer_path: Path to the nucleotide tokenizer directory.
        load_dataset_kwargs: Keyword arguments to pass to `load_dataset`.
        micro_batch_size: The batch size per device.
        num_workers: The number of workers to use for the dataloader.
        max_seq_length: The maximum length of sequences (window size).
        stride: The stride for windowing (overlap = stride tokens).
        seed: The seed to use for the distributed sampler.
        buffer_size: The buffer size for shuffle.
        use_lazy_tokenization: Whether to tokenize non-streaming datasets lazily via `Dataset.with_transform`.

    Returns:
        A tuple of (dataloader, dataset_or_sampler).
    """
    tokenized_dataset, tokenizer = create_tokenized_dataset(
        distributed_config=distributed_config,
        tokenizer_path=tokenizer_path,
        load_dataset_kwargs=load_dataset_kwargs,
        max_seq_length=max_seq_length,
        stride=stride,
        buffer_size=buffer_size,
        use_lazy_tokenization=use_lazy_tokenization,
    )

    if isinstance(tokenized_dataset, datasets.IterableDataset):
        sampler = None
    else:
        sampler = DistributedSampler(
            tokenized_dataset,
            rank=distributed_config.rank,
            num_replicas=distributed_config.world_size,
            seed=seed,
        )

    # Use DataCollatorForLanguageModeling with mlm=False for CLM.
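    # With mlm=False, the collator pads each batch and sets labels to a copy of
    # input_ids with padding positions replaced by -100, which is the target format
    # expected for a causal language modeling loss.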
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,  # Causal language modeling (no masking)
    )

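    # StatefulDataLoader (from torchdata) behaves like a regular DataLoader but also
    # exposes state_dict()/load_state_dict(), so the loader position can be saved in
    # a checkpoint and training can resume mid-epoch.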
    train_dataloader = StatefulDataLoader(
        tokenized_dataset,
        sampler=sampler,
        batch_size=micro_batch_size,
        collate_fn=data_collator,
        num_workers=num_workers,
        pin_memory=True,
        persistent_workers=num_workers > 0,
    )

    return train_dataloader, (tokenized_dataset if sampler is None else sampler)
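

# Minimal usage sketch, not part of the module's API: everything below is an
# illustrative example. The dataset name, tokenizer path, and the assumption that
# DistributedConfig accepts rank/world_size keyword arguments are hypothetical;
# adjust them to the actual training setup.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Assumed single-process configuration; the real DistributedConfig constructor
    # may differ (e.g., it may read torch.distributed state or environment variables).
    dist_config = DistributedConfig(rank=0, world_size=1)

    dataloader, dataset_or_sampler = create_bshd_dataloader(
        distributed_config=dist_config,
        tokenizer_path="path/to/nucleotide_tokenizer",  # hypothetical path
        load_dataset_kwargs={
            "path": "parquet",  # hypothetical: local parquet shards with a 'sequence' column
            "data_files": "data/*.parquet",
            "streaming": True,
        },
        micro_batch_size=2,
        num_workers=0,
        max_seq_length=8192,
        stride=200,
    )

    # Inspect the first batch: the collator yields input_ids, attention_mask, and
    # labels, each shaped [micro_batch_size, seq_len].
    batch = next(iter(dataloader))
    print({k: tuple(v.shape) for k, v in batch.items()})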