
Commit b3e99eb

Add distributed checkpointing tests and fix pin_memory compatibility

- Add comprehensive distributed checkpointing tests (8 tests total)
  - Single and multi-GPU checkpoint save/resume for DDP and FSDP2
  - Final model save tests for inference export
  - Scheduler resume tests
- Disable pin_memory in dataloader due to PyTorch 2.9/torchdata 0.11 incompatibility
- Add checkpoint verification to multi-GPU tests
- Improve test documentation and docstrings
- Add wandb project config field to avoid hydra struct errors

Signed-off-by: Savitha Srinivasan <[email protected]>

1 parent 6aa1620 commit b3e99eb
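
The checkpoint save/resume tests described above rely on the resumable dataloader from torchdata. As a rough sketch of the round-trip such tests exercise with torchdata.stateful_dataloader.StatefulDataLoader (the toy dataset and batch size are illustrative, not the recipe's actual test code):

# Sketch of the save/resume round-trip the new checkpoint tests exercise;
# not the actual test code from this commit.
from torchdata.stateful_dataloader import StatefulDataLoader

loader = StatefulDataLoader(list(range(10)), batch_size=2)
it = iter(loader)
print(next(it))              # tensor([0, 1])
state = loader.state_dict()  # capture mid-epoch dataloader position

resumed = StatefulDataLoader(list(range(10)), batch_size=2)
resumed.load_state_dict(state)  # restore before creating the iterator
print(next(iter(resumed)))      # tensor([2, 3]) -- resumes where it left off

The key property being verified is that resumption continues mid-epoch rather than restarting the dataset from the beginning.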

File tree: 5 files changed, +757 −24 lines changed

bionemo-recipes/recipes/llama3/dataset.py
Lines changed: 10 additions & 11 deletions

@@ -14,17 +14,15 @@
 # limitations under the License.
 
 import logging
-from pathlib import Path
 
 import datasets
 import datasets.distributed
+from distributed_config import DistributedConfig
 from torch.utils.data import DistributedSampler
 from torchdata.stateful_dataloader import StatefulDataLoader
 from transformers import AutoTokenizer
 from transformers.data.data_collator import DataCollatorForLanguageModeling
 
-from distributed_config import DistributedConfig
-
 
 logger = logging.getLogger(__name__)
 
@@ -39,7 +37,7 @@ def create_tokenized_dataset(
     use_lazy_tokenization: bool = True,
 ):
     """Create a tokenized dataset with windowing.
-
+
     Args:
         distributed_config: The distributed configuration.
         tokenizer_path: Path to the nucleotide tokenizer directory.
@@ -48,7 +46,7 @@ def create_tokenized_dataset(
         stride: The stride for windowing (overlap = stride tokens).
         buffer_size: The buffer size for shuffle.
         use_lazy_tokenization: Whether to use datasets.set_transform for tokenization.
-
+
     Returns:
         Tuple of (tokenized_dataset, tokenizer).
     """
@@ -61,8 +59,10 @@
     if "train" in dataset:
         dataset = dataset["train"]
     else:
-        raise ValueError(f"Dataset has splits {list(dataset.keys())} but no 'train' split found. "
-                         "Please specify split='train' in load_dataset_kwargs or ensure your dataset has a 'train' split.")
+        raise ValueError(
+            f"Dataset has splits {list(dataset.keys())} but no 'train' split found. "
+            "Please specify split='train' in load_dataset_kwargs or ensure your dataset has a 'train' split."
+        )
 
     # Normalize column names - rename 'nt_sequence' to 'sequence' if present
     # Only do this for non-streaming datasets (streaming datasets don't have column_names attribute)
@@ -120,7 +120,7 @@ def create_bshd_dataloader(
     use_lazy_tokenization: bool = True,
 ):
     """Create a BSHD dataloader for genomic sequences using CLM (causal language modeling).
-
+
     Args:
         distributed_config: The distributed configuration.
         tokenizer_path: Path to the nucleotide tokenizer directory.
@@ -132,7 +132,7 @@
         seed: The seed to use for the distributed sampler and data collator.
         buffer_size: The buffer size for shuffle.
         use_lazy_tokenization: Whether to use datasets.set_transform for tokenization.
-
+
     Returns:
         A tuple of (dataloader, dataset_or_sampler).
     """
@@ -168,9 +168,8 @@
         batch_size=micro_batch_size,
         collate_fn=data_collator,
         num_workers=num_workers,
-        pin_memory=True,
+        pin_memory=False,  # Disabled due to PyTorch 2.9 compatibility issue with torchdata 0.11.0
         persistent_workers=num_workers > 0,
     )
 
     return train_dataloader, tokenized_dataset if sampler is None else sampler
-
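
The change above hard-codes pin_memory=False for every environment. If the incompatibility is indeed specific to the PyTorch 2.9 / torchdata 0.11.0 pairing, a version gate could keep pinning enabled elsewhere; a minimal sketch (the helper name and version checks are assumptions, not part of this commit):

# Hypothetical helper, not part of this commit: only disable pinned memory
# on the PyTorch 2.9 / torchdata 0.11 pairing reported as incompatible.
from importlib.metadata import version

import torch
from packaging.version import Version


def pin_memory_supported() -> bool:
    torch_29 = Version(torch.__version__).release[:2] == (2, 9)
    torchdata_011 = Version(version("torchdata")).release[:2] == (0, 11)
    return not (torch_29 and torchdata_011)

The dataloader construction would then pass pin_memory=pin_memory_supported() instead of a literal False.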

bionemo-recipes/recipes/llama3/hydra_config/L0_sanity.yaml
Lines changed: 1 addition & 1 deletion

@@ -27,6 +27,7 @@ dataset:
 wandb_init_args:
   name: "llama3_8B_genomic_sanity"
   mode: "offline"
+  project: null # Set to null by default, override with +wandb_init_args.project=your-project
 
 # Learning rate scheduler config
 lr_scheduler_kwargs:
@@ -41,4 +42,3 @@ checkpoint:
 
 logger:
   frequency: 1
-
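
One note on the inline comment: Hydra's + prefix appends a key that is absent from the config, and now that project is declared, a plain wandb_init_args.project=your-project override works (the + form would report that the item already exists). Leaving the default in place is also safe, since wandb.init accepts project=None and falls back to its default project resolution. A minimal sketch of forwarding the composed args, assuming the recipe passes them to wandb.init directly:

# Sketch, not the recipe's train script: forwarding the composed
# wandb_init_args mapping into wandb.init. project=None is accepted and
# falls back to wandb's default project.
import wandb
from omegaconf import OmegaConf

wandb_init_args = OmegaConf.create(
    {"name": "llama3_8B_genomic_sanity", "mode": "offline", "project": None}
)
run = wandb.init(**OmegaConf.to_container(wandb_init_args))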

bionemo-recipes/recipes/llama3/hydra_config/defaults.yaml
Lines changed: 1 addition & 3 deletions

@@ -26,6 +26,7 @@ dataset:
 # WandB config
 wandb_init_args:
   name: ???
+  project: null # Optional: set to your wandb project name
 
 # mFSDP config
 fully_shard_kwargs:
@@ -73,6 +74,3 @@ checkpoint:
 
 logger:
   frequency: 100
-
-
-
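
For reference, the "hydra struct errors" this default avoids: Hydra composes configs with OmegaConf struct mode enabled, so reading or writing a key the config does not declare raises an error instead of returning or creating it. A standalone OmegaConf sketch of the failure mode (not the recipe's config):

# Standalone illustration of an OmegaConf struct error; declaring
# `project: null` in the config schema is what makes the access legal.
from omegaconf import OmegaConf
from omegaconf.errors import ConfigAttributeError

cfg = OmegaConf.create({"wandb_init_args": {"name": "run", "mode": "offline"}})
OmegaConf.set_struct(cfg, True)  # Hydra enables struct mode on composed configs

try:
    print(cfg.wandb_init_args.project)  # key not declared -> ConfigAttributeError
except ConfigAttributeError as err:
    print(f"struct error: {err}")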
