Merged
45 changes: 39 additions & 6 deletions modelopt/torch/utils/plugins/megatron_preprocess_data.py
@@ -15,9 +15,12 @@

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.

-"""Processing large data to tokenize for pretraining.
+"""Processing large pretraining and post-training datasets to tokenize for use in Megatron pretraining scripts.

+The tokenizer's chat_template is applied when a JSON key's value is a list of message dicts
+(e.g. Nemotron-Post-Training-Dataset-v2), so such data can also be tokenized for Megatron pretraining scripts.
+
-Usage to tokenize one or more JSONL files:
+Usage to tokenize one or more JSONL files (pretraining, ``text`` key):

```bash
python -m modelopt.torch.utils.plugins.megatron_preprocess_data \
@@ -37,6 +40,21 @@
--tokenizer Qwen/Qwen3-0.6B
```

Usage to tokenize a post-training dataset with ``messages`` key (chat format):

```bash
python -m modelopt.torch.utils.plugins.megatron_preprocess_data \
--jsonl_paths path/to/sft_data.jsonl \
--json_keys messages \
--output_dir /path/to/tokenized/Qwen3/ \
--tokenizer Qwen/Qwen3-0.6B
```
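For reference, a post-training JSONL record with a ``messages`` key could look like the following minimal sketch (the record content here is illustrative, not taken from any real dataset):

```python
import json

# Hypothetical sample record for sft_data.jsonl: one JSON object per line,
# with a "messages" key holding a list of role/content dicts. An optional
# "tools" key, when present, is forwarded to the chat template.
record = {
    "messages": [
        {"role": "user", "content": "What is 2 + 2?"},
        {"role": "assistant", "content": "2 + 2 = 4."},
    ],
}

line = json.dumps(record)  # one line of the .jsonl file
parsed = json.loads(line)
print(isinstance(parsed["messages"], list))  # a list of message dicts selects the chat path
```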

When the value for a JSON key is a list of message dicts (e.g.
``[{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]``),
``tokenizer.apply_chat_template`` is automatically used to render the conversation
into a single text string before tokenization.
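The list-vs-string dispatch described above can be sketched with a stub tokenizer, so the example runs without downloading a model (``StubTokenizer`` and ``render`` are hypothetical stand-ins; a real ``transformers`` tokenizer exposes the same ``apply_chat_template(messages, tokenize=False)`` call):

```python
class StubTokenizer:
    """Stand-in for a transformers tokenizer with a chat template."""

    def apply_chat_template(self, messages, tokenize=False, **kwargs):
        # Render role/content pairs into one string, as a chat template does.
        return "\n".join(f"<|{m['role']}|>{m['content']}" for m in messages)


def render(value, tokenizer):
    if isinstance(value, list):  # chat format: list of message dicts
        return tokenizer.apply_chat_template(value, tokenize=False)
    return value  # plain pretraining text passes through unchanged


tok = StubTokenizer()
print(render("plain text", tok))                       # unchanged
print(render([{"role": "user", "content": "hi"}], tok))  # rendered conversation
```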

Usage to download and tokenize a dataset from Hugging Face Hub:

```bash
...
```

@@ -69,6 +87,7 @@

 class _Encoder:
     tokenizer: AutoTokenizer = None
+    _chat_template_logged: set[str] = set()

     def __init__(
         self,
         ...
@@ -97,21 +116,35 @@ def encode(self, json_line: str):
         doc_len = 0
         enc_len = 0
         for key in self.json_keys:
-            text = data[key]
+            value = data[key]
+
+            if isinstance(value, list):
+                if key not in _Encoder._chat_template_logged:
+                    _Encoder._chat_template_logged.add(key)
+                    print(f"Applying chat_template to '{key}' key")
+                kwargs = {}
+                tools = data.get("tools")
+                if tools:
+                    kwargs["tools"] = tools
+                text = _Encoder.tokenizer.apply_chat_template(value, tokenize=False, **kwargs)
+            else:
+                text = value

             # Truncate text by character length if specified
-            doc_len += len(text)
             if self.max_document_length is not None:
                 original_length = len(text)
                 text = text[: self.max_document_length]
-                # print(f"Document truncated from {original_length} to {self.max_document_length} characters")
+                if original_length != len(text):
+                    print(f"Document truncated from {original_length} to {len(text)} characters")
+            doc_len += len(text)

             # Tokenize the entire text as one document
             encoded = _Encoder.tokenizer.encode(text)
 
-            enc_len += len(encoded)
             if self.max_sequence_length is not None:
                 encoded = encoded[: self.max_sequence_length]
-                # print(f"Sequence truncated from {original_length} to {self.max_sequence_length} tokens")
+            enc_len += len(encoded)
 
             if len(encoded) > 0 and self.append_eod:
                 encoded.append(_Encoder.tokenizer.eos_token_id)
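The truncation and end-of-document handling in the hunk above can be sketched as standalone Python (``encode_doc`` is a hypothetical helper, and the character-level "tokenizer" is a stand-in so the example runs without a model):

```python
EOS_ID = 0  # stand-in for tokenizer.eos_token_id


def encode_doc(text, max_document_length=None, max_sequence_length=None, append_eod=True):
    if max_document_length is not None:
        text = text[:max_document_length]        # truncate by characters first
    encoded = [ord(c) for c in text]             # stand-in for tokenizer.encode
    if max_sequence_length is not None:
        encoded = encoded[:max_sequence_length]  # then truncate by tokens
    if encoded and append_eod:
        encoded.append(EOS_ID)                   # append eos only to non-empty docs
    return encoded


print(encode_doc("abcdef", max_document_length=4, max_sequence_length=3))
```

Note that the empty-document check means ``append_eod`` never emits a lone EOS token for a record that truncates to nothing.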
tests/gpu_megatron/torch/utils/plugins/test_megatron_preprocess_data.py
@@ -17,6 +17,8 @@
 import os
 from pathlib import Path
 
+import pytest
+
 from modelopt.torch.utils.dataset_utils import download_hf_dataset_as_jsonl
 from modelopt.torch.utils.plugins.megatron_preprocess_data import megatron_preprocess_data

@@ -65,19 +67,27 @@ def test_megatron_preprocess_data_with_minipile_jsonl(tmp_path):
     assert os.path.getsize(expected_idx_file) > 0, "Index file should not be empty"


-def test_megatron_preprocess_data_with_hf_dataset(tmp_path):
+@pytest.mark.parametrize(
+    ("hf_dataset", "hf_split", "json_keys"),
+    [
+        ("nanotron/minipile_100_samples", "train", ["text"]),
+        ("HuggingFaceTB/everyday-conversations-llama3.1-2k", "test_sft", ["messages"]),
+    ],
+)
+def test_megatron_preprocess_data_with_hf_dataset(tmp_path, hf_dataset, hf_split, json_keys):
     """Test megatron_preprocess_data with dataset download, --append_eod and --max_sequence_length.
 
     Downloads nanotron/minipile_100_samples train split from Hugging Face and tokenizes it.
     """
Comment on lines 78 to 81
⚠️ Potential issue | 🟡 Minor

Update the docstring to reflect parameterized test behavior.

The docstring still references only "nanotron/minipile_100_samples train split", but the test is now parameterized to run against multiple datasets including the chat-based everyday-conversations dataset.

📝 Proposed fix
-    """Test megatron_preprocess_data with dataset download, --append_eod and --max_sequence_length.
-
-    Downloads nanotron/minipile_100_samples train split from Hugging Face and tokenizes it.
-    """
+    """Test megatron_preprocess_data with dataset download, --append_eod and --max_sequence_length.
+
+    Downloads parameterized HuggingFace datasets and tokenizes them with the specified json_keys.
+    Tests both plain text and chat-based datasets.
+    """
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tests/gpu_megatron/torch/utils/plugins/test_megatron_preprocess_data.py`
around lines 78 - 81, Update the module/test docstring for
test_megatron_preprocess_data to describe that the test is parameterized and
runs against multiple datasets (e.g., "nanotron/minipile_100_samples" and the
chat-based "everyday-conversations"), and mention that each parameterized run
downloads the specified dataset split from Hugging Face, tokenizes it, and
validates behavior for --append_eod and --max_sequence_length; locate the
docstring at the top of
tests/gpu_megatron/torch/utils/plugins/test_megatron_preprocess_data.py near the
test function name test_megatron_preprocess_data and replace the single-dataset
wording with a concise description of the parameterized behavior and datasets.

     megatron_preprocess_data(
-        hf_dataset="nanotron/minipile_100_samples",
-        hf_split="train",
+        hf_dataset=hf_dataset,
+        hf_split=hf_split,
         hf_max_samples_per_split=10,
         output_dir=tmp_path,
-        tokenizer_name_or_path="gpt2",
-        json_keys=["text"],
+        tokenizer_name_or_path="Qwen/Qwen3-0.6B",
+        json_keys=json_keys,
         append_eod=True,
-        max_sequence_length=512,
+        max_sequence_length=32,
         workers=4,
     )
