diff --git a/modelopt/torch/utils/plugins/megatron_preprocess_data.py b/modelopt/torch/utils/plugins/megatron_preprocess_data.py index fbe298d5e..81afc83a8 100644 --- a/modelopt/torch/utils/plugins/megatron_preprocess_data.py +++ b/modelopt/torch/utils/plugins/megatron_preprocess_data.py @@ -15,9 +15,12 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -"""Processing large data to tokenize for pretraining. +"""Processing large pretraining and post-training datasets to tokenize for usage in Megatron pretraining scripts. -Usage to tokenize one or more JSONL files: +We apply chat_template to the data if the JSON value is a list of message dicts (e.g. Nemotron-Post-Training-Dataset-v2) +so that we can tokenize the data for usage in Megatron pretraining scripts. + +Usage to tokenize one or more JSONL files (pretraining, ``text`` key): ```bash python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ @@ -37,6 +40,21 @@ --tokenizer Qwen/Qwen3-0.6B ``` +Usage to tokenize a post-training dataset with ``messages`` key (chat format): + +```bash +python -m modelopt.torch.utils.plugins.megatron_preprocess_data \ + --jsonl_paths path/to/sft_data.jsonl \ + --json_keys messages \ + --output_dir /path/to/tokenized/Qwen3/ \ + --tokenizer Qwen/Qwen3-0.6B +``` + +When the value for a JSON key is a list of message dicts (e.g. +``[{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]``), +``tokenizer.apply_chat_template`` is automatically used to render the conversation +into a single text string before tokenization.
+ Usage to download and tokenize a dataset from Hugging Face Hub: ```bash @@ -69,6 +87,7 @@ class _Encoder: tokenizer: AutoTokenizer = None + _chat_template_logged: set[str] = set() def __init__( self, @@ -97,21 +116,35 @@ def encode(self, json_line: str): doc_len = 0 enc_len = 0 for key in self.json_keys: - text = data[key] + value = data[key] + + if isinstance(value, list): + if key not in _Encoder._chat_template_logged: + _Encoder._chat_template_logged.add(key) + print(f"Applying chat_template to '{key}' key") + kwargs = {} + tools = data.get("tools") + if tools: + kwargs["tools"] = tools + text = _Encoder.tokenizer.apply_chat_template(value, tokenize=False, **kwargs) + else: + text = value # Truncate text by character length if specified - doc_len += len(text) if self.max_document_length is not None: + original_length = len(text) text = text[: self.max_document_length] - # print(f"Document truncated from {original_length} to {self.max_document_length} characters") + if original_length != len(text): + print(f"Document truncated from {original_length} to {len(text)} characters") + doc_len += len(text) # Tokenize the entire text as one document encoded = _Encoder.tokenizer.encode(text) - enc_len += len(encoded) if self.max_sequence_length is not None: encoded = encoded[: self.max_sequence_length] # print(f"Sequence truncated from {original_length} to {self.max_sequence_length} tokens") + enc_len += len(encoded) if len(encoded) > 0 and self.append_eod: encoded.append(_Encoder.tokenizer.eos_token_id) diff --git a/tests/gpu_megatron/torch/utils/plugins/test_megatron_preprocess_data.py b/tests/gpu_megatron/torch/utils/plugins/test_megatron_preprocess_data.py index a0c3b9c51..f3739a209 100644 --- a/tests/gpu_megatron/torch/utils/plugins/test_megatron_preprocess_data.py +++ b/tests/gpu_megatron/torch/utils/plugins/test_megatron_preprocess_data.py @@ -17,6 +17,8 @@ import os from pathlib import Path +import pytest + from modelopt.torch.utils.dataset_utils import 
download_hf_dataset_as_jsonl from modelopt.torch.utils.plugins.megatron_preprocess_data import megatron_preprocess_data @@ -65,19 +67,27 @@ def test_megatron_preprocess_data_with_minipile_jsonl(tmp_path): assert os.path.getsize(expected_idx_file) > 0, "Index file should not be empty" -def test_megatron_preprocess_data_with_hf_dataset(tmp_path): +@pytest.mark.parametrize( + ("hf_dataset", "hf_split", "json_keys"), + [ + ("nanotron/minipile_100_samples", "train", ["text"]), + ("HuggingFaceTB/everyday-conversations-llama3.1-2k", "test_sft", ["messages"]), + ], +) +def test_megatron_preprocess_data_with_hf_dataset(tmp_path, hf_dataset, hf_split, json_keys): """Test megatron_preprocess_data with dataset download, --append_eod and --max_sequence_length. Downloads nanotron/minipile_100_samples train split from Hugging Face and tokenizes it. """ megatron_preprocess_data( - hf_dataset="nanotron/minipile_100_samples", - hf_split="train", + hf_dataset=hf_dataset, + hf_split=hf_split, + hf_max_samples_per_split=10, output_dir=tmp_path, - tokenizer_name_or_path="gpt2", - json_keys=["text"], + tokenizer_name_or_path="Qwen/Qwen3-0.6B", + json_keys=json_keys, append_eod=True, - max_sequence_length=512, + max_sequence_length=32, workers=4, )