-
Notifications
You must be signed in to change notification settings - Fork 295
Support megatron tokenization for post training datasets #1018
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -17,6 +17,8 @@ | |||||||||||||||||||
| import os | ||||||||||||||||||||
| from pathlib import Path | ||||||||||||||||||||
|
|
||||||||||||||||||||
| import pytest | ||||||||||||||||||||
|
|
||||||||||||||||||||
| from modelopt.torch.utils.dataset_utils import download_hf_dataset_as_jsonl | ||||||||||||||||||||
| from modelopt.torch.utils.plugins.megatron_preprocess_data import megatron_preprocess_data | ||||||||||||||||||||
|
|
||||||||||||||||||||
|
|
@@ -65,19 +67,27 @@ def test_megatron_preprocess_data_with_minipile_jsonl(tmp_path): | |||||||||||||||||||
| assert os.path.getsize(expected_idx_file) > 0, "Index file should not be empty" | ||||||||||||||||||||
|
|
||||||||||||||||||||
|
|
||||||||||||||||||||
| def test_megatron_preprocess_data_with_hf_dataset(tmp_path): | ||||||||||||||||||||
| @pytest.mark.parametrize( | ||||||||||||||||||||
| ("hf_dataset", "hf_split", "json_keys"), | ||||||||||||||||||||
| [ | ||||||||||||||||||||
| ("nanotron/minipile_100_samples", "train", ["text"]), | ||||||||||||||||||||
| ("HuggingFaceTB/everyday-conversations-llama3.1-2k", "test_sft", ["messages"]), | ||||||||||||||||||||
| ], | ||||||||||||||||||||
| ) | ||||||||||||||||||||
| def test_megatron_preprocess_data_with_hf_dataset(tmp_path, hf_dataset, hf_split, json_keys): | ||||||||||||||||||||
| """Test megatron_preprocess_data with dataset download, --append_eod and --max_sequence_length. | ||||||||||||||||||||
|
|
||||||||||||||||||||
| Downloads nanotron/minipile_100_samples train split from Hugging Face and tokenizes it. | ||||||||||||||||||||
| """ | ||||||||||||||||||||
|
Comment on lines
78
to
81
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Update the docstring to reflect parameterized test behavior. The docstring still references only "nanotron/minipile_100_samples train split", but the test is now parameterized to run against multiple datasets including the chat-based everyday-conversations dataset. 📝 Proposed fix- """Test megatron_preprocess_data with dataset download, --append_eod and --max_sequence_length.
-
- Downloads nanotron/minipile_100_samples train split from Hugging Face and tokenizes it.
- """
+ """Test megatron_preprocess_data with dataset download, --append_eod and --max_sequence_length.
+
+ Downloads parameterized HuggingFace datasets and tokenizes them with the specified json_keys.
+ Tests both plain text and chat-based datasets.
+ """📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||||
| megatron_preprocess_data( | ||||||||||||||||||||
| hf_dataset="nanotron/minipile_100_samples", | ||||||||||||||||||||
| hf_split="train", | ||||||||||||||||||||
| hf_dataset=hf_dataset, | ||||||||||||||||||||
| hf_split=hf_split, | ||||||||||||||||||||
| hf_max_samples_per_split=10, | ||||||||||||||||||||
| output_dir=tmp_path, | ||||||||||||||||||||
| tokenizer_name_or_path="gpt2", | ||||||||||||||||||||
| json_keys=["text"], | ||||||||||||||||||||
| tokenizer_name_or_path="Qwen/Qwen3-0.6B", | ||||||||||||||||||||
| json_keys=json_keys, | ||||||||||||||||||||
| append_eod=True, | ||||||||||||||||||||
| max_sequence_length=512, | ||||||||||||||||||||
| max_sequence_length=32, | ||||||||||||||||||||
| workers=4, | ||||||||||||||||||||
| ) | ||||||||||||||||||||
|
|
||||||||||||||||||||
|
|
||||||||||||||||||||
Uh oh!
There was an error while loading. Please reload this page.