From 0f413b688328cbeef638dd75e2c6e39112cac83d Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 20 Dec 2024 11:29:10 -0800
Subject: [PATCH 1/2] add keep_extensions param to existing scripts and docs

Signed-off-by: Sarah Yurick
---
 .../distributeddataclassification.rst          | 16 ++++++++--------
 docs/user-guide/documentdataset.rst            |  4 ++--
 docs/user-guide/qualityfiltering.rst           |  2 +-
 docs/user-guide/sparkother.rst                 |  2 +-
 docs/user-guide/taskdecontamination.rst        |  2 +-
 examples/classifier_filtering.py               |  2 +-
 examples/exact_deduplication.py                |  3 +--
 examples/identify_languages_and_fix_unicode.py |  2 +-
 examples/task_decontamination.py               |  2 +-
 nemo_curator/scripts/find_exact_duplicates.py  |  5 +++--
 .../fuzzy_deduplication/compute_minhashes.py   |  5 +++--
 .../scripts/prepare_fasttext_training_data.py  |  2 +-
 nemo_curator/utils/file_utils.py               |  2 +-
 tests/test_read_data.py                        |  1 -
 tutorials/dapt-curation/code/utils.py          | 14 +-------------
 .../red-pajama-v2-curation-tutorial.ipynb      | 17 +++--------------
 .../single_gpu_tutorial.ipynb                  |  7 +++----
 tutorials/tinystories/main.py                  |  8 +++-----
 .../zyda2-tutorial/1_fuzzy_dedup/0_minhash.py  |  3 +--
 19 files changed, 36 insertions(+), 63 deletions(-)

diff --git a/docs/user-guide/distributeddataclassification.rst b/docs/user-guide/distributeddataclassification.rst
index 257de441a..26c8d177d 100644
--- a/docs/user-guide/distributeddataclassification.rst
+++ b/docs/user-guide/distributeddataclassification.rst
@@ -61,7 +61,7 @@ Let's see how ``DomainClassifier`` works in a small excerpt taken from ``example
 
     from nemo_curator.classifiers import DomainClassifier
 
-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")
 
     domain_classifier = DomainClassifier(filter_by=["Games", "Sports"])
@@ -83,7 +83,7 @@ Using the ``MultilingualDomainClassifier`` is very similar to using the ``Domain
 
     from nemo_curator.classifiers import MultilingualDomainClassifier
 
-    files = get_all_files_paths_under("japanese_books_dataset/")
+    files = get_all_files_paths_under("japanese_books_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")
 
     multilingual_domain_classifier = MultilingualDomainClassifier(
@@ -106,7 +106,7 @@ Here's an example of how to use the ``QualityClassifier``:
 
     from nemo_curator.classifiers import QualityClassifier
 
-    files = get_all_files_paths_under("web_documents/")
+    files = get_all_files_paths_under("web_documents/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")
 
     quality_classifier = QualityClassifier(filter_by=["High", "Medium"])
@@ -134,7 +134,7 @@ NeMo Curator provides an easy way to annotate and filter your data using the saf
 
 .. code-block:: python
 
-    files = get_all_files_paths_under("unsafe_documents/")
+    files = get_all_files_paths_under("unsafe_documents/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")
 
     token = "hf_1234"  # Replace with your user access token
@@ -181,7 +181,7 @@ Here is a small example of how to use the ``InstructionDataGuardClassifier``:
 
     # The model expects instruction-response style text data. For example:
     # "Instruction: {instruction}. Input: {input_}. Response: {response}."
-    files = get_all_files_paths_under("instruction_input_response_dataset/")
+    files = get_all_files_paths_under("instruction_input_response_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")
 
     token = "hf_1234"  # Replace with your user access token
@@ -210,7 +210,7 @@ To use the FineWeb Educational Content Classifier, you can follow this example:
 
     from nemo_curator.classifiers import FineWebEduClassifier
 
-    files = get_all_files_paths_under("web_documents/")
+    files = get_all_files_paths_under("web_documents/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")
 
     edu_classifier = FineWebEduClassifier(
@@ -247,7 +247,7 @@ Let's see how ``ContentTypeClassifier`` works in a small excerpt taken from ``ex
 
     from nemo_curator.classifiers import ContentTypeClassifier
 
-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")
 
     content_type_classifier = ContentTypeClassifier(filter_by=["Blogs", "News"])
@@ -269,7 +269,7 @@ Here's an example of how to use the ``PromptTaskComplexityClassifier``:
 
     from nemo_curator.classifiers import PromptTaskComplexityClassifier
 
-    files = get_all_files_paths_under("my_dataset/")
+    files = get_all_files_paths_under("my_dataset/", keep_extensions="jsonl")
     input_dataset = DocumentDataset.read_json(files, backend="cudf")
 
     classifier = PromptTaskComplexityClassifier()
diff --git a/docs/user-guide/documentdataset.rst b/docs/user-guide/documentdataset.rst
index 07ef41a22..24ade0e79 100644
--- a/docs/user-guide/documentdataset.rst
+++ b/docs/user-guide/documentdataset.rst
@@ -43,7 +43,7 @@ You could read, filter the dataset, and write it using the following methods
     from nemo_curator.utils.file_utils import get_all_files_paths_under
     from nemo_curator.filters import WordCountFilter
 
-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     books = DocumentDataset.read_json(files, add_filename=True)
 
     filter_step = nc.ScoreFilter(
@@ -58,7 +58,7 @@ You could read, filter the dataset, and write it using the following methods
 
 Let's walk through this code line by line.
 
-* ``files = get_all_files_paths_under("books_dataset/")`` This retrieves a list of all files in the given directory.
+* ``files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")`` This retrieves a list of all files in the given directory, then filters the list to include only files ending with ".jsonl".
   In our case, this is equivalent to writing
 
   .. code-block:: python
diff --git a/docs/user-guide/qualityfiltering.rst b/docs/user-guide/qualityfiltering.rst
index ba2c34ad6..5060fb1ec 100644
--- a/docs/user-guide/qualityfiltering.rst
+++ b/docs/user-guide/qualityfiltering.rst
@@ -35,7 +35,7 @@ Let's examine this small example:
     from nemo_curator.utils.file_utils import get_all_files_paths_under
     from nemo_curator.filters import WordCountFilter
 
-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     books = DocumentDataset.read_json(files, add_filename=True)
 
     filter_step = nc.ScoreFilter(
diff --git a/docs/user-guide/sparkother.rst b/docs/user-guide/sparkother.rst
index 0da312e15..19a09c3d4 100644
--- a/docs/user-guide/sparkother.rst
+++ b/docs/user-guide/sparkother.rst
@@ -91,4 +91,4 @@ The following code snippet demonstrates how to read output from a Spark DataFram
     stories_dataset = DocumentDataset.read_parquet(processed_files, backend="pandas")
 
 It is worth noting that Spark typically tends to create checksum and other marker files which can vary by Spark distribution,
-so it is advisable to ignore them when reading data into a NeMo Curator ``DocumentDataset``.
\ No newline at end of file
+so it is advisable to ignore them when reading data into a NeMo Curator ``DocumentDataset``.
diff --git a/docs/user-guide/taskdecontamination.rst b/docs/user-guide/taskdecontamination.rst
index 46a0d9804..e62c78e06 100644
--- a/docs/user-guide/taskdecontamination.rst
+++ b/docs/user-guide/taskdecontamination.rst
@@ -28,7 +28,7 @@ Let's examine this small example:
     from nemo_curator.utils.file_utils import get_all_files_paths_under
     from nemo_curator.tasks import Winogrande, Squad, TriviaQA,
 
-    files = get_all_files_paths_under("books_dataset/")
+    files = get_all_files_paths_under("books_dataset/", keep_extensions="jsonl")
     books = DocumentDataset.read_json(files, add_filename=True)
 
     downstream_tasks = [
diff --git a/examples/classifier_filtering.py b/examples/classifier_filtering.py
index a6476e395..03a09b0ad 100644
--- a/examples/classifier_filtering.py
+++ b/examples/classifier_filtering.py
@@ -27,7 +27,7 @@
 
 
 def load_dataset(input_data_dir):
-    files = list(get_all_files_paths_under(input_data_dir))
+    files = list(get_all_files_paths_under(input_data_dir, keep_extensions="jsonl"))
     raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True)
     dataset = DocumentDataset(raw_data)
 
diff --git a/examples/exact_deduplication.py b/examples/exact_deduplication.py
index 81a2d66c1..ca2cda728 100644
--- a/examples/exact_deduplication.py
+++ b/examples/exact_deduplication.py
@@ -17,8 +17,7 @@
 
 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.modules import ExactDuplicates
-from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk
-from nemo_curator.utils.file_utils import get_all_files_paths_under
+from nemo_curator.utils.distributed_utils import get_client, write_to_disk
 from nemo_curator.utils.script_utils import ArgumentHelper
 
 
diff --git a/examples/identify_languages_and_fix_unicode.py b/examples/identify_languages_and_fix_unicode.py
index 92f628e33..7c58e3e62 100644
--- a/examples/identify_languages_and_fix_unicode.py
+++ b/examples/identify_languages_and_fix_unicode.py
@@ -28,7 +28,7 @@
 
 
 def load_dataset(input_data_dir):
-    files = list(get_all_files_paths_under(input_data_dir))
+    files = list(get_all_files_paths_under(input_data_dir, keep_extensions="jsonl"))
     raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True)
     dataset = DocumentDataset(raw_data)
 
diff --git a/examples/task_decontamination.py b/examples/task_decontamination.py
index daf707c31..f6162c018 100644
--- a/examples/task_decontamination.py
+++ b/examples/task_decontamination.py
@@ -44,7 +44,7 @@
 
 
 def load_dataset(input_data_dir):
-    files = list(get_all_files_paths_under(input_data_dir))
+    files = list(get_all_files_paths_under(input_data_dir, keep_extensions="jsonl"))
     raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True)
     dataset = DocumentDataset(raw_data)
 
diff --git a/nemo_curator/scripts/find_exact_duplicates.py b/nemo_curator/scripts/find_exact_duplicates.py
index e71ef5e11..5f6fc2435 100644
--- a/nemo_curator/scripts/find_exact_duplicates.py
+++ b/nemo_curator/scripts/find_exact_duplicates.py
@@ -55,8 +55,9 @@ def main(args):
         if num_files is not None and num_files <= 0:
             logger.info(f"Processed {num_files}... quitting")
             break
-        files = get_all_files_paths_under(root=data_path, recurse_subdirectories=False)
-        files = [f for f in files if f.endswith(".jsonl")]
+        files = get_all_files_paths_under(
+            root=data_path, recurse_subdirectories=False, keep_extensions="jsonl"
+        )
         df = read_data(
             files[:num_files] if num_files else files,
             file_type="jsonl",
diff --git a/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py b/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py
index aa4e1f63f..771383eb0 100644
--- a/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py
+++ b/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py
@@ -70,8 +70,9 @@ def main(args):
             print(f"Processed {args.num_files}... quitting")
             break
 
-        files = get_all_files_paths_under(root=data_path, recurse_subdirectories=False)
-        files = [f for f in files if f.endswith(".jsonl")]
+        files = get_all_files_paths_under(
+            root=data_path, recurse_subdirectories=False, keep_extensions="jsonl"
+        )
         df = read_data(
             files[:num_files] if num_files else files,
             file_type="jsonl",
diff --git a/nemo_curator/scripts/prepare_fasttext_training_data.py b/nemo_curator/scripts/prepare_fasttext_training_data.py
index c3a95683c..d8001b365 100644
--- a/nemo_curator/scripts/prepare_fasttext_training_data.py
+++ b/nemo_curator/scripts/prepare_fasttext_training_data.py
@@ -32,7 +32,7 @@ def sample_rows(df, n, seed):
 def main(args):
     client = get_client(**ArgumentHelper.parse_client_args(args))
     # Get local path
-    files = list(get_all_files_paths_under(args.input_data_dir))
+    files = list(get_all_files_paths_under(args.input_data_dir, keep_extensions="jsonl"))
     raw_data = read_data(files, file_type="jsonl", backend="pandas")
     dataset = DocumentDataset(raw_data)
     text_field = args.input_json_field
diff --git a/nemo_curator/utils/file_utils.py b/nemo_curator/utils/file_utils.py
index f1c957b47..eeca2042b 100644
--- a/nemo_curator/utils/file_utils.py
+++ b/nemo_curator/utils/file_utils.py
@@ -446,7 +446,7 @@ def reshard_jsonl(
     # Output file size in bytes
     blocksize = parse_str_of_num_bytes(output_file_size)
 
-    input_files = list(get_all_files_paths_under(input_dir))
+    input_files = list(get_all_files_paths_under(input_dir, keep_extensions="jsonl"))
 
     # Read in the dask bag
     b = db.read_text(input_files, blocksize=blocksize)
diff --git a/tests/test_read_data.py b/tests/test_read_data.py
index a619be3a4..f3bd84667 100644
--- a/tests/test_read_data.py
+++ b/tests/test_read_data.py
@@ -9,7 +9,6 @@
     read_data_blocksize,
     read_data_files_per_partition,
 )
-from nemo_curator.utils.file_utils import get_all_files_paths_under
 
 NUM_FILES = 5
 NUM_RECORDS = 100
diff --git a/tutorials/dapt-curation/code/utils.py b/tutorials/dapt-curation/code/utils.py
index dc91b2258..2d601688e 100755
--- a/tutorials/dapt-curation/code/utils.py
+++ b/tutorials/dapt-curation/code/utils.py
@@ -12,13 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import json
 import os
-import re
-
-import dask.dataframe as dd
-import pandas as pd
-import yaml
 
 from nemo_curator import (
     ExactDuplicates,
@@ -33,7 +27,6 @@
 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.filters import (
     DocumentFilter,
-    RepeatedLinesFilter,
     RepeatedParagraphsFilter,
     RepeatingTopNGramsFilter,
     UrlsFilter,
@@ -46,12 +39,7 @@
 from nemo_curator.modifiers import DocumentModifier
 from nemo_curator.modifiers.pii_modifier import PiiModifier
 from nemo_curator.modifiers.unicode_reformatter import UnicodeReformatter
-from nemo_curator.pii.constants import DEFAULT_LANGUAGE, DEFAULT_MAX_DOC_SIZE
-from nemo_curator.utils.distributed_utils import get_client
-from nemo_curator.utils.file_utils import (
-    expand_outdir_and_mkdir,
-    get_all_files_paths_under,
-)
+from nemo_curator.utils.file_utils import expand_outdir_and_mkdir
 
 
 class QuotationUnifier(DocumentModifier):
diff --git a/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb b/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb
index d0f690ead..a59912580 100644
--- a/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb
+++ b/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb
@@ -121,31 +121,19 @@
    "source": [
     "import os\n",
     "import time\n",
-    "from dask.distributed import Client\n",
     "import warnings\n",
     "import dask.dataframe as dd\n",
     "import dask_cudf\n",
     "import cudf\n",
-    "import gzip\n",
-    "import json\n",
-    "import dask.bag as db\n",
-    "import glob\n",
     "from dask.distributed import wait\n",
     "import numpy as np\n",
     "\n",
     "from nemo_curator import get_client\n",
-    "from nemo_curator.datasets import DocumentDataset\n",
     "from nemo_curator.utils.distributed_utils import (\n",
     "    get_num_workers,\n",
     "    read_data,\n",
     "    write_to_disk,\n",
     ")\n",
-    "from nemo_curator.utils.file_utils import (\n",
-    "    expand_outdir_and_mkdir, \n",
-    "    get_all_files_paths_under, \n",
-    "    separate_by_metadata,\n",
-    "    get_batched_files,\n",
-    ")\n",
     "\n",
     "warnings.filterwarnings('ignore')\n",
     "base_dir = \"/path/to/data\""
@@ -1473,8 +1461,9 @@
    }
   ],
   "source": [
-    "files = get_all_files_paths_under(root=input_data_dir, recurse_subdirectories=False)\n",
-    "files = [f for f in files if f.endswith(\".jsonl\")]\n",
+    "files = get_all_files_paths_under(\n",
+    "    root=input_data_dir, recurse_subdirectories=False, keep_extensions=\"jsonl\"\n",
+    ")\n",
     "df = read_data(\n",
     "    files,\n",
     "    file_type=\"jsonl\",\n",
diff --git a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb
index 3170b3502..ffd8fae70 100644
--- a/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb
+++ b/tutorials/single_node_tutorial/single_gpu_tutorial.ipynb
@@ -122,7 +122,6 @@
   },
   "outputs": [],
   "source": [
-    "import argparse\n",
     "import os\n",
     "\n",
     "from nemo_curator.utils.distributed_utils import get_client,get_num_workers\n",
@@ -130,7 +129,6 @@
     "from nemo_curator.utils.distributed_utils import read_data,write_to_disk\n",
     "from nemo_curator.datasets import DocumentDataset\n",
     "\n",
-    "import sys\n",
     "import pandas as pd\n",
     "import time\n",
     "import cudf\n",
@@ -1142,8 +1140,9 @@
     "print(f\"Computing minhashes for {minhash_data_path}\")\n",
     "\n",
     "# Load data. Only the [minhash_id_field, text_field] columns are needed\n",
-    "files = get_all_files_paths_under(root=minhash_data_path, recurse_subdirectories=False)\n",
-    "files = [f for f in files if f.endswith(\".jsonl\")]\n",
+    "files = get_all_files_paths_under(\n",
+    "    root=minhash_data_path, recurse_subdirectories=False, keep_extensions=\"jsonl\"\n",
+    ")\n",
     "df = read_data(\n",
     "    files,\n",
     "    file_type=\"jsonl\",\n",
diff --git a/tutorials/tinystories/main.py b/tutorials/tinystories/main.py
index 84e9948ee..eb965f0b3 100644
--- a/tutorials/tinystories/main.py
+++ b/tutorials/tinystories/main.py
@@ -176,9 +176,7 @@ def run_curation_pipeline(args: Any, jsonl_dir: str) -> None:
     client = get_client(**ArgumentHelper.parse_client_args(args))
     print(f"Running curation pipeline on '{jsonl_dir}'...")
-    files = [
-        fp
-        for fp in get_all_files_paths_under(jsonl_dir, recurse_subdirectories=False)
-        if fp.endswith(".jsonl")
-    ]
+    files = get_all_files_paths_under(
+        jsonl_dir, recurse_subdirectories=False, keep_extensions="jsonl"
+    )
     print("Reading the data...")
     orig_dataset = DocumentDataset.read_json(files, add_filename=True)
diff --git a/tutorials/zyda2-tutorial/1_fuzzy_dedup/0_minhash.py b/tutorials/zyda2-tutorial/1_fuzzy_dedup/0_minhash.py
index fcbbf9dde..ce86c9213 100644
--- a/tutorials/zyda2-tutorial/1_fuzzy_dedup/0_minhash.py
+++ b/tutorials/zyda2-tutorial/1_fuzzy_dedup/0_minhash.py
@@ -13,8 +13,7 @@
 
 
 def read_folder(input_folder, columns=["nemo_id", "text"]):
-    data_paths = get_all_files_paths_under(input_folder)
-    data_paths = [f for f in data_paths if f.endswith(".parquet")]
+    data_paths = get_all_files_paths_under(input_folder, keep_extensions="parquet")
    data_paths.sort()
     logging.info(f"Number of files being read: {len(data_paths)}")
     text_ddf = dask_cudf.read_parquet(

From f564801391bd58fcf27f7b0b39f6fd9bbb1a0d8c Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Fri, 20 Dec 2024 11:37:22 -0800
Subject: [PATCH 2/2] run black

Signed-off-by: Sarah Yurick
---
 nemo_curator/scripts/prepare_fasttext_training_data.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nemo_curator/scripts/prepare_fasttext_training_data.py b/nemo_curator/scripts/prepare_fasttext_training_data.py
index d8001b365..572cb1108 100644
--- a/nemo_curator/scripts/prepare_fasttext_training_data.py
+++ b/nemo_curator/scripts/prepare_fasttext_training_data.py
@@ -32,7 +32,9 @@ def sample_rows(df, n, seed):
 def main(args):
     client = get_client(**ArgumentHelper.parse_client_args(args))
     # Get local path
-    files = list(get_all_files_paths_under(args.input_data_dir, keep_extensions="jsonl"))
+    files = list(
+        get_all_files_paths_under(args.input_data_dir, keep_extensions="jsonl")
+    )
     raw_data = read_data(files, file_type="jsonl", backend="pandas")
     dataset = DocumentDataset(raw_data)
     text_field = args.input_json_field
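
For anyone trying the new parameter locally, here is a minimal sketch of the call pattern this series settles on. The directory name below is hypothetical, and the read step simply mirrors the documentation examples touched above rather than any one script.

from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.file_utils import get_all_files_paths_under

# Hypothetical input directory; point this at a real folder of .jsonl files.
input_dir = "books_dataset/"

# keep_extensions filters the returned paths by extension, replacing the old
# manual pattern of [f for f in files if f.endswith(".jsonl")].
files = get_all_files_paths_under(input_dir, keep_extensions="jsonl")

# The filtered list feeds straight into DocumentDataset, as in the updated docs.
dataset = DocumentDataset.read_json(files, add_filename=True)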