
Introduces automatic subset-level grouping for folder-based dataset builders #7066 #7646

Open · wants to merge 4 commits into `main`
24 changes: 24 additions & 0 deletions docs/source/repository_structure.mdx
@@ -277,3 +277,27 @@ my_dataset_repository/
├── shard_0.csv
└── shard_1.csv
```

#### Automatic Subset Grouping

When using folder-based datasets, `datasets` automatically groups files into subsets when their names differ only by trailing digits, underscores, or hyphens, or by standard sharding patterns such as `-00000-of-00003`.

For example:

```bash
train0.jsonl
train1.jsonl
train2.jsonl
animals.jsonl
metadata.jsonl
```

will be grouped into:

* `"train"` subset → `train0.jsonl`, `train1.jsonl`, `train2.jsonl`
* `"animals"` subset → `animals.jsonl`
* `"metadata"` subset → `metadata.jsonl`

This lets users provide multiple logical subsets per split without needing a nested folder structure. It is especially useful for datasets that are sharded or organized by topic.

This grouping is enabled by default in all builders that inherit from `FolderBasedBuilder`.
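
For illustration, a minimal loading sketch under the layout above (the repository name is hypothetical, and the exact split keys follow the `{split}_{subset}` naming rule introduced in the builder change below):

```python
from datasets import load_dataset

# Hypothetical repository using the flat layout shown above. A subset whose
# name differs from its split is exposed as "<split>_<subset>".
ds = load_dataset("username/my_dataset_repository")
print(list(ds))  # e.g. ['train', 'train_animals', 'train_metadata']
```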
23 changes: 23 additions & 0 deletions src/datasets/data_files.py
@@ -19,7 +19,30 @@
from .utils import tqdm as hf_tqdm
from .utils.file_utils import _prepare_path_and_storage_options, is_local_path, is_relative_path, xbasename, xjoin
from .utils.py_utils import string_to_dict
import re
from collections import defaultdict
from pathlib import Path

def group_files_by_subset(filepaths: list[str]) -> dict[str, list[str]]:
    """
    Group files into subsets according to a heuristic:
    - Files whose names differ only by trailing digits, separators, or shard suffixes are grouped together.
    - All other files are placed in their own groups.
    """

    def normalize(filename: str) -> str:
        # Reduce a file name to its subset key
        name = Path(filename).stem
        # Strip standard sharding suffixes like -00000-of-00003
        name = re.sub(r"(-\d{5,}-of-\d{5,})$", "", name)
        # Strip trailing digits, underscores, or hyphens (e.g. train0, train_1, train-2)
        name = re.sub(r"[\d_-]+$", "", name)
        return name

    groups = defaultdict(list)
    for path in filepaths:
        key = normalize(path)
        groups[key].append(path)
    return dict(groups)

SingleOriginMetadata = Union[tuple[str, str], tuple[str], tuple[()]]

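A quick sketch of the heuristic in action (the file names are hypothetical):

```python
from datasets.data_files import group_files_by_subset

files = [
    "train0.jsonl",
    "train_1.jsonl",
    "data-00000-of-00002.parquet",
    "animals.jsonl",
]
print(group_files_by_subset(files))
# {'train': ['train0.jsonl', 'train_1.jsonl'],
#  'data': ['data-00000-of-00002.parquet'],
#  'animals': ['animals.jsonl']}
```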
src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
@@ -15,7 +15,7 @@
from datasets import config
from datasets.features.features import FeatureType, _visit, _visit_with_path, _VisitPath, require_storage_cast
from datasets.utils.file_utils import readline

from datasets.data_files import group_files_by_subset

logger = datasets.utils.logging.get_logger(__name__)

@@ -120,51 +120,57 @@ def analyze(files_or_archives, downloaded_files_or_dirs, split):
data_files = self.config.data_files
splits = []
for split_name, files in data_files.items():
    if isinstance(files, str):
        files = [files]
    files, archives = self._split_files_and_archives(files)
    downloaded_files = dl_manager.download(files)
    downloaded_dirs = dl_manager.download_and_extract(archives)
    if do_analyze:  # drop_metadata is None or False, drop_labels is None or False
        logger.info(f"Searching for labels and/or metadata files in {split_name} data files...")
        analyze(files, downloaded_files, split_name)
        analyze(archives, downloaded_dirs, split_name)

        if metadata_files:
            # add metadata if `metadata_files` are found and `drop_metadata` is None (default) or False
            add_metadata = not self.config.drop_metadata
            # if `metadata_files` are found, don't add labels
            add_labels = False
        else:
            # if `metadata_files` are not found, don't add metadata
            add_metadata = False
            # if `metadata_files` are not found and `drop_labels` is None (default) -
            # add labels if files are on the same level in directory hierarchy and there is more than one label
            add_labels = (
                (len(labels) > 1 and len(path_depths) == 1)
                if self.config.drop_labels is None
                else not self.config.drop_labels
            )

        if add_labels:
            logger.info("Adding the labels inferred from data directories to the dataset's features...")
        if add_metadata:
            logger.info("Adding metadata to the dataset...")
    else:
        add_labels, add_metadata, metadata_files = False, False, {}

-   splits.append(
-       datasets.SplitGenerator(
-           name=split_name,
-           gen_kwargs={
-               "files": tuple(zip(files, downloaded_files))
-               + tuple((None, dl_manager.iter_files(downloaded_dir)) for downloaded_dir in downloaded_dirs),
-               "metadata_files": metadata_files.get(split_name, []),
-               "add_labels": add_labels,
-               "add_metadata": add_metadata,
-           },
-       )
-   )
+   grouped = group_files_by_subset(files)
+   for subset_name, grouped_files in grouped.items():
+       grouped_downloaded_files = [
+           downloaded_files[files.index(original)] for original in grouped_files if original in files
+       ]
+
+       split_id = (
+           f"{split_name}_{subset_name}" if subset_name != split_name else split_name
+       )
+
+       splits.append(
+           datasets.SplitGenerator(
+               name=split_id,
+               gen_kwargs={
+                   "files": tuple(zip(grouped_files, grouped_downloaded_files))
+                   + tuple((None, dl_manager.iter_files(downloaded_dir)) for downloaded_dir in downloaded_dirs),
+                   "metadata_files": metadata_files.get(split_name, []),
+                   "add_labels": add_labels,
+                   "add_metadata": add_metadata,
+               },
+           )
+       )

if add_metadata:
# Verify that:
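To make the naming rule above concrete, a small sketch (the inputs are hypothetical):

```python
# Mirrors the split_id rule in the builder change above: a subset whose name
# matches its split keeps the plain split name; any other subset is exposed
# as "<split>_<subset>".
split_name = "train"
grouped = {"train": ["train0.jsonl", "train1.jsonl"], "animals": ["animals.jsonl"]}

for subset_name in grouped:
    split_id = f"{split_name}_{subset_name}" if subset_name != split_name else split_name
    print(split_id)
# train
# train_animals
```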
18 changes: 18 additions & 0 deletions tests/test_data_files.py
@@ -21,6 +21,7 @@
resolve_pattern,
)
from datasets.fingerprint import Hasher
from datasets.data_files import group_files_by_subset


_TEST_PATTERNS = ["*", "**", "**/*", "*.txt", "data/*", "**/*.txt", "**/train.txt"]
@@ -509,6 +510,23 @@ def test_DataFilesPatternsDict(text_file):
assert isinstance(data_files_dict["train"], DataFilesList)


def test_group_files_by_subset():
files = [
"animals.jsonl",
"trees.jsonl",
"metadata.jsonl",
"train0.jsonl",
"train1.jsonl",
"train2.jsonl",
]
groups = group_files_by_subset(files)
assert "train" in groups
assert set(groups["train"]) == {"train0.jsonl", "train1.jsonl", "train2.jsonl"}
assert "animals" in groups
assert "trees" in groups
assert "metadata" in groups


def mock_fs(file_paths: List[str]):
"""
Set up a mock filesystem for fsspec containing the provided files
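To run just the new test locally, something like:

```bash
pytest tests/test_data_files.py::test_group_files_by_subset
```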