
Introduces automatic subset-level grouping for folder-based dataset builders #7066 #7646

Open · wants to merge 4 commits into `main`
24 changes: 24 additions & 0 deletions docs/source/repository_structure.mdx
@@ -277,3 +277,27 @@ my_dataset_repository/
├── shard_0.csv
└── shard_1.csv
```

#### Automatic Subset Grouping

When using folder-based datasets, `datasets` automatically groups files into subsets when their names differ only by trailing digits, underscores, or hyphens, or by standard sharding patterns such as `-00000-of-00003`.

For example:

```bash
train0.jsonl
train1.jsonl
train2.jsonl
animals.jsonl
metadata.jsonl
```

will be grouped into:

* `"train"` subset → `train0.jsonl`, `train1.jsonl`, `train2.jsonl`
* `"animals"` subset → `animals.jsonl`
* `"metadata"` subset → `metadata.jsonl`

This lets users provide multiple logical subsets per split without needing a nested folder structure. It is especially useful for datasets that are sharded or organized by topic.

This grouping is enabled by default in all builders that inherit from `FolderBasedBuilder`.
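
For illustration, a minimal loading sketch under the layout above (the repository name is hypothetical, and the exact split keys follow the `{split}_{subset}` naming rule introduced in the builder change below):

```python
from datasets import load_dataset

# Hypothetical repository using the flat layout shown above. A subset whose
# name differs from its split is exposed as "<split>_<subset>".
ds = load_dataset("username/my_dataset_repository")
print(list(ds))  # e.g. ['train', 'train_animals', 'train_metadata']
```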
23 changes: 23 additions & 0 deletions src/datasets/data_files.py
@@ -19,7 +19,30 @@
from .utils import tqdm as hf_tqdm
from .utils.file_utils import _prepare_path_and_storage_options, is_local_path, is_relative_path, xbasename, xjoin
from .utils.py_utils import string_to_dict
import re
from collections import defaultdict
from pathlib import Path

def group_files_by_subset(filepaths: list[str]) -> dict[str, list[str]]:
    """
    Group files into subsets according to a heuristic:
    - Files whose names differ only by trailing digits, separators, or shard suffixes are grouped together.
    - All other files are placed in their own groups.
    """

    def normalize(filename: str) -> str:
        # Reduce a file name to its subset key
        name = Path(filename).stem
        # Strip standard sharding suffixes like -00000-of-00003
        name = re.sub(r"(-\d{5,}-of-\d{5,})$", "", name)
        # Strip trailing digits, underscores, or hyphens (e.g. train0, train_1, train-2)
        name = re.sub(r"[\d_-]+$", "", name)
        return name

    groups = defaultdict(list)
    for path in filepaths:
        key = normalize(path)
        groups[key].append(path)
    return dict(groups)

SingleOriginMetadata = Union[tuple[str, str], tuple[str], tuple[()]]

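A quick sketch of the heuristic in action (the file names are hypothetical):

```python
from datasets.data_files import group_files_by_subset

files = [
    "train0.jsonl",
    "train_1.jsonl",
    "data-00000-of-00002.parquet",
    "animals.jsonl",
]
print(group_files_by_subset(files))
# {'train': ['train0.jsonl', 'train_1.jsonl'],
#  'data': ['data-00000-of-00002.parquet'],
#  'animals': ['animals.jsonl']}
```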
src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py
@@ -15,7 +15,7 @@
from datasets import config
from datasets.features.features import FeatureType, _visit, _visit_with_path, _VisitPath, require_storage_cast
from datasets.utils.file_utils import readline

from datasets.data_files import group_files_by_subset

logger = datasets.utils.logging.get_logger(__name__)

@@ -120,51 +120,57 @@ def analyze(files_or_archives, downloaded_files_or_dirs, split):
data_files = self.config.data_files
splits = []
for split_name, files in data_files.items():
    if isinstance(files, str):
        files = [files]
    files, archives = self._split_files_and_archives(files)
    downloaded_files = dl_manager.download(files)
    downloaded_dirs = dl_manager.download_and_extract(archives)
    if do_analyze:  # drop_metadata is None or False, drop_labels is None or False
        logger.info(f"Searching for labels and/or metadata files in {split_name} data files...")
        analyze(files, downloaded_files, split_name)
        analyze(archives, downloaded_dirs, split_name)

        if metadata_files:
            # add metadata if `metadata_files` are found and `drop_metadata` is None (default) or False
            add_metadata = not self.config.drop_metadata
            # if `metadata_files` are found, don't add labels
            add_labels = False
        else:
            # if `metadata_files` are not found, don't add metadata
            add_metadata = False
            # if `metadata_files` are not found and `drop_labels` is None (default) -
            # add labels if files are on the same level in directory hierarchy and there is more than one label
            add_labels = (
                (len(labels) > 1 and len(path_depths) == 1)
                if self.config.drop_labels is None
                else not self.config.drop_labels
            )

        if add_labels:
            logger.info("Adding the labels inferred from data directories to the dataset's features...")
        if add_metadata:
            logger.info("Adding metadata to the dataset...")
    else:
        add_labels, add_metadata, metadata_files = False, False, {}

-   splits.append(
-       datasets.SplitGenerator(
-           name=split_name,
-           gen_kwargs={
-               "files": tuple(zip(files, downloaded_files))
-               + tuple((None, dl_manager.iter_files(downloaded_dir)) for downloaded_dir in downloaded_dirs),
-               "metadata_files": metadata_files.get(split_name, []),
-               "add_labels": add_labels,
-               "add_metadata": add_metadata,
-           },
-       )
-   )
+   grouped = group_files_by_subset(files)
+   for subset_name, grouped_files in grouped.items():
+       grouped_downloaded_files = [
+           downloaded_files[files.index(original)] for original in grouped_files if original in files
+       ]
+
+       split_id = (
+           f"{split_name}_{subset_name}" if subset_name != split_name else split_name
+       )
+
+       splits.append(
+           datasets.SplitGenerator(
+               name=split_id,
+               gen_kwargs={
+                   "files": tuple(zip(grouped_files, grouped_downloaded_files))
+                   + tuple((None, dl_manager.iter_files(downloaded_dir)) for downloaded_dir in downloaded_dirs),
+                   "metadata_files": metadata_files.get(split_name, []),
+                   "add_labels": add_labels,
+                   "add_metadata": add_metadata,
+               },
+           )
+       )

if add_metadata:
# Verify that:
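To make the naming rule above concrete, a small sketch (the inputs are hypothetical):

```python
# Mirrors the split_id rule in the builder change above: a subset whose name
# matches its split keeps the plain split name; any other subset is exposed
# as "<split>_<subset>".
split_name = "train"
grouped = {"train": ["train0.jsonl", "train1.jsonl"], "animals": ["animals.jsonl"]}

for subset_name in grouped:
    split_id = f"{split_name}_{subset_name}" if subset_name != split_name else split_name
    print(split_id)
# train
# train_animals
```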
18 changes: 18 additions & 0 deletions tests/test_data_files.py
@@ -21,6 +21,7 @@
resolve_pattern,
)
from datasets.fingerprint import Hasher
from datasets.data_files import group_files_by_subset


_TEST_PATTERNS = ["*", "**", "**/*", "*.txt", "data/*", "**/*.txt", "**/train.txt"]
@@ -509,6 +510,23 @@ def test_DataFilesPatternsDict(text_file):
assert isinstance(data_files_dict["train"], DataFilesList)


def test_group_files_by_subset():
files = [
"animals.jsonl",
"trees.jsonl",
"metadata.jsonl",
"train0.jsonl",
"train1.jsonl",
"train2.jsonl",
]
groups = group_files_by_subset(files)
assert "train" in groups
assert set(groups["train"]) == {"train0.jsonl", "train1.jsonl", "train2.jsonl"}
assert "animals" in groups
assert "trees" in groups
assert "metadata" in groups


def mock_fs(file_paths: List[str]):
"""
Set up a mock filesystem for fsspec containing the provided files
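To run just the new test locally, something like:

```bash
pytest tests/test_data_files.py::test_group_files_by_subset
```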