[WIP] refactor of dataset builder and executor #537

Open · wants to merge 57 commits into base: main

Commits (57)
d11f89c
ignore __dj__produced_data__
cyruszhang Nov 15, 2024
41dea26
add download framework; add wiki support
cyruszhang Nov 19, 2024
50f8d3d
refactor formatter; add dataset_builder
cyruszhang Nov 22, 2024
817caab
merge with master
cyruszhang Nov 25, 2024
a089de4
add config files and test entry
cyruszhang Nov 26, 2024
5a717d7
initial dataset_builder
cyruszhang Dec 2, 2024
9c79844
Merge branch 'main' into feat/cyruszhang/data-downloader
cyruszhang Dec 2, 2024
ffba7e7
add mixture dataset support; type/subtype
cyruszhang Dec 4, 2024
79ae980
RayExecutor with ExecutorBase
cyruszhang Dec 4, 2024
e6a6e71
get rid of subtype for local dataset; depending on ext for proper rou…
cyruszhang Dec 4, 2024
eb300f0
use source instead of sub_type for remote dataset configs
cyruszhang Dec 4, 2024
456eea1
arxiv downloader return Dataset instead of DJDataset
cyruszhang Dec 4, 2024
c25e40f
rewrite CLI datapath with test cases
cyruszhang Dec 5, 2024
75ffe3f
add executor and dataload strategy logic
cyruszhang Dec 6, 2024
4ec1ef9
Merge branch 'main' into feat/cyruszhang/data-downloader
cyruszhang Dec 6, 2024
4fb6e17
add layered load strategies
cyruszhang Dec 6, 2024
84803cd
Merge branch 'main' into feat/cyruszhang/data-downloader
cyruszhang Dec 9, 2024
cb5b80a
fix circular dependency; add dataset config test
cyruszhang Dec 10, 2024
daf7a85
update dataset_path parsing in config
cyruszhang Dec 10, 2024
7c48892
fix download test case; add wildcard matching for load strategy
cyruszhang Dec 11, 2024
940b44d
add test case for load strategy wild card matching
cyruszhang Dec 11, 2024
b80f991
add more test cases for datapath rewrite logic; fix rewrite to handle…
cyruszhang Dec 11, 2024
0d5d4ba
materialize symlinks for duplicates
cyruszhang Dec 11, 2024
f3a4ec4
add load strategy validation framework
cyruszhang Dec 12, 2024
70fffd2
add DataValidator logic
cyruszhang Dec 16, 2024
bbc303d
data validator as separate pre-processing
cyruszhang Dec 16, 2024
4b6065f
update data validator logic and add/fix test cases
cyruszhang Dec 25, 2024
0b153ab
[nit] rename test
cyruszhang Jan 2, 2025
171b361
[nit] rename test again
cyruszhang Jan 2, 2025
6841d19
add builder test cases; update ds config validation logic
cyruszhang Jan 2, 2025
3128d05
[minor] update test case naming
cyruszhang Jan 2, 2025
7b6b2bd
add support for max_sample_num in dataset configs; add tests
cyruszhang Jan 6, 2025
161f059
fix test cases and update dataset builder code
cyruszhang Jan 6, 2025
8cb322f
merge main
cyruszhang Jan 6, 2025
afe906d
handle weights and sample_nums
cyruszhang Jan 8, 2025
1217e61
support ExecutorType enum
cyruszhang Jan 9, 2025
755abca
Merge branch 'main' into feat/cyruszhang/data-downloader
cyruszhang Jan 9, 2025
5dd17fe
flip on DatasetBuilder; replace formatter
cyruszhang Jan 9, 2025
eb3b123
minor fix
cyruszhang Jan 9, 2025
7c171fb
add ExecutorBase to RayExecutor
cyruszhang Jan 9, 2025
195aff8
Merge branch 'main' into feat/cyruszhang/data-downloader
cyruszhang Jan 21, 2025
dd95df0
fix bugs; use str for executor_type
cyruszhang Jan 23, 2025
530efa8
add add_same_content_to_new_column reference
cyruszhang Jan 23, 2025
3b726bd
ray data defaults to json
cyruszhang Jan 24, 2025
cac8e5e
fix dataset_path bug; add ray config test
cyruszhang Jan 24, 2025
a99c9b5
tests video on ray config
cyruszhang Jan 24, 2025
3c9caf5
add default cfg logic; fix data_mixture demo
cyruszhang Jan 24, 2025
b9f6a99
default executor + local data; fix analyzer bug
cyruszhang Jan 27, 2025
e05f146
Merge branch 'main' into feat/cyruszhang/data-downloader
cyruszhang Jan 27, 2025
acccc01
pass through num_proc param for ray executor when loading dataset
cyruszhang Jan 27, 2025
1823cd6
fix bugs for huggingface dataset loading; add sample config
cyruszhang Jan 27, 2025
2963118
fix typo in configs
cyruszhang Jan 29, 2025
4472aef
remove absolute path logic; remove dup test files
cyruszhang Feb 7, 2025
7964867
update .gitignore for dup files in tests
cyruszhang Feb 7, 2025
96207ba
fix RayDataset schema validation issue
cyruszhang Feb 7, 2025
9b1d738
fix wiki downloader tests
cyruszhang Feb 7, 2025
828e7ba
remove mixture formatter; logic captured in dataloader
cyruszhang Feb 7, 2025
5 changes: 5 additions & 0 deletions .gitignore
@@ -15,3 +15,8 @@ dist
 wandb/
 __pycache__
 .vscode/
+**/__dj__produced_data__/*
+venv/
+
+# dup files created by tests
+tests/ops/data/*dup*
6 changes: 6 additions & 0 deletions configs/datasets/local_json.yaml
@@ -0,0 +1,6 @@
# global parameters
project_name: 'dataset-local-json'
dataset:
  configs:
    - type: 'local'
      path: 'path/to/json/file'
6 changes: 6 additions & 0 deletions configs/datasets/local_parquet.yaml
@@ -0,0 +1,6 @@
# global parameters
project_name: 'dataset-local-parquet'
dataset:
  configs:
    - type: 'local'
      path: 'path/to/parquet/file'
10 changes: 10 additions & 0 deletions configs/datasets/mixture.yaml
@@ -0,0 +1,10 @@
project_name: 'dataset-mixture'
dataset:
  max_sample_num: 10000
  configs:
    - type: 'local'
      weight: 1.0
      path: 'path/to/json/file'
    - type: 'local'
      weight: 1.0
      path: 'path/to/csv/file'
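The weight semantics above can be read as proportional shares of the `max_sample_num` budget. A minimal sketch of that interpretation, assuming proportional allocation (`allocate_samples` is a hypothetical helper for illustration, not part of the PR):

```python
def allocate_samples(weights, max_sample_num):
    """Split a global sample budget across sources in proportion to weight."""
    total = sum(weights)
    # Each source receives its weight's share of the budget, truncated to int.
    return [int(max_sample_num * w / total) for w in weights]

# Two equally weighted sources, as in mixture.yaml, split the budget evenly.
counts = allocate_samples([1.0, 1.0], max_sample_num=10000)
print(counts)  # [5000, 5000]
```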
10 changes: 10 additions & 0 deletions configs/datasets/remote_arxiv.yaml
@@ -0,0 +1,10 @@
# global parameters
project_name: 'dataset-remote-arxiv'
dataset:
  configs:
    - type: 'remote'
      source: 'arxiv'
      lang: 'en'
      dump_date: 'latest'
      force_download: false
      url_limit: 2
11 changes: 11 additions & 0 deletions configs/datasets/remote_commoncrawl.yaml
@@ -0,0 +1,11 @@
# global parameters
project_name: 'dataset-remote-commoncrawl'
dataset:
  configs:
    - type: 'remote'
      source: 'commoncrawl'
      start_snapshot: '2020-50'
      end_snapshot: '2021-04'
      aws: true
      force_download: false
      url_limit: 2
10 changes: 10 additions & 0 deletions configs/datasets/remote_huggingface.yaml
@@ -0,0 +1,10 @@
# global parameters
project_name: 'dataset-remote-huggingface'
dataset:
  configs:
    - type: 'remote'
      source: 'huggingface'
      path: "HuggingFaceFW/fineweb"
      name: "CC-MAIN-2024-10"
      split: "train"
      limit: 1000
10 changes: 10 additions & 0 deletions configs/datasets/remote_modelscope.yaml
@@ -0,0 +1,10 @@
# global parameters
project_name: 'dataset-remote-modelscope'
dataset:
  configs:
    - type: 'remote'
      source: 'modelscope'
      path: 'modelscope/clue'
      subset_name: 'afqmc'
      split: 'train'
      limit: 1000
10 changes: 10 additions & 0 deletions configs/datasets/remote_wiki.yaml
@@ -0,0 +1,10 @@
# global parameters
project_name: 'dataset-remote-wiki'
dataset:
  configs:
    - type: 'remote'
      source: 'wiki'
      lang: 'en'
      dump_date: 'latest'
      force_download: false
      url_limit: 2
18 changes: 18 additions & 0 deletions configs/datasets/validation.yaml
@@ -0,0 +1,18 @@
dataset:
  configs:
    - type: local
      path: path/to/data.json

validators:
  - type: conversation
    min_turns: 2
    max_turns: 20
  - type: required_fields
    required_fields:
      - "text"
      - "metadata"
      - "language"
    field_types:
      text: "str"
      metadata: "dict"
      language: "str"
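A minimal sketch of what the `required_fields` validator configured above amounts to per record (`check_sample` is a hypothetical stand-in for illustration, not the PR's DataValidator classes):

```python
def check_sample(sample, required_fields, field_types):
    """Reject a record that misses required fields or has wrong field types."""
    type_map = {'str': str, 'dict': dict, 'list': list, 'int': int}
    missing = [f for f in required_fields if f not in sample]
    if missing:
        raise ValueError(f'missing required fields: {missing}')
    for field, type_name in field_types.items():
        if field in sample and not isinstance(sample[field], type_map[type_name]):
            raise ValueError(f"field '{field}' must be of type {type_name}")

# A record matching validation.yaml passes silently.
check_sample({'text': 'hello', 'metadata': {'src': 'wiki'}, 'language': 'en'},
             required_fields=['text', 'metadata', 'language'],
             field_types={'text': 'str', 'metadata': 'dict', 'language': 'str'})
```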
22 changes: 22 additions & 0 deletions configs/demo/process-huggingface.yaml
@@ -0,0 +1,22 @@
# Process config example for dataset

# global parameters
project_name: 'demo-process'
dataset:
  configs:
    - type: 'remote'
      source: 'huggingface'
      path: 'hugfaceguy0001/retarded_bar'
      name: 'question'
      split: 'train'

np: 4  # number of subprocesses to process your dataset

export_path: './outputs/demo-process/demo-processed.jsonl'

# process schedule
# a list of several process operators with their arguments
process:
  - language_id_score_filter:
      lang: 'zh'
      min_score: 0.8
6 changes: 3 additions & 3 deletions data_juicer/config/__init__.py
@@ -1,7 +1,7 @@
-from .config import (export_config, get_init_configs, init_configs,
-                     merge_config, prepare_side_configs)
+from .config import (export_config, get_default_cfg, get_init_configs,
+                     init_configs, merge_config, prepare_side_configs)

 __all__ = [
     'init_configs', 'get_init_configs', 'export_config', 'merge_config',
-    'prepare_side_configs'
+    'prepare_side_configs', 'get_default_cfg'
 ]
55 changes: 49 additions & 6 deletions data_juicer/config/config.py
@@ -10,7 +10,7 @@
 import yaml
 from jsonargparse import (ActionConfigFile, ArgumentParser, Namespace,
                           dict_to_namespace, namespace_to_dict)
-from jsonargparse.typehints import ActionTypeHint
+from jsonargparse._typehints import ActionTypeHint
 from jsonargparse.typing import ClosedUnitInterval, NonNegativeInt, PositiveInt
 from loguru import logger

@@ -102,6 +102,13 @@ def init_configs(args: Optional[List[str]] = None, which_entry: object = None):
         help='Path to datasets with optional weights(0.0-1.0), 1.0 as '
         'default. Accepted format:<w1> dataset1-path <w2> dataset2-path '
         '<w3> dataset3-path ...')
+    parser.add_argument(
+        '--dataset',
+        type=Union[List[Dict], Dict],
+        default=[],
+        help='Dataset setting to define local/remote datasets; could be a '
+        'dict or a list of dicts; refer to configs/datasets for more '
+        'detailed examples')
     parser.add_argument(
         '--generated_dataset_config',
         type=Dict,
@@ -452,19 +459,22 @@ def init_setup_from_cfg(cfg: Namespace):

     # check and get dataset dir
     if cfg.get('dataset_path', None) and os.path.exists(cfg.dataset_path):
+        logger.info('dataset_path config is set and a valid local path')
         cfg.dataset_path = os.path.abspath(cfg.dataset_path)
         if os.path.isdir(cfg.dataset_path):
             cfg.dataset_dir = cfg.dataset_path
         else:
             cfg.dataset_dir = os.path.dirname(cfg.dataset_path)
-    elif cfg.dataset_path == '':
-        logger.warning('dataset_path is empty by default.')
+    elif cfg.dataset_path == '' and cfg.get('dataset', None):
+        logger.info('dataset_path config is empty; dataset is present')
+        cfg.dataset_dir = ''
     else:
         logger.warning(f'dataset_path [{cfg.dataset_path}] is not a valid '
-                       f'local path. Please check and retry, otherwise we '
-                       f'will treat it as a remote dataset or a mixture of '
-                       f'several datasets.')
+                       f'local path, AND dataset is not present. '
+                       f'Please check and retry, otherwise we '
+                       f'will treat dataset_path as a remote dataset or a '
+                       f'mixture of several datasets.')

         cfg.dataset_dir = ''

     # check number of processes np
@@ -910,3 +920,36 @@ def get_init_configs(cfg: Union[Namespace, Dict]):
         json.dump(cfg, f)
     inited_dj_cfg = init_configs(['--config', temp_file])
     return inited_dj_cfg
+
+
+def get_default_cfg():
+    """Get default config values from config_all.yaml"""
+    cfg = Namespace()
+
+    # Get path to config_all.yaml
+    config_dir = os.path.dirname(os.path.abspath(__file__))
+    default_config_path = os.path.join(config_dir,
+                                       '../../configs/config_all.yaml')
+
+    # Load default values from yaml
+    with open(default_config_path, 'r', encoding='utf-8') as f:
+        defaults = yaml.safe_load(f)
+
+    # Convert to flat dictionary for namespace
+    flat_defaults = {
+        'executor_type': 'default',
+        'ray_address': 'auto',
+        'suffixes': None,
+        'text_keys': 'text',
+        'add_suffix': False,
+        'export_path': './outputs',
+        # Add other top-level keys from config_all.yaml
+        **defaults
+    }
+
+    # Update cfg with defaults
+    for key, value in flat_defaults.items():
+        if not hasattr(cfg, key):
+            setattr(cfg, key, value)
+
+    return cfg
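One subtlety in `get_default_cfg`: because `**defaults` is unpacked last in the `flat_defaults` literal, any key that also appears in `config_all.yaml` overrides the hand-written fallbacks, which only take effect for keys the YAML omits. Python's dict-literal merge order makes this concrete (a standalone illustration, not the PR's code):

```python
# Later entries win in a dict literal, so YAML-loaded defaults override the
# hand-written fallbacks for any key present in both.
fallbacks = {'executor_type': 'default', 'ray_address': 'auto'}
yaml_defaults = {'executor_type': 'ray'}  # pretend this came from config_all.yaml
merged = {**fallbacks, **yaml_defaults}
print(merged['executor_type'])  # 'ray': the YAML value wins
print(merged['ray_address'])    # 'auto': fallback survives for omitted keys
```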
6 changes: 5 additions & 1 deletion data_juicer/core/__init__.py
@@ -1,7 +1,8 @@
 from .adapter import Adapter
 from .analyzer import Analyzer
 from .data import NestedDataset
-from .executor import Executor
+from .executor import Executor, ExecutorFactory, RayExecutor
+from .executor.base import ExecutorBase
 from .exporter import Exporter
 from .monitor import Monitor
 from .tracer import Tracer
@@ -10,7 +11,10 @@
     'Adapter',
     'Analyzer',
     'NestedDataset',
+    'ExecutorFactory',
     'Executor',
+    'RayExecutor',
+    'ExecutorBase',
     'Exporter',
     'Monitor',
     'Tracer',
15 changes: 5 additions & 10 deletions data_juicer/core/analyzer.py
@@ -8,7 +8,7 @@

 from data_juicer.analysis import ColumnWiseAnalysis, OverallAnalysis
 from data_juicer.config import init_configs
-from data_juicer.format import load_formatter
+from data_juicer.core.data.dataset_builder import DatasetBuilder
 from data_juicer.ops import NON_STATS_FILTERS, TAGGING_OPS, Filter, load_ops
 from data_juicer.ops.op_fusion import fuse_operators
 from data_juicer.utils import cache_utils
@@ -44,14 +44,9 @@ def __init__(self, cfg: Optional[Namespace] = None):
                         f'[{self.cfg.cache_compress}]')
             cache_utils.CACHE_COMPRESS = self.cfg.cache_compress

-        # setup formatter
-        logger.info('Setting up data formatter...')
-        self.formatter = load_formatter(
-            dataset_path=self.cfg.dataset_path,
-            generated_dataset_config=self.cfg.generated_dataset_config,
-            text_keys=self.cfg.text_keys,
-            suffixes=self.cfg.suffixes,
-            add_suffix=self.cfg.add_suffix)
+        # setup dataset builder
+        logger.info('Setting up dataset builder...')
+        self.dataset_builder = DatasetBuilder(cfg, executor_type='default')

         # prepare exporter and check export path suffix
         # NOTICE: no need to export dataset texts for analyzer
@@ -91,7 +86,7 @@ def run(self,
             load_data_np = self.cfg.np
         if dataset is None:
             logger.info('Loading dataset from data formatter...')
-            dataset = self.formatter.load_dataset(load_data_np, self.cfg)
+            dataset = self.dataset_builder.load_dataset(num_proc=load_data_np)
         else:
             logger.info(f'Using existing dataset {dataset}')
         if self.cfg.auto:
9 changes: 9 additions & 0 deletions data_juicer/core/data/__init__.py
@@ -0,0 +1,9 @@
from .dj_dataset import (DJDataset, NestedDataset,
                         add_same_content_to_new_column,
                         wrap_func_with_nested_access)
from .ray_dataset import RayDataset

__all__ = [
    'DJDataset', 'NestedDataset', 'RayDataset', 'wrap_func_with_nested_access',
    'add_same_content_to_new_column'
]
61 changes: 61 additions & 0 deletions data_juicer/core/data/config_validator.py
@@ -0,0 +1,61 @@
from typing import Dict


class ConfigValidationError(Exception):
    """Custom exception for validation errors"""
    pass


class ConfigValidator:
    """Mixin class for configuration validation"""

    # Define validation rules for each strategy type
    CONFIG_VALIDATION_RULES = {
        'required_fields': [],  # Fields that must be present
        'optional_fields': [],  # Fields that are optional
        'field_types': {},  # Expected types for fields
        'custom_validators': {}  # Custom validation functions
    }

    def validate_config(self, ds_config: Dict) -> None:
        """
        Validate the configuration dictionary.

        Args:
            ds_config: Configuration dictionary to validate

        Raises:
            ConfigValidationError: If validation fails
        """
        # Check required fields
        missing_fields = [
            field for field in self.CONFIG_VALIDATION_RULES['required_fields']
            if field not in ds_config
        ]
        if missing_fields:
            raise ConfigValidationError(
                f"Missing required fields: {', '.join(missing_fields)}")

        # Optional fields need no special checks

        # Check field types
        for field, expected_type in self.CONFIG_VALIDATION_RULES[
                'field_types'].items():
            if field in ds_config:
                value = ds_config[field]
                if not isinstance(value, expected_type):
                    raise ConfigValidationError(
                        f"Field '{field}' must be of "
                        f"type '{expected_type.__name__}', "
                        f"got '{type(value).__name__}'")

        # Run custom validators
        for field, validator in self.CONFIG_VALIDATION_RULES[
                'custom_validators'].items():
            if field in ds_config:
                try:
                    validator(ds_config[field])
                except Exception as e:
                    raise ConfigValidationError(
                        f"Validation failed for field '{field}': {str(e)}")
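A usage sketch for the mixin: a subclass narrows CONFIG_VALIDATION_RULES and inherits validate_config unchanged. The ConfigValidator body here is a trimmed standalone copy so the example runs on its own; LocalJsonConfig and must_be_local are invented for illustration and are not names from the PR:

```python
from typing import Dict


class ConfigValidationError(Exception):
    pass


class ConfigValidator:
    # Trimmed standalone copy of the mixin, kept runnable for this demo.
    CONFIG_VALIDATION_RULES = {
        'required_fields': [],
        'field_types': {},
        'custom_validators': {},
    }

    def validate_config(self, ds_config: Dict) -> None:
        rules = self.CONFIG_VALIDATION_RULES
        missing = [f for f in rules['required_fields'] if f not in ds_config]
        if missing:
            raise ConfigValidationError(f'Missing required fields: {missing}')
        for field, expected in rules['field_types'].items():
            if field in ds_config and not isinstance(ds_config[field], expected):
                raise ConfigValidationError(
                    f"Field '{field}' must be of type '{expected.__name__}'")
        for field, validator in rules['custom_validators'].items():
            if field in ds_config:
                validator(ds_config[field])


def must_be_local(value):
    # Custom validator: this hypothetical strategy only accepts type 'local'.
    if value != 'local':
        raise ValueError("expected type 'local'")


class LocalJsonConfig(ConfigValidator):
    # Hypothetical strategy subclass: overriding the rules is the whole API.
    CONFIG_VALIDATION_RULES = {
        'required_fields': ['type', 'path'],
        'field_types': {'type': str, 'path': str},
        'custom_validators': {'type': must_be_local},
    }


LocalJsonConfig().validate_config({'type': 'local', 'path': 'path/to/json/file'})
```

Passing a config without 'path' raises ConfigValidationError, which is how a load strategy would surface a bad dataset entry early.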