diff --git a/data_juicer/core/analyzer.py b/data_juicer/core/analyzer.py index ce1af2d847..a96e109201 100644 --- a/data_juicer/core/analyzer.py +++ b/data_juicer/core/analyzer.py @@ -39,9 +39,12 @@ def __init__(self, cfg=None): # setup formatter logger.info('Setting up data formatter...') - self.formatter = load_formatter(self.cfg.dataset_path, - self.cfg.text_keys, self.cfg.suffixes, - self.cfg.add_suffix) + self.formatter = load_formatter( + dataset_path=self.cfg.dataset_path, + generated_dataset_config=self.cfg.generated_dataset_config, + text_keys=self.cfg.text_keys, + suffixes=self.cfg.suffixes, + add_suffix=self.cfg.add_suffix) # prepare exporter and check export path suffix # NOTICE: no need to export dataset texts for analyzer diff --git a/data_juicer/core/executor.py b/data_juicer/core/executor.py index 87e38dbce9..30206518c1 100644 --- a/data_juicer/core/executor.py +++ b/data_juicer/core/executor.py @@ -48,10 +48,12 @@ def __init__(self, cfg=None): # setup formatter logger.info('Setting up data formatter...') - self.formatter = load_formatter(self.cfg.dataset_path, - self.cfg.generated_dataset_config, - self.cfg.text_keys, self.cfg.suffixes, - self.cfg.add_suffix) + self.formatter = load_formatter( + dataset_path=self.cfg.dataset_path, + generated_dataset_config=self.cfg.generated_dataset_config, + text_keys=self.cfg.text_keys, + suffixes=self.cfg.suffixes, + add_suffix=self.cfg.add_suffix) # whether to use checkpoint mechanism. If it's true, Executor will # check if there are existing checkpoints first and try to load the