From 0575193d47f2e6ef5d8e83b8f9b5723a7dd73709 Mon Sep 17 00:00:00 2001 From: Yilun Huang <lielin.hyl@alibaba-inc.com> Date: Tue, 14 Jan 2025 13:53:06 +0800 Subject: [PATCH] * fix save_ckpt bug: error if the number of samples in the result dataset is less than the number of workers when saving dataset to disk (#536) --- data_juicer/utils/ckpt_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data_juicer/utils/ckpt_utils.py b/data_juicer/utils/ckpt_utils.py index d22762adb..78192b85f 100644 --- a/data_juicer/utils/ckpt_utils.py +++ b/data_juicer/utils/ckpt_utils.py @@ -121,7 +121,9 @@ def save_ckpt(self, ds): :param ds: input dataset to save """ - ds.save_to_disk(self.ckpt_ds_dir, num_proc=self.num_proc) + left_sample_num = len(ds) + ds.save_to_disk(self.ckpt_ds_dir, + num_proc=min(self.num_proc, left_sample_num)) with open(self.ckpt_op_record, 'w') as fout: json.dump(self.op_record, fout)