From 0575193d47f2e6ef5d8e83b8f9b5723a7dd73709 Mon Sep 17 00:00:00 2001
From: Yilun Huang <lielin.hyl@alibaba-inc.com>
Date: Tue, 14 Jan 2025 13:53:06 +0800
Subject: [PATCH] Fix save_ckpt bug: saving the dataset to disk fails when the
 number of samples in the result dataset is less than the number of workers
 (#536)

---
 data_juicer/utils/ckpt_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/data_juicer/utils/ckpt_utils.py b/data_juicer/utils/ckpt_utils.py
index d22762adb..78192b85f 100644
--- a/data_juicer/utils/ckpt_utils.py
+++ b/data_juicer/utils/ckpt_utils.py
@@ -121,7 +121,9 @@ def save_ckpt(self, ds):
 
         :param ds: input dataset to save
         """
-        ds.save_to_disk(self.ckpt_ds_dir, num_proc=self.num_proc)
+        left_sample_num = len(ds)
+        ds.save_to_disk(self.ckpt_ds_dir,
+                        num_proc=min(self.num_proc, left_sample_num))
 
         with open(self.ckpt_op_record, 'w') as fout:
             json.dump(self.op_record, fout)