Commit bece0d1

* fix skip_op_error & update_sampling_params
1 parent 684666e commit bece0d1

4 files changed: +10 -13 lines changed


data_juicer/ops/base_op.py

Lines changed: 1 addition & 1 deletion
@@ -108,7 +108,7 @@ def wrapper(sample, *args, **kwargs):
             else:
                 return [res]
         except Exception as e:
-            if skip_op_error:
+            if not skip_op_error:
                 raise
             from loguru import logger
             logger.error(f'An error occurred in {op_name} when processing '
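This is the skip_op_error fix from the commit message: the old guard re-raised exactly when skip_op_error was set, so errors could never actually be skipped. A minimal, self-contained sketch of the corrected control flow; the decorator name catch_op_error and the empty-list return for a dropped sample are illustrative assumptions, not data_juicer's exact wrapper:

    from loguru import logger

    def catch_op_error(op_name, skip_op_error=False):
        # Hypothetical decorator mirroring the fixed logic: re-raise when
        # skip_op_error is False, otherwise log and drop the sample.
        def decorator(fn):
            def wrapper(sample, *args, **kwargs):
                try:
                    res = fn(sample, *args, **kwargs)
                    return res if isinstance(res, list) else [res]
                except Exception as e:
                    if not skip_op_error:  # the fixed condition
                        raise
                    logger.error(f'An error occurred in {op_name} when '
                                 f'processing a sample: {e}. Skipping it.')
                    return []
            return wrapper
        return decorator

    @catch_op_error('demo_mapper', skip_op_error=True)
    def process(sample):
        return {'text': sample['text'].upper()}

    process({'text': 'ok'})   # -> [{'text': 'OK'}]
    process({'oops': 1})      # logs the KeyError and returns []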

data_juicer/ops/mapper/generate_qa_from_examples_mapper.py

Lines changed: 3 additions & 4 deletions
@@ -118,6 +118,9 @@ def __init__(self,
         model_params = model_params or {}
         sampling_params = sampling_params or {}

+        sampling_params = update_sampling_params(sampling_params, hf_model,
+                                                 self.enable_vllm)
+
         if enable_vllm:
             assert torch.cuda.device_count() >= 1, 'must be executed in CUDA'
             # cannot initialize vllm replicas on different GPUs
@@ -140,10 +143,6 @@ def __init__(self,
                 **model_params)
         self.sampling_params = sampling_params

-        self.sampling_params = update_sampling_params(sampling_params,
-                                                      hf_model,
-                                                      self.enable_vllm)
-
         self.seed_qa_samples = self._load_seed_qa_samples()
         if len(self.seed_qa_samples) == 0:
             raise ValueError('No QA data was parsed from the seed file!')
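This mapper and the two below receive the identical change: update_sampling_params now runs once on the plain dict, before the backend branch, and the later re-assignment of self.sampling_params is removed. The point of the move is sketched below, under the assumption that the vLLM branch (not shown in these hunks) converts the dict into a vllm.SamplingParams object that the old post-hoc update would have clobbered; the helper body here is a stand-in, not the real update_sampling_params:

    def init_sampling_params(sampling_params, hf_model, enable_vllm):
        def update_sampling_params(params, model, vllm_enabled):
            # stand-in for data_juicer's helper: normalize key names, etc.
            params = dict(params)
            key = 'max_tokens' if vllm_enabled else 'max_new_tokens'
            params.setdefault(key, 256)
            return params

        sampling_params = sampling_params or {}

        # fixed order: adjust the dict first ...
        sampling_params = update_sampling_params(sampling_params, hf_model,
                                                 enable_vllm)

        if enable_vllm:
            import vllm

            # ... so the object built from it is final; the old code updated
            # *after* this point and overwrote it with a plain dict.
            return vllm.SamplingParams(**sampling_params)
        return sampling_params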

data_juicer/ops/mapper/generate_qa_from_text_mapper.py

Lines changed: 3 additions & 4 deletions
@@ -85,6 +85,9 @@ def __init__(self,
         model_params = model_params or {}
         sampling_params = sampling_params or {}

+        sampling_params = update_sampling_params(sampling_params, hf_model,
+                                                 self.enable_vllm)
+
         if enable_vllm:
             assert torch.cuda.device_count() >= 1, 'must be executed in CUDA'
             # cannot initialize vllm replicas on different GPUs
@@ -107,10 +110,6 @@ def __init__(self,
                 **model_params)
         self.sampling_params = sampling_params

-        self.sampling_params = update_sampling_params(sampling_params,
-                                                      hf_model,
-                                                      self.enable_vllm)
-
     def parse_output(self, raw_output):
         logger.debug(raw_output)
         qa_list = []

data_juicer/ops/mapper/optimize_qa_mapper.py

Lines changed: 3 additions & 4 deletions
@@ -77,6 +77,9 @@ def __init__(self,
         model_params = model_params or {}
         sampling_params = sampling_params or {}

+        sampling_params = update_sampling_params(sampling_params, hf_model,
+                                                 self.enable_vllm)
+
         if enable_vllm:
             assert torch.cuda.device_count() >= 1, 'must be executed in CUDA'
             # cannot initialize vllm replicas on different GPUs
@@ -99,10 +102,6 @@ def __init__(self,
                 **model_params)
         self.sampling_params = sampling_params

-        self.sampling_params = update_sampling_params(sampling_params,
-                                                      hf_model,
-                                                      self.enable_vllm)
-
     def build_input(self, sample):
         qa_pair = self.qa_pair_template.format(sample[self.query_key],
                                                sample[self.response_key])
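For completeness, a caller's view of the fixed behavior; the constructor keywords follow the diff context, while the model choice and parameter values are purely illustrative:

    from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper

    # sampling_params is now normalized once, up front, for either backend
    op = OptimizeQAMapper(
        hf_model='Qwen/Qwen2.5-7B-Instruct',  # illustrative model choice
        enable_vllm=False,
        sampling_params={'temperature': 0.7},
    )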
