114114]
115115
116116
def fit_bf16_to_uint16_np(tensor):
    """Convert a numpy bfloat16 array to uint16 when running on an XPU device.

    Paddle represents bfloat16 storage as uint16 in several code paths, and
    ``paddle.Tensor.__call__(..., zero_copy=True)`` on XPU apparently cannot
    ingest a numpy array whose dtype string is ``"bfloat16"`` directly
    (TODO confirm against the XPU runtime).  Non-XPU devices, non-ndarray
    inputs, and all other dtypes pass through untouched.

    NOTE(review): ``astype("uint16")`` performs a *value* conversion, not a
    bitcast/view — confirm this matches paddle's uint16-as-bfloat16 storage
    convention before relying on it for arbitrary values.

    Args:
        tensor: typically a ``np.ndarray`` weight slice, but any object is
            accepted and returned unchanged when the conversion predicate
            does not hold.

    Returns:
        The converted ``np.ndarray`` (dtype uint16) when on XPU and the
        input is a bfloat16 ndarray; otherwise the input object unchanged.
    """
    # Cheap guard first: device query decides whether conversion is needed at all.
    if "xpu" not in paddle.device.get_device():
        return tensor
    needs_cast = isinstance(tensor, np.ndarray) and str(tensor.dtype) == "bfloat16"
    return tensor.astype("uint16") if needs_cast else tensor
121+
122+
117123def dy2st_nocheck_guard_context ():
118124 try :
119125 context = paddle .framework ._no_check_dy2st_diff ()
@@ -433,7 +439,7 @@ def _transpose_hf_weight(key, weight):
433439 and not key .endswith ("_scale" )
434440 ):
435441 # numpy.array -> paddle.tensor
436- weight = paddle .Tensor .__call__ (py_safe_slice_ [:], zero_copy = True )
442+ weight = paddle .Tensor .__call__ (fit_bf16_to_uint16_np ( py_safe_slice_ [:]) , zero_copy = True )
437443 weight = _transpose_hf_weight (key , weight )
438444 key_name = key .split (".weight" )[0 ]
439445 quant_key_name = key_name + ".quant_weight"
@@ -478,7 +484,7 @@ def _transpose_hf_weight(key, weight):
478484 weight = py_safe_slice_ [:]
479485 if not return_numpy and device == "expected" :
480486 with device_guard ():
481- weight = paddle .Tensor .__call__ (weight , zero_copy = True )
487+ weight = paddle .Tensor .__call__ (fit_bf16_to_uint16_np ( weight ) , zero_copy = True )
482488 weight = weight ._copy_to (paddle .framework ._current_expected_place (), False )
483489 weight = _transpose_hf_weight (key , weight )
484490 part_state_dict [key ] = weight
@@ -492,7 +498,7 @@ def _transpose_hf_weight(key, weight):
492498 scale = f .get_tensor (key )
493499 if not return_numpy and device == "expected" :
494500 with device_guard ():
495- scale = paddle .Tensor .__call__ (scale , zero_copy = True )
501+ scale = paddle .Tensor .__call__ (fit_bf16_to_uint16_np ( scale ) , zero_copy = True )
496502 scale = scale ._copy_to (paddle .framework ._current_expected_place (), False )
497503 scale_dict [key ] = scale
498504 return part_state_dict , scale_dict
@@ -583,10 +589,14 @@ def load_state_dict(
583589 if device == "cpu" :
584590 with device_guard ():
585591 for k in list (state_dict .keys ()):
586- state_dict [k ] = paddle .Tensor .__call__ (state_dict .pop (k ), zero_copy = True )
592+ state_dict [k ] = paddle .Tensor .__call__ (
593+ fit_bf16_to_uint16_np (state_dict .pop (k )), zero_copy = True
594+ )
587595 elif device == "pin_memory" :
588596 for k in list (state_dict .keys ()):
589- state_dict [k ] = paddle .to_tensor (state_dict .pop (k ), place = paddle .CUDAPinnedPlace ())
597+ state_dict [k ] = paddle .to_tensor (
598+ fit_bf16_to_uint16_np (state_dict .pop (k )), place = paddle .CUDAPinnedPlace ()
599+ )
590600
591601 if len (scale_dict ) != 0 :
592602 if ckpt_quant_stage == "O0" :
@@ -2784,7 +2794,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
27842794 for k in list (state_dict .keys ()):
27852795 if not isinstance (state_dict [k ], paddle .Tensor ):
27862796 with device_guard ():
2787- state_dict [k ] = paddle .Tensor .__call__ (state_dict .pop (k ), zero_copy = True )
2797+ state_dict [k ] = paddle .Tensor .__call__ (
2798+ fit_bf16_to_uint16_np (state_dict .pop (k )), zero_copy = True
2799+ )
27882800 else :
27892801 if is_sharded :
27902802 loaded_state_dict_keys = sharded_metadata ["all_checkpoint_keys" ]
@@ -2799,7 +2811,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
27992811 for k in list (state_dict .keys ()):
28002812 if not isinstance (state_dict [k ], paddle .Tensor ):
28012813 with device_guard ():
2802- state_dict [k ] = paddle .Tensor .__call__ (state_dict .pop (k ), zero_copy = True )
2814+ state_dict [k ] = paddle .Tensor .__call__ (
2815+ fit_bf16_to_uint16_np (state_dict .pop (k )), zero_copy = True
2816+ )
28032817 # 3. init the model
28042818 init_args = config ["init_args" ] or ()
28052819 with ContextManagers (init_contexts ):
@@ -3380,7 +3394,9 @@ def load_sharded_checkpoint_as_one(folder, variant=None, return_numpy=False):
33803394 if not return_numpy :
33813395 for key in list (state_dict .keys ()):
33823396 if isinstance (state_dict [key ], np .ndarray ):
3383- state_dict [key ] = paddle .Tensor .__call__ (state_dict .pop (key ), zero_copy = True )
3397+ state_dict [key ] = paddle .Tensor .__call__ (
3398+ fit_bf16_to_uint16_np (state_dict .pop (key )), zero_copy = True
3399+ )
33843400 return state_dict
33853401
33863402 index_file = os .path .join (folder , _add_variant (PADDLE_WEIGHTS_INDEX_NAME , variant ))
@@ -3423,7 +3439,7 @@ def load_sharded_checkpoint_as_one(folder, variant=None, return_numpy=False):
34233439 if not return_numpy :
34243440 for key in list (ret .keys ()):
34253441 if isinstance (ret [key ], np .ndarray ):
3426- ret [key ] = paddle .Tensor .__call__ (ret .pop (key ), zero_copy = True )
3442+ ret [key ] = paddle .Tensor .__call__ (fit_bf16_to_uint16_np ( ret .pop (key ) ), zero_copy = True )
34273443
34283444 return ret
34293445
0 commit comments