From 674841d67334d9ba073d1fe9daa808d6a14a1fcd Mon Sep 17 00:00:00 2001
From: a31413510 <31413510@qq.com>
Date: Tue, 9 Dec 2025 08:36:28 +0000
Subject: [PATCH] remove bert && redundant code

---
 paddleformers/peft/__init__.py | 8 -
 paddleformers/peft/lokr/__init__.py | 19 -
 paddleformers/peft/lokr/lokr_config.py | 141 --
 paddleformers/peft/lokr/lokr_layers.py | 240 ---
 paddleformers/peft/lokr/lokr_model.py | 296 ----
 paddleformers/peft/prefix/__init__.py | 23 -
 paddleformers/peft/prefix/prefix_config.py | 102 --
 paddleformers/peft/prefix/prefix_model.py | 555 ------
 paddleformers/peft/prefix/utils.py | 52 -
 paddleformers/peft/reft/__init__.py | 24 -
 paddleformers/peft/reft/interventions.py | 148 --
 paddleformers/peft/reft/modeling_utils.py | 175 --
 paddleformers/peft/reft/predict.py | 132 --
 paddleformers/peft/reft/reft_config.py | 85 -
 paddleformers/peft/reft/reft_model.py | 365 ----
 paddleformers/peft/vera/__init__.py | 17 -
 paddleformers/peft/vera/vera_config.py | 131 --
 paddleformers/peft/vera/vera_layers.py | 149 --
 paddleformers/peft/vera/vera_model.py | 284 ---
 paddleformers/trainer/trainer_utils.py | 71 +-
 paddleformers/transformers/__init__.py | 27 -
 .../transformers/auto/configuration.py | 3 -
 paddleformers/transformers/auto/modeling.py | 2 -
 paddleformers/transformers/bert/__init__.py | 51 -
 .../transformers/bert/configuration.py | 407 -----
 paddleformers/transformers/bert/modeling.py | 1420 ---------------
 paddleformers/transformers/bert/modeling.pyi | 347 ----
 paddleformers/transformers/bert/tokenizer.py | 19 -
 .../transformers/bert/tokenizer_fast.py | 19 -
 paddleformers/transformers/export.py | 68 -
 .../feature_extraction_sequence_utils.py | 365 ----
 .../long_sequence_strategies/__init__.py | 42 -
 .../attention_strategies.py | 51 -
 .../embedding_strategies.py | 223 ---
 .../long_sequence_strategies.py | 68 -
 paddleformers/transformers/optimization.py | 235 +--
 paddleformers/transformers/qwen/modeling.py | 56 +-
 .../transformers/sentencepiece_model_pb2.py | 1534 -----------------
 .../transformers/transposed_linear.py | 59 -
 tests/mergekit/test_merge_model.py | 10 +-
 tests/peft/test_lokr.py | 264 ---
 tests/peft/test_lora.py | 19 +-
 tests/peft/test_lorapro.py | 17 +-
 tests/peft/test_mora.py | 15 +-
 tests/peft/test_mos_lora.py | 17 +-
 tests/peft/test_prefix.py | 108 --
 tests/peft/test_quant_lora.py | 4 +-
 tests/peft/test_reft.py | 247 ---
 tests/peft/test_vera.py | 205 ---
 tests/transformers/auto/test_configuration.py | 10 +-
 .../transformers/test_configuration_common.py | 6 +-
 .../transformers/test_configuration_utils.py | 32 +-
 tests/transformers/test_modeling_utils.py | 14 +-
 tests/transformers/test_shard_checkpoint.py | 32 +-
 tests/transformers/test_utils.py | 8 +-
 55 files changed, 224 insertions(+), 8797 deletions(-)
 delete mode 100644 paddleformers/peft/lokr/__init__.py
 delete mode 100644 paddleformers/peft/lokr/lokr_config.py
 delete mode 100644 paddleformers/peft/lokr/lokr_layers.py
 delete mode 100644 paddleformers/peft/lokr/lokr_model.py
 delete mode 100644 paddleformers/peft/prefix/__init__.py
 delete mode 100644 paddleformers/peft/prefix/prefix_config.py
 delete mode 100644 paddleformers/peft/prefix/prefix_model.py
 delete mode 100644 paddleformers/peft/prefix/utils.py
 delete mode 100644 paddleformers/peft/reft/__init__.py
 delete mode 100644 paddleformers/peft/reft/interventions.py
 delete mode 100644 paddleformers/peft/reft/modeling_utils.py
 delete mode 100644 paddleformers/peft/reft/predict.py
 delete mode 100644 paddleformers/peft/reft/reft_config.py
 delete mode 100644 paddleformers/peft/reft/reft_model.py
 delete mode 100644 paddleformers/peft/vera/__init__.py
 delete mode 100644 paddleformers/peft/vera/vera_config.py
 delete mode 100644 paddleformers/peft/vera/vera_layers.py
 delete mode 100644 paddleformers/peft/vera/vera_model.py
 delete mode 100644 paddleformers/transformers/bert/__init__.py
 delete mode 100644 paddleformers/transformers/bert/configuration.py
 delete mode 100644 paddleformers/transformers/bert/modeling.py
 delete mode 100644 paddleformers/transformers/bert/modeling.pyi
 delete mode 100644 paddleformers/transformers/bert/tokenizer.py
 delete mode 100644 paddleformers/transformers/bert/tokenizer_fast.py
 delete mode 100644 paddleformers/transformers/export.py
 delete mode 100644 paddleformers/transformers/feature_extraction_sequence_utils.py
 delete mode 100644 paddleformers/transformers/long_sequence_strategies/__init__.py
 delete mode 100755 paddleformers/transformers/long_sequence_strategies/attention_strategies.py
 delete mode 100755 paddleformers/transformers/long_sequence_strategies/embedding_strategies.py
 delete mode 100644 paddleformers/transformers/long_sequence_strategies/long_sequence_strategies.py
 delete mode 100644 paddleformers/transformers/sentencepiece_model_pb2.py
 delete mode 100644 paddleformers/transformers/transposed_linear.py
 delete mode 100644 tests/peft/test_lokr.py
 delete mode 100644 tests/peft/test_prefix.py
 delete mode 100644 tests/peft/test_reft.py
 delete mode 100644 tests/peft/test_vera.py

diff --git a/paddleformers/peft/__init__.py b/paddleformers/peft/__init__.py
index ea74f991773..775a8be4493 100644
--- a/paddleformers/peft/__init__.py
+++ b/paddleformers/peft/__init__.py
@@ -19,19 +8,11 @@
 from ..utils.lazy_import import _LazyModule

 import_structure = {
-    "lokr": ["LoKrConfig", "LoKrModel"],
     "lora": ["LoRAAutoConfig", "LoRAAutoModel", "LoRAConfig", "LoRAModel"],
-    "prefix": ["PrefixConfig", "PrefixModelForCausalLM"],
-    "reft": ["ReFTModel"],
-    "vera": ["VeRAConfig", "VeRAModel"],
 }

 if TYPE_CHECKING:
-    from .lokr import LoKrConfig, LoKrModel
     from .lora import LoRAAutoConfig, LoRAAutoModel, LoRAConfig, LoRAModel
-    from .prefix import PrefixConfig, PrefixModelForCausalLM
-    from .reft import ReFTModel
-    from .vera import VeRAConfig, VeRAModel
 else:
     sys.modules[__name__] = _LazyModule(
         __name__,
diff --git a/paddleformers/peft/lokr/__init__.py b/paddleformers/peft/lokr/__init__.py
deleted file mode 100644
index e1c868e065e..00000000000
--- a/paddleformers/peft/lokr/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -from .lokr_config import LoKrConfig -from .lokr_layers import LoKrLinear -from .lokr_model import LoKrModel - -__all__ = ["LoKrConfig", "LoKrModel", "LoKrLinear"] diff --git a/paddleformers/peft/lokr/lokr_config.py b/paddleformers/peft/lokr/lokr_config.py deleted file mode 100644 index 0dd4fd337f7..00000000000 --- a/paddleformers/peft/lokr/lokr_config.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -from dataclasses import asdict, dataclass, field -from typing import List, Optional, Union - -from ...utils.env import LOKR_CONFIG_NAME - - -@dataclass -class LoKrConfig: - """ - This is the configuration class to store the configuration of a [`LoKrModel`]. - Convention of LoKrModel: W1 can be named as scaling matrix, W2 can be named as adapter matrix. - Args: - target_modules (`Union[List[str],str]`): The names of the modules to apply Lora to. - trainable_modules (`List[str]`): The names of the modules to train when applying Lora. - lokr_alpha (`float`): The alpha parameter for Lora scaling. - merge_weights (`bool`): - Whether to merge the weights of the Lora layers with the base transformer model in `eval` mode. - """ - - base_model_name_or_path: Optional[str] = field( - default=None, metadata={"help": "The name of the base model to use."} - ) - target_modules: Optional[Union[List[str], str]] = field( - default=None, - metadata={ - "help": "List of module names or regex expression of the module names to replace with LoKr." - "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " - }, - ) - trainable_modules: Optional[List[str]] = field( - default=None, - metadata={ - "help": "List of module names or regex expression of the module names to train when applying with LoKr." 
- "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " - }, - ) - trainable_bias: Optional[str] = field( - default=None, metadata={"help": "Define trainable bias parameters for the Lora model."} - ) - lokr_dim: int = field(default=8, metadata={"help": "Lora dimension in LoKr dimension, for adapter matrix"}) - factor: int = field(default=-1, metadata={"help": "Determine the decomposition size of LoKr matrices"}) - decompose_both: bool = field( - default=False, - metadata={"help": "Determine whether to decomposed both Scaling Matrix and adapter matrix together"}, - ) - lokr_alpha: float = field( - default=0.0, metadata={"help": "Determine the scaling of adapter weight, follow lokr convention"} - ) - merge_weight: bool = field( - default=False, metadata={"help": "Merge weights of the original model and the Lokr model"} - ) - tensor_parallel_degree: int = field(default=-1, metadata={"help": "-1 for not use tensor parallel"}) - dtype: Optional[str] = field(default=None, metadata={"help": "The data type of tensor"}) - - @property - def __dict__(self): - return asdict(self) - - def to_dict(self): - return self.__dict__ - - @property - def scaling(self): - if not (self.lokr_alpha or self.lokr_dim): - return 1.0 - return self.lokr_alpha / self.lokr_dim - - def save_pretrained(self, save_directory): - r""" - This method saves the configuration of your adapter model in a directory. - Args: - save_directory (`str`): - The directory where the configuration will be saved. - """ - if os.path.isfile(save_directory): - raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") - - os.makedirs(save_directory, exist_ok=True) - - output_dict = self.__dict__ - output_dict["scaling"] = self.scaling - output_path = os.path.join(save_directory, LOKR_CONFIG_NAME) - - # save it - with open(output_path, "w") as writer: - writer.write(json.dumps(output_dict, indent=2, sort_keys=True)) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" - This method loads the configuration of your adapter model from a directory. - Args: - pretrained_model_name_or_path (`str`): - The directory or the hub-id where the configuration is saved. - **kwargs: - Additional keyword arguments passed along to the child class initialization. - """ - if os.path.isfile(os.path.join(pretrained_model_name_or_path, LOKR_CONFIG_NAME)): - config_file = os.path.join(pretrained_model_name_or_path, LOKR_CONFIG_NAME) - else: - raise ValueError(f"Can't find lokr_config.json at '{pretrained_model_name_or_path}'") - - loaded_attributes = cls.from_json_file(config_file) - loaded_attributes.pop("scaling", None) - - config = cls(**kwargs) - - for key, value in loaded_attributes.items(): - if hasattr(config, key): - setattr(config, key, value) - - return config - - @classmethod - def from_json_file(cls, path_json_file): - r""" - Loads a configuration file from a json file. - Args: - path_json_file (`str`): - The path to the json file. - """ - with open(path_json_file, "r") as file: - json_object = json.load(file) - - return json_object diff --git a/paddleformers/peft/lokr/lokr_layers.py b/paddleformers/peft/lokr/lokr_layers.py deleted file mode 100644 index 3064ea8ffd8..00000000000 --- a/paddleformers/peft/lokr/lokr_layers.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from typing import Tuple - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - - -# borrow heavily from: -# https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/lokr.py -class LoKrLinear(nn.Linear): - # LoKr implemented in a dense layer - def __init__( - self, - in_features: int, - out_features: int, - lokr_dim: int = 0, - lokr_alpha: float = 0.0, # self.scale is determined by lokr_alpha/lokr_dim - factor: int = -1, - decompose_both: bool = False, - **kwargs - ): - nn.Linear.__init__(self, in_features, out_features, **kwargs) - if not isinstance(lokr_dim, int) or lokr_dim <= 0: - raise ValueError("w_2 matrix lora dimension lokr_dim should be a positive integer") - self.lokr_dim = lokr_dim - self.use_w1 = False - self.use_w2 = False - # Mark the weight as unmerged - self.merged = False - in_m, in_n = factorization(in_features, factor) - out_m, out_n = factorization(out_features, factor) - shape = ((out_m, out_n), (in_m, in_n)) - self.op = F.linear - - lokr_alpha = lokr_dim if lokr_alpha is None or lokr_alpha == 0 else lokr_alpha - if self.use_w2 and self.use_w1: - lokr_alpha = lokr_dim - self.scale = lokr_alpha / self.lokr_dim - - # Actual trainable parameters - if decompose_both and lokr_dim < max(shape[0][0], shape[1][0]) / 2: - self.lokr_w1_a = self.create_parameter( - shape=[shape[0][0], lokr_dim], - dtype=self._dtype, - is_bias=False, - attr=paddle.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=0.0), - ), - ) - self.lokr_w1_b = self.create_parameter( - shape=[lokr_dim, shape[1][0]], - dtype=self._dtype, - is_bias=False, - default_initializer=nn.initializer.KaimingUniform( - negative_slope=math.sqrt(5), nonlinearity="leaky_relu" - ), - ) - else: - self.use_w1 = True - self.lokr_w1 = self.create_parameter( - shape=[shape[0][0], shape[1][0]], - dtype=self._dtype, - is_bias=False, - attr=paddle.ParamAttr( - initializer=paddle.nn.initializer.Constant(value=0.0), - ), - ) # a*c, 1-mode - - if lokr_dim < max(shape[0][1], shape[1][1]) / 2: - self.lokr_w2_a = self.create_parameter( - shape=[shape[0][1], lokr_dim], - dtype=self._dtype, - is_bias=False, - attr=paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu"), - ), - ) - self.lokr_w2_b = self.create_parameter( - shape=[lokr_dim, shape[1][1]], - dtype=self._dtype, - is_bias=False, - attr=paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu"), - ), - ) - # w1 ⊗ (w2_a x w2_b) = (a, b)⊗((c, dim)x(dim, d)) = (a, b)⊗(c, d) = (ac, bd) - else: - self.use_w2 = True - self.lokr_w2 = self.create_parameter( - shape=[shape[0][1], shape[1][1]], - dtype=self._dtype, - is_bias=False, - attr=paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu"), - ), - ) - adapter_weight = make_kron( - self.lokr_w1 if self.use_w1 else self.lokr_w1_a @ self.lokr_w1_b, - (self.lokr_w2 if self.use_w2 else self.lokr_w2_a @ self.lokr_w2_b), - paddle.to_tensor(self.scale), - ) - 
assert paddle.sum(paddle.isnan(adapter_weight)) == 0, "weight is nan" - # Freezing the pre-trained weight matrix - self.weight.stop_gradient = True - self.disable_lokr = False - - def get_adapter_weight(self): - weight = make_kron( - self.lokr_w1 if self.use_w1 else self.lokr_w1_a @ self.lokr_w1_b, - (self.lokr_w2 if self.use_w2 else self.lokr_w2_a @ self.lokr_w2_b), - paddle.to_tensor(self.scale), - ) - return weight.T - - def merge(self): - if not self.merged: - adapter_weight = self.get_weight() - new_weight = self.weight + adapter_weight * self.scale # core code - self.weight.set_value(new_weight) - self.merged = True - - def unmerge(self): - if self.merged: - adapter_weight = self.get_weight() - new_weight = self.weight - adapter_weight * self.scale # core code - self.weight.set_value(new_weight) - self.merged = False - - def forward(self, input: paddle.Tensor): # core code - if self.merged: - result = self.op(x=input, weight=self.weight, bias=self.bias, name=self.name) - else: - result = self.op(x=input, weight=self.weight, bias=self.bias, name=self.name) - adapter_weight = self.get_adapter_weight() - result += self.op(x=input, weight=adapter_weight) - return result - - def extra_repr(self): - """ - Give detailed debug infos of LoKrModels by print(model) methods. - """ - final_str = ( - "in_features={in_feature} out_features={out_feature}bias={bias}\nlokr_dim={lokr_dim}\ndtype={dtype}\n" - ) - info_dict = { - "in_feature": self.weight.shape[0], - "out_feature": self.weight.shape[1], - "bias": self.bias, - "lokr_dim": self.lokr_dim, - "dtype": self._dtype, - "adapter_weight_scale": self.scale, - "name": f", name={self.name}" if self.name else "", - } - if self.use_w1: - info_dict["lokr_w1"] = self.lokr_w1.shape - final_str += "lokr_w1={lokr_w1}\n" - else: - info_dict["lokr_w1_a"] = self.lokr_w1_a.shape - info_dict["lokr_w1_b"] = self.lokr_w1_b.shape - final_str += "lokr_w1_a={lokr_w1_a}\nlokr_w1_b={lokr_w1_b}\n" - - if self.use_w2: - info_dict["lokr_w2"] = self.lokr_w2.shape - final_str += "lokr_w2={lokr_w2}\n" - else: - info_dict["lokr_w2_a"] = self.lokr_w2_a.shape - info_dict["lokr_w2_b"] = self.lokr_w2_b.shape - final_str += "lokr_w2_a={lokr_w2_a}\nlokr_w2_b={lokr_w2_b}\n" - - final_str += "adapter weight scale={adapter_weight_scale}\nname={name}" - - return final_str.format(**info_dict) - - -# Below code is a direct copy from https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/lokr.py#L11 -def factorization(dimension: int, factor: int = -1) -> Tuple[int, int]: - """ - return a tuple of two value of input dimension decomposed by the number closest to factor - second value is higher or equal than first value. - - In LoRA with Kroneckor Product, first value is a value for weight scale. - second value is a value for weight. - - Because of non-commutative property, A⊗B ≠ B⊗A. Meaning of two matrices is slightly different. - - examples) - factor - -1 2 4 8 16 ... 
- 127 -> 127, 1 127 -> 127, 1 127 -> 127, 1 127 -> 127, 1 127 -> 127, 1 - 128 -> 16, 8 128 -> 64, 2 128 -> 32, 4 128 -> 16, 8 128 -> 16, 8 - 250 -> 125, 2 250 -> 125, 2 250 -> 125, 2 250 -> 125, 2 250 -> 125, 2 - 360 -> 45, 8 360 -> 180, 2 360 -> 90, 4 360 -> 45, 8 360 -> 45, 8 - 512 -> 32, 16 512 -> 256, 2 512 -> 128, 4 512 -> 64, 8 512 -> 32, 16 - 1024 -> 32, 32 1024 -> 512, 2 1024 -> 256, 4 1024 -> 128, 8 1024 -> 64, 16 - """ - - if factor > 0 and (dimension % factor) == 0: - m = factor - n = dimension // factor - return m, n - if factor == -1: - factor = dimension - m, n = 1, dimension - length = m + n - while m < n: - new_m = m + 1 - while dimension % new_m != 0: - new_m += 1 - new_n = dimension // new_m - if new_m + new_n > length or new_m > factor: - break - else: - m, n = new_m, new_n - if m > n: - n, m = m, n - return m, n - - -def make_kron(w1, w2, scale): - if len(w2.shape) == 4: - w1 = w1.unsqueeze(2).unsqueeze(2) - w2 = w2.contiguous() - rebuild = paddle.kron(w1, w2) # rebuild.shape: (out_features, in_features) - - return rebuild * scale diff --git a/paddleformers/peft/lokr/lokr_model.py b/paddleformers/peft/lokr/lokr_model.py deleted file mode 100644 index fafa684ab11..00000000000 --- a/paddleformers/peft/lokr/lokr_model.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import os -import re -from collections import OrderedDict -from typing import Dict, Union - -import numpy as np -import paddle -import paddle.nn as nn -from paddle.distributed.fleet.meta_parallel import PipelineLayer - -from ...transformers import AutoConfig, PretrainedModel -from ...transformers.model_utils import _add_variant, dtype_guard -from ...utils.env import LOKR_WEIGHTS_NAME -from ...utils.log import logger -from .lokr_config import LoKrConfig - - -def get_lokr_layers(): - from .lokr_layers import LoKrLinear - - return { - "LoKrLinear": LoKrLinear, - } - - -lokr_layers = get_lokr_layers() -LoKrLinear = lokr_layers["LoKrLinear"] -AVAILABLE_LAYERS = [ - LoKrLinear, -] - - -class LoKrModel(nn.Layer): - restore_layer_map: Dict[nn.Layer, nn.Layer] = { - LoKrLinear: nn.Linear, - } - - def __init__(self, model, lokr_config: LoKrConfig) -> None: - super().__init__() - self.model_config = AutoConfig.from_pretrained(lokr_config.base_model_name_or_path) - self.quantized = False - self.lokr_config = lokr_config - self.lokr_split_mapping = {} - if self.lokr_config.dtype is None: - self.lokr_config.dtype = paddle.get_default_dtype() - with dtype_guard(self.lokr_config.dtype): - self.model = self.get_lokr_model(model, lokr_config) - self.is_pipelinemodel = False - if issubclass(type(self.model), PipelineLayer): - raise NotImplementedError("lokr don't support pipeline parallel now") - if lokr_config.tensor_parallel_degree > 1: - self.lokr_config.tensor_parallel_degree = -1 - self.model.config.tensor_parallel_degree = -1 - raise NotImplementedError("lokr don't support tensor parallel now") - # currently tensor_parallel_degree should all be set to -1. - self.forward = self.model.forward - - logger.info("Mark only lokr and trainable_module as trainable.") - self.mark_only_lokr_as_trainable() - - @classmethod - def from_pretrained(cls, model, lokr_path, **kwargs): - lokr_config = kwargs.pop("lokr_config", None) - # init lokr config & lokr model - if not isinstance(lokr_config, LoKrConfig): - lokr_config = LoKrConfig.from_pretrained(lokr_path) - # define a new variable to conserve original lora_config.tensor_parallel_degree value which will update while initializing lora model - lokr_config_tensor_parallel_degree = lokr_config.tensor_parallel_degree - lokr_model = cls(model, lokr_config) - - # define lokr weight name - lokr_weight_name = LOKR_WEIGHTS_NAME - - # load and set lokr weight parameter - lokr_weight_path = os.path.join(lokr_path, lokr_weight_name) - if os.path.exists(lokr_weight_path): - # load lokr weight parameter - lokr_state_dict = paddle.load(lokr_weight_path, return_numpy=True) - logger.info(f"Loading the LoKR weights from {lokr_weight_path}") - - if ( - lokr_config_tensor_parallel_degree > 1 - and lokr_config_tensor_parallel_degree != model.config.tensor_parallel_degree - ): - raise NotImplementedError( - f"{lokr_config_tensor_parallel_degree} is not equal to {model.config.tensor_parallel_degree}. Please merge LoKR weights first." 
- ) - # set lokr state dict - lokr_model.set_state_dict(lokr_state_dict) - else: - logger.error(f"LoKR weights not found under {lokr_path}, creating LoKR weights from scratch") - - return lokr_model - - def set_state_dict(self, state_dict): - import warnings - - warnings.filterwarnings( - action="ignore", message=".*Skip loading for.*", category=Warning, lineno=0, append=False - ) - self.model.set_state_dict(state_dict) - logger.info("Load lokr weight successfully") - - def save_pretrained(self, save_directory: str, merge_tensor_parallel: bool = False, **kwargs): - logger.info("save lokr pretrained") - save_model_config = kwargs.get("save_model_config", True) - - variant = kwargs.get("variant", None) - is_main_process = kwargs.get("is_main_process", paddle.distributed.get_rank() == 0) - - assert not os.path.isfile( - save_directory - ), f"Saving directory ({save_directory}) should be a directory, not a file" - os.makedirs(save_directory, exist_ok=True) - - lokr_config_to_save = LoKrConfig(**self.lokr_config.to_dict()) - trainable_state_dict = self.get_trainable_state_dict() - - # save lokr weight - lokr_weight_name = _add_variant(LOKR_WEIGHTS_NAME, variant) - weight_filename = os.path.join(save_directory, lokr_weight_name) - paddle.save(trainable_state_dict, weight_filename) - - # save lokr config - if is_main_process: - lokr_config_to_save.save_pretrained(save_directory) - if save_model_config: - model_config_to_save = copy.deepcopy(self.model.config) - if merge_tensor_parallel: - model_config_to_save.tensor_parallel_degree = -1 - model_config_to_save.save_pretrained(save_directory) - - def _find_and_replace_module(self, model, module_name, lokr_config): - parent_module = model - attribute_chain = module_name.split(".") - for name in attribute_chain[:-1]: - parent_module = getattr(parent_module, name) - module = getattr(parent_module, attribute_chain[-1]) - lokr_module = None - if isinstance(module, nn.Linear): - lokr_module = LoKrLinear( - in_features=module.weight.shape[0], - out_features=module.weight.shape[1], - lokr_dim=lokr_config.lokr_dim, - decompose_both=lokr_config.decompose_both, - lokr_alpha=lokr_config.lokr_alpha, - factor=lokr_config.factor, - bias_attr=False if module.bias is None else None, - ) - if lokr_module is None: - raise ValueError("Target LoKr Module not found. 
LoKr strategy only supports paddle.nn.Linear right now") - - lokr_module.weight = module.weight - if module.bias is not None: - lokr_module.bias = module.bias - setattr(parent_module, attribute_chain[-1], lokr_module) - - def _find_and_restore_module(self, module_name): - parent_module = self.model - attribute_chain = module_name.split(".") - for name in attribute_chain[:-1]: - parent_module = getattr(parent_module, name) - module = getattr(parent_module, attribute_chain[-1]) - original_model_class = self.restore_layer_map[module.__class__] - original_module = original_model_class(in_features=module.weight.shape[0], out_features=module.weight.shape[1]) - original_module.weight = module.weight - if module.bias is not None: - original_module.bias = module.bias - setattr(parent_module, attribute_chain[-1], original_module) - - def get_trainable_state_dict(self): - trainable_state_dict = OrderedDict() - for name, weight in self.model.state_dict().items(): - # get lokr parameter & QAT scale parameter - if not weight.stop_gradient or "activation_quanter" in name or "weight_quanter" in name: - trainable_state_dict[name] = weight - return trainable_state_dict - - def print_trainable_parameters(self) -> None: - freeze_numel = 0 - trainable_numel = 0 - for _, weight in self.model.state_dict().items(): - if weight.stop_gradient: - freeze_numel += np.prod(weight.shape) - else: - trainable_numel += np.prod(weight.shape) - logger.debug( - f"Frozen parameters: {freeze_numel:.2e} || Trainable parameters:{trainable_numel:.2e} || Total parameters:{freeze_numel+trainable_numel:.2e}|| Trainable:{trainable_numel / (freeze_numel+trainable_numel):.2%}" - ) - - def mark_only_lokr_as_trainable(self) -> None: - for _, layer in self.model.named_sublayers(): - if isinstance(layer, LoKrLinear): - for name, weight in layer.state_dict().items(): - if self.lokr_config.trainable_bias in ["lokr", "all"] and "bias" in name: - weight.stop_gradient = False - elif "lokr" in name: - weight.stop_gradient = False - else: - weight.stop_gradient = True - else: - for name, weight in layer.state_dict().items(): - if self.lokr_config.trainable_bias == "all" and "bias" in name: - weight.stop_gradient = False - else: - weight.stop_gradient = True - if self.lokr_config.trainable_modules is not None: - for name, weight in self.model.state_dict().items(): - if any( - re.fullmatch(trainable_module, name) for trainable_module in self.lokr_config.trainable_modules - ): - weight.stop_gradient = False - - def get_lokr_model(self, model: Union[PretrainedModel, nn.Layer], lokr_config: LoKrConfig): - """ - Iterate all base model layers, change target modules to LoKrLayer. 
- """ - if lokr_config.target_modules is None: - return model - else: - target_modules = lokr_config.target_modules - - for target_module in target_modules: - for i in model.named_sublayers(): - module_name = i[0] - if re.fullmatch(target_module, module_name): - self._find_and_replace_module(model, module_name, lokr_config) - return model - - def restore_original_model(self): - # make sure W and lokr weights are not merged before we restore the original model - for layer_name, layer in self.model.named_sublayers(): - if isinstance(layer, LoKrLinear): - self._find_and_restore_module(layer_name) - return self.model - - def __getattr__(self, name: str): - """Forward missing attributes to the wrapped module.""" - try: - return super().__getattr__(name) # defer to nn.Layer's logic - except AttributeError: - return getattr(self.model, name) - - def train(self): - self.training = True - self.model.training = True - for layer in self.model.sublayers(): - layer.training = True - layer.train() - - def eval(self): - self.training = False - self.model.training = False - for layer in self.model.sublayers(): - layer.training = False - layer.eval() - - def disable_lokr(self): - for _, layer in self.model.named_sublayers(): - if any(isinstance(layer, lokr_layer) for lokr_layer in AVAILABLE_LAYERS): - layer.disable_lokr = True - - def enable_lokr(self): - for _, layer in self.model.named_sublayers(): - if any(isinstance(layer, lokr_layer) for lokr_layer in AVAILABLE_LAYERS): - layer.disable_lokr = False - - def merge(self): - for _, layer in self.model.named_sublayers(): - if any(isinstance(layer, lokr_layer) for lokr_layer in AVAILABLE_LAYERS): - layer.merge() - - def unmerge(self): - for _, layer in self.model.named_sublayers(): - if any(isinstance(layer, lokr_layer) for lokr_layer in AVAILABLE_LAYERS): - layer.unmerge() - - def get_model_config( - self, - ): - return self.model_config.to_dict() diff --git a/paddleformers/peft/prefix/__init__.py b/paddleformers/peft/prefix/__init__.py deleted file mode 100644 index c8bd6e6f07a..00000000000 --- a/paddleformers/peft/prefix/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .prefix_config import PrefixConfig -from .prefix_model import PrefixModelForCausalLM -from .utils import ( - bloom_postprocess_past_key_value, - chatglm_postprocess_past_key_value, - llama_postprocess_past_key_value, - mistral_postprocess_past_key_value, - qwen_postprocess_past_key_value, -) diff --git a/paddleformers/peft/prefix/prefix_config.py b/paddleformers/peft/prefix/prefix_config.py deleted file mode 100644 index ba9135c6a9b..00000000000 --- a/paddleformers/peft/prefix/prefix_config.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -from dataclasses import asdict, dataclass, field -from typing import Optional - -from ...utils.env import PREFIX_CONFIG_NAME - - -@dataclass -class PrefixConfig: - prefix_dropout: float = field(default=0.0, metadata={"help": "Prefix projection dropout"}) - num_prefix_tokens: Optional[int] = field(default=None, metadata={"help": "Number of prefix tokens"}) - num_attention_heads: Optional[int] = field(default=None, metadata={"help": "Number of attention heads"}) - multi_query_group_num: Optional[int] = field(default=None, metadata={"help": "Number of Multi-Query Groups."}) - num_hidden_layers: Optional[int] = field(default=None, metadata={"help": "Number of transformer hidden layers"}) - hidden_size: Optional[int] = field( - default=None, metadata={"help": "The hidden embedding dimension of the transformer model"} - ) - prefix_projection: bool = field(default=False, metadata={"help": "Whether to project the prefix tokens"}) - prefix_projection_hidden_size: Optional[int] = field( - default=None, metadata={"help": "The hidden embedding dimension of the transformer model"} - ) - tensor_parallel_degree: int = field(default=-1, metadata={"help": ("1 for not use tensor parallel")}) - dtype: Optional[str] = field(default=None, metadata={"help": "The data type of tensor"}) - - @property - def __dict__(self): - return asdict(self) - - def to_dict(self): - return self.__dict__ - - def save_pretrained(self, save_directory): - r""" - This method saves the configuration of your adapter model in a directory. - Args: - save_directory (`str`): - The directory where the configuration will be saved. - """ - if os.path.isfile(save_directory): - raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") - - os.makedirs(save_directory, exist_ok=True) - - output_dict = self.__dict__ - output_path = os.path.join(save_directory, PREFIX_CONFIG_NAME) - - # save it - with open(output_path, "w") as writer: - writer.write(json.dumps(output_dict, indent=2, sort_keys=True)) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" - This method loads the configuration of your adapter model from a directory. - Args: - pretrained_model_name_or_path (`str`): - The directory or the hub-id where the configuration is saved. - **kwargs: - Additional keyword arguments passed along to the child class initialization. - """ - if os.path.isfile(os.path.join(pretrained_model_name_or_path, PREFIX_CONFIG_NAME)): - config_file = os.path.join(pretrained_model_name_or_path, PREFIX_CONFIG_NAME) - else: - raise ValueError(f"Can't find prefix_config.json at '{pretrained_model_name_or_path}'") - - loaded_attributes = cls.from_json_file(config_file) - - config = cls(**kwargs) - - for key, value in loaded_attributes.items(): - if hasattr(config, key): - setattr(config, key, value) - - return config - - @classmethod - def from_json_file(cls, path_json_file): - r""" - Loads a configuration file from a json file. - Args: - path_json_file (`str`): - The path to the json file. 
- """ - with open(path_json_file, "r") as file: - json_object = json.load(file) - - return json_object diff --git a/paddleformers/peft/prefix/prefix_model.py b/paddleformers/peft/prefix/prefix_model.py deleted file mode 100644 index ce55ed06247..00000000000 --- a/paddleformers/peft/prefix/prefix_model.py +++ /dev/null @@ -1,555 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import gc -import inspect -import os -import tempfile -from functools import partial -from typing import Callable, Optional - -import aistudio_sdk -import numpy as np -import paddle -import paddle.nn as nn -from paddle.distributed import fleet - -from ...transformers.cache_utils import DynamicCache -from ...transformers.model_utils import ( - _add_variant, - _load_state_dict_into_model, - dtype_guard, - load_state_dict, -) -from ...transformers.utils import get_checkpoint_shard_files -from ...utils.distributed import distributed_gather -from ...utils.env import ( - PAST_KEY_VALUES_FILE_NAME, - PREFIX_WEIGHTS_NAME, - SAFE_PEFT_WEIGHTS_INDEX_NAME, -) -from ...utils.log import logger -from .prefix_config import PrefixConfig - - -def signature(function): - """ - Obtain the input arguments of the given function. - """ - sig = inspect.signature(function) - args = [p.name for p in sig.parameters.values() if p.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD] - return args - - -class PrefixModelForCausalLM(paddle.nn.Layer): - """ - PrefixModel for causal language modeling. - """ - - def __init__( - self, - model, - prefix_config: PrefixConfig, - postprocess_past_key_value: Optional[Callable] = None, - pad_attention_mask: Optional[Callable] = None, - ) -> None: - super().__init__() - if isinstance(model, fleet.meta_parallel.PipelineLayer): - raise NotImplementedError("Prefix tuning is not implemented for pipeline parallelism.") - self.prefix_config = prefix_config - self.model = model - self.forward_keys = signature(self.model.forward) - self.config = model.config - if self.prefix_config.dtype is None: - self.prefix_config.dtype = paddle.get_default_dtype() - with dtype_guard(self.prefix_config.dtype): - self.prefix_encoder = self._create_prefix_encoder() - self.prefix_dropout = nn.Dropout(p=prefix_config.prefix_dropout) - self.prefix_tokens = paddle.arange(self.prefix_config.num_prefix_tokens, dtype="int64") - self.model_prepare_inputs_for_generation = self.model.prepare_inputs_for_generation - self.inference = False - self.postprocess_past_key_value = postprocess_past_key_value - self.pad_attention_mask = pad_attention_mask - if self.model.base_model_prefix == "chatglm_v2": - self.prefix_config.tensor_parallel_degree = -1 - else: - if self.prefix_config.tensor_parallel_degree != self.model.config.tensor_parallel_degree: - self.prefix_config.tensor_parallel_degree = self.model.config.tensor_parallel_degree - logger.warning( - f"Reset tensor_parallel_degree of prefix_config to {self.model.config.tensor_parallel_degree}." 
- ) - logger.info("Mark only prefix and trainable_module as trainable.") - self.mark_only_prefix_as_trainable() - - def forward( - self, - input_ids, - attention_mask=None, - **kwargs, - ): - - batch_size = input_ids.shape[0] - past_key_values = self._get_past_key_values(batch_size) - - if attention_mask is not None: - if self.pad_attention_mask is not None: - attention_mask = self.pad_attention_mask( - input_ids.shape, self.prefix_config.num_prefix_tokens, attention_mask - ) - else: - if len(attention_mask.shape) == 2: - prefix_attention_mask = paddle.ones( - [batch_size, self.prefix_config.num_prefix_tokens], dtype=attention_mask.dtype - ) - elif len(attention_mask.shape) == 3: - batch_size, src_seq_len, tgt_seq_len = attention_mask.shape - prefix_attention_mask = paddle.ones( - [batch_size, src_seq_len, self.prefix_config.num_prefix_tokens], dtype=attention_mask.dtype - ) - elif len(attention_mask.shape) == 4: - batch_size, num_heads, src_seq_len, tgt_seq_len = attention_mask.shape - prefix_attention_mask = paddle.ones( - [batch_size, num_heads, src_seq_len, self.prefix_config.num_prefix_tokens], - dtype=attention_mask.dtype, - ) - else: - raise ValueError(f"Unexpected attention_mask shape: {attention_mask.shape}") - attention_mask = paddle.cat((prefix_attention_mask, attention_mask), axis=-1) - kwargs["attention_mask"] = attention_mask - - if "past_key_values" in self.forward_keys: - output = self.model(input_ids=input_ids, past_key_values=past_key_values, **kwargs) - elif "cache" in self.forward_keys: - output = self.model(input_ids=input_ids, cache=past_key_values, **kwargs) - else: - raise NotImplementedError("Model does not support past_key_values either cache") - return output - - def generate(self, **kwargs): - if "input_ids" not in kwargs: - raise ValueError("input_ids must be provided for Peft model generation") - - self.model.prepare_inputs_for_generation = self._prepare_inputs_for_generation - outputs = self.model.generate(**kwargs) - self.model.prepare_inputs_for_generation = self.model_prepare_inputs_for_generation - return outputs - - def _prepare_inputs_for_generation(self, *args, **kwargs): - model_kwargs = self.model_prepare_inputs_for_generation(*args, **kwargs) - attention_mask = model_kwargs["attention_mask"] - batch_size = model_kwargs["input_ids"].shape[0] - if self.pad_attention_mask is not None: - attention_mask = self.pad_attention_mask( - model_kwargs["input_ids"].shape, self.prefix_config.num_prefix_tokens, attention_mask - ) - else: - if len(attention_mask.shape) == 2: - prefix_attention_mask = paddle.ones( - [batch_size, self.prefix_config.num_prefix_tokens], dtype=attention_mask.dtype - ) - elif len(attention_mask.shape) == 3: - batch_size, src_seq_len, tgt_seq_len = attention_mask.shape - prefix_attention_mask = paddle.ones( - [batch_size, src_seq_len, self.prefix_config.num_prefix_tokens], dtype=attention_mask.dtype - ) - elif len(attention_mask.shape) == 4: - batch_size, num_heads, src_seq_len, tgt_seq_len = attention_mask.shape - prefix_attention_mask = paddle.ones( - [batch_size, num_heads, src_seq_len, self.prefix_config.num_prefix_tokens], - dtype=attention_mask.dtype, - ) - else: - raise ValueError(f"Unexpected attention_mask shape: {attention_mask.shape}") - attention_mask = paddle.cat((prefix_attention_mask, attention_mask), axis=-1) - model_kwargs["attention_mask"] = attention_mask - - if "past_key_values" in self.forward_keys: - key = "past_key_values" - elif "cache" in self.forward_keys: - key = "cache" - else: - raise 
NotImplementedError("Model does not support past_key_values either cache") - if model_kwargs[key] is None: - past_key_values = self._get_past_key_values(batch_size) - model_kwargs[key] = past_key_values - return model_kwargs - - def mark_only_prefix_as_trainable(self) -> None: - # freeze pretrained model - for _, weight in self.model.state_dict().items(): - weight.stop_gradient = True - # train prefix encoder only - for _, weight in self.prefix_encoder.state_dict().items(): - weight.stop_gradient = False - - def _create_prefix_encoder(self): - prefix_dropout = nn.Dropout(p=self.prefix_config.prefix_dropout) - self.head_dim = self.prefix_config.hidden_size // self.prefix_config.num_attention_heads - if self.prefix_config.multi_query_group_num is not None: - self.num_heads = self.prefix_config.multi_query_group_num - else: - self.num_heads = self.prefix_config.num_attention_heads - if self.prefix_config.prefix_projection: - activation = nn.Tanh() - if self.prefix_config.tensor_parallel_degree > 1: - prefix_embedding = fleet.meta_parallel.VocabParallelEmbedding( - self.prefix_config.num_prefix_tokens, - self.head_dim * self.num_heads, - ) - prefix_proj_0 = fleet.meta_parallel.ColumnParallelLinear( - self.head_dim * self.num_heads, - self.prefix_config.prefix_projection_hidden_size, - has_bias=True, - gather_output=False, - ) - prefix_proj_1 = fleet.meta_parallel.RowParallelLinear( - self.prefix_config.prefix_projection_hidden_size, - self.head_dim * self.num_heads * self.prefix_config.num_hidden_layers * 2, - has_bias=True, - input_is_parallel=True, - ) - else: - prefix_embedding = nn.Embedding( - self.prefix_config.num_prefix_tokens, - self.head_dim * self.num_heads, - ) - prefix_proj_0 = nn.Linear( - self.head_dim * self.num_heads, - self.prefix_config.prefix_projection_hidden_size, - ) - prefix_proj_1 = nn.Linear( - self.prefix_config.prefix_projection_hidden_size, - self.head_dim * self.num_heads * self.prefix_config.num_hidden_layers * 2, - ) - prefix_encoder = nn.Sequential(prefix_embedding, prefix_proj_0, activation, prefix_proj_1, prefix_dropout) - else: - if self.prefix_config.tensor_parallel_degree > 1: - prefix_embedding = fleet.meta_parallel.VocabParallelEmbedding( - self.prefix_config.num_prefix_tokens, - self.head_dim * self.num_heads * self.prefix_config.num_hidden_layers * 2, - ) - else: - prefix_embedding = nn.Embedding( - self.prefix_config.num_prefix_tokens, - self.head_dim * self.num_heads * self.prefix_config.num_hidden_layers * 2, - ) - prefix_encoder = nn.Sequential(prefix_embedding, prefix_dropout) - return prefix_encoder - - def _get_past_key_values(self, batch_size): - - # (bs, prefixlen, hidden_dim*layer_num*2) - past_key_values = self.prefix_encoder(self.prefix_tokens.unsqueeze(0).expand([batch_size, -1])) - - # (bs, prefixlen, hidden_dim*layer_num*2/tensor_parallel_degree) - if self.prefix_config.tensor_parallel_degree > 1: - split_past_key_values = past_key_values.split( - num_or_sections=self.prefix_config.tensor_parallel_degree, axis=2 - ) - past_key_values = split_past_key_values[self.model.config.tensor_parallel_rank] - num_heads_per_partition = self.num_heads // self.prefix_config.tensor_parallel_degree - else: - num_heads_per_partition = self.num_heads - - # (bs, prefixlen, layer_num*2, head_num/tensor_parallel_degree, head_dim) - past_key_values = past_key_values.reshape( - [ - batch_size, - self.prefix_config.num_prefix_tokens, - self.prefix_config.num_hidden_layers * 2, - num_heads_per_partition, - self.head_dim, - ] - ) - - if 
self.postprocess_past_key_value is not None: - past_key_values = self.postprocess_past_key_value(past_key_values) - past_key_values_cache = DynamicCache() - if isinstance(past_key_values, tuple): - for layer_idx, (key_state, value_state) in enumerate(past_key_values): - past_key_values_cache.update(key_state, value_state, layer_idx) - else: - return past_key_values - return past_key_values_cache - - def train(self): - self.training = True - self.model.training = True - self.prefix_encoder.training = True - self.model.train() - self.prefix_encoder.train() - - def eval(self): - self.training = False - self.model.training = False - self.prefix_encoder.training = False - self.model.eval() - self.prefix_encoder.eval() - - def print_trainable_parameters(self) -> None: - trainable_numel = 0 - freeze_numel = 0 - for _, weight in self.model.state_dict().items(): - if weight.stop_gradient: - freeze_numel += np.prod(weight.shape) - else: - trainable_numel += np.prod(weight.shape) - for _, weight in self.prefix_encoder.state_dict().items(): - if weight.stop_gradient: - freeze_numel += np.prod(weight.shape) - else: - trainable_numel += np.prod(weight.shape) - logger.debug( - f"Frozen parameters: {freeze_numel:.2e} || Trainable parameters:{trainable_numel:.2e} || Total parameters:{freeze_numel+trainable_numel:.2e}|| Trainable:{trainable_numel / (freeze_numel+trainable_numel):.2%}" - ) - - @classmethod - def from_pretrained( - cls, - model, - prefix_path, - postprocess_past_key_value=None, - pad_attention_mask=None, - ): - # init prefix config & prefix model - prefix_config = PrefixConfig.from_pretrained(prefix_path) - # define a new variable to conserve original prefix_config.tensor_parallel_degree value which will update while initializing prefix model - prefix_config_tensor_parallel_degree = prefix_config.tensor_parallel_degree - prefix_model = cls(model, prefix_config, postprocess_past_key_value, pad_attention_mask) - - prefix_model_index_file = os.path.join(prefix_path, SAFE_PEFT_WEIGHTS_INDEX_NAME) - if os.path.exists(prefix_model_index_file): - # load safetensors format file. 
- resolved_archieve_file, sharded_metadata = get_checkpoint_shard_files( - pretrained_model_name_or_path=prefix_path, - index_filename=prefix_model_index_file, - ) - loaded_keys = sharded_metadata["all_checkpoint_keys"] - expected_keys = set(prefix_model.prefix_encoder.state_dict().keys()) - missing_keys = expected_keys - set(loaded_keys) - if len(missing_keys) > 0: - raise ValueError(f"missing_keys: {missing_keys}") - - error_msgs = [] - for shard_file in resolved_archieve_file: - pre_tensor_parallel_split = False - if model.config.tensor_parallel_degree > 1: - pre_tensor_parallel_split = True - tp_actions = prefix_model._get_tensor_parallel_convert_actions(is_split=True) - state_dict = load_state_dict( - shard_file, - tp_actions if pre_tensor_parallel_split else None, - expected_keys, - ) - error_msgs += _load_state_dict_into_model(prefix_model.prefix_encoder, state_dict, "") - del state_dict - gc.collect() - - if len(error_msgs) > 0: - error_msgs = "\n\t".join(error_msgs) - raise RuntimeError( - f"Error(s) in loading state_dict for {prefix_model.__class__.__name__}:\n\t{error_msgs}" - ) - return prefix_model - - # define prefix weight name - if prefix_config_tensor_parallel_degree > 1: - prefix_weight_name = _add_variant(PREFIX_WEIGHTS_NAME, f"tp{model.config.tensor_parallel_rank:0>2d}") - else: - prefix_weight_name = PREFIX_WEIGHTS_NAME - - # load and set prefix weight parameter - prefix_weight_path = os.path.join(prefix_path, prefix_weight_name) - if os.path.exists(prefix_weight_path): - # load prefix weight parameter - prefix_state_dict = paddle.load(prefix_weight_path, return_numpy=True) - logger.info(f"Loading the prefix weights from {prefix_weight_path}") - - if ( - prefix_config_tensor_parallel_degree > 1 - and prefix_config_tensor_parallel_degree != model.config.tensor_parallel_degree - ): - raise NotImplementedError( - f"{prefix_config_tensor_parallel_degree} is not equal to {model.config.tensor_parallel_degree}. Please merge prefix weights first." 
- ) - - # convert parameters to tensor parallel for mp model - if prefix_config_tensor_parallel_degree <= 1 and model.config.tensor_parallel_degree > 1: - prefix_state_dict = prefix_model._convert_tensor_parallel(prefix_state_dict=prefix_state_dict) - - # set prefix state dict - prefix_model.set_state_dict(prefix_state_dict) - else: - logger.error(f"prefix weights not found under {prefix_path}, creating prefix weights from scratch") - - return prefix_model - - def save_pretrained(self, save_directory: str, merge_tensor_parallel: bool = True, **kwargs): - variant = kwargs.get("variant", None) - is_main_process = kwargs.get("is_main_process", paddle.distributed.get_rank() == 0) - - assert not os.path.isfile( - save_directory - ), f"Saving directory ({save_directory}) should be a directory, not a file" - os.makedirs(save_directory, exist_ok=True) - - # past_key_values: (prefixlen, hidden_dim*layer_num*2) - past_key_values = self.prefix_encoder(self.prefix_tokens.unsqueeze(0).expand([1, -1])) - # (prefixlen, 2, layer_num, num_heads, head_dim) - past_key_values = past_key_values.reshape( - [ - self.prefix_config.num_prefix_tokens, - 2, - self.prefix_config.num_hidden_layers, - self.num_heads, - self.head_dim, - ] - ) - # (num_layers, 2, num_heads, prefixlen, head_dim) - past_key_values = paddle.transpose(past_key_values, perm=[2, 1, 3, 0, 4]).cpu().numpy() - - if merge_tensor_parallel and self.prefix_config.tensor_parallel_degree > 1: - trainable_state_dict = self.prefix_encoder.state_dict() - trainable_state_dict = self._merge_trainable_tensor_parallel(trainable_state_dict) - if not is_main_process: - logger.info("Saving with merge_tensor_parallel, tensor_parallel_rank > 0 don't need save") - return - variant = None - self.prefix_config.tensor_parallel_degree = -1 - else: - trainable_state_dict = self.prefix_encoder.state_dict() - if self.prefix_config.tensor_parallel_degree > 1: - if variant is None: - variant = f"tp{self.model.config.tensor_parallel_rank:0>2d}" - - # save prefix tuning weight - prefix_weight_name = _add_variant(PREFIX_WEIGHTS_NAME, variant) - weight_filename = os.path.join(save_directory, prefix_weight_name) - paddle.save(trainable_state_dict, weight_filename) - - # save prefix config & past key values - if is_main_process: - self.prefix_config.save_pretrained(save_directory) - np.save(os.path.join(save_directory, PAST_KEY_VALUES_FILE_NAME), past_key_values) - - if self.model.base_model_prefix == "chatglm_v2": - self.prefix_config.tensor_parallel_degree = -1 - else: - self.prefix_config.tensor_parallel_degree = self.model.config.tensor_parallel_degree - - def set_state_dict(self, state_dict): - self.prefix_encoder.set_state_dict(state_dict) - logger.info("Load prefix weight successfully") - - def _get_tensor_parallel_convert_actions(self, loaded_keys=None, is_split=False, ignore_error=False): - from ...transformers.conversion_utils import split_or_merge_func - - fn = split_or_merge_func( - is_split=is_split, - tensor_parallel_degree=self.prefix_config.tensor_parallel_degree, - tensor_parallel_rank=self.model.config.tensor_parallel_rank, - num_attention_heads=self.model.config.num_attention_heads, - ) - - if self.prefix_config.prefix_projection: - name_action_mappings = { - "0.weight": partial(fn, is_column=False), - "1.weight": partial(fn, is_column=True), - "1.bias": partial(fn, is_column=True), - "3.weight": partial(fn, is_column=False), - } - else: - name_action_mappings = { - "0.weight": partial(fn, is_column=False), - } - return name_action_mappings - - def 
_merge_trainable_tensor_parallel(self, trainable_state_dict): - name_action_mappings = self._get_tensor_parallel_convert_actions(is_split=False) - hcg = paddle.distributed.fleet.get_hybrid_communicate_group() - mp_group = hcg.get_model_parallel_group() - is_dst = paddle.distributed.get_rank(mp_group) == 0 - - for key in trainable_state_dict: - tensor = trainable_state_dict[key] - if key in name_action_mappings: - ret = distributed_gather(tensor, group=mp_group, offload=True) - action = name_action_mappings[key] - tensor = action(ret) if is_dst else None - trainable_state_dict[key] = tensor - else: - trainable_state_dict[key] = tensor.cpu().numpy() if is_dst else None - - return trainable_state_dict - - def _convert_tensor_parallel(self, prefix_state_dict): - name_action_mappings = self._get_tensor_parallel_convert_actions(is_split=True) - for name, action in name_action_mappings.items(): - tensor = prefix_state_dict.pop(name) - prefix_state_dict[name] = action(tensor) - return prefix_state_dict - - def save_to_aistudio( - self, - repo_id, - private=True, - license="Apache License 2.0", - exist_ok=True, - subfolder=None, - merge_tensor_parallel=False, - **kwargs - ): - """ - Uploads all elements of this model to a new AiStudio Hub repository. - Args: - repo_id (str): Repository name for your model/tokenizer in the Hub. - token (str): Your token for the Hub. - private (bool, optional): Whether the model/tokenizer is set to private. Defaults to True. - license (str): The license of your model/tokenizer. Defaults to: "Apache License 2.0". - exist_ok (bool, optional): Whether to override existing repository. Defaults to: True. - subfolder (str, optional): Push to a subfolder of the repo instead of the root - merge_tensor_parallel (bool): Whether to merge the tensor parallel weights. Defaults to False. - """ - res = aistudio_sdk.hub.create_repo(repo_id=repo_id, private=private, license=license, **kwargs) - if "error_code" in res: - if res["error_code"] == 10003 and exist_ok: - logger.info( - f"Repo {repo_id} already exists, it will override files with the same name. To avoid this, please set exist_ok=False" - ) - else: - logger.error( - f"Failed to create repo {repo_id}, error_code: {res['error_code']}, error_msg: {res['error_msg']}" - ) - else: - logger.info(f"Successfully created repo {repo_id}") - - with tempfile.TemporaryDirectory() as root_dir: - if subfolder is not None: - save_dir = os.path.join(root_dir, subfolder) - else: - save_dir = root_dir - # save model - self.save_pretrained(save_dir, merge_tensor_parallel=merge_tensor_parallel) - - # Upload model and return - logger.info(f"Pushing to the {repo_id}. This might take a while") - for filename in os.listdir(save_dir): - res = aistudio_sdk.hub.upload( - repo_id=repo_id, path_or_fileobj=os.path.join(save_dir, filename), path_in_repo=filename, **kwargs - ) - if "error_code" in res: - logger.error( - f"Failed to upload {filename}, error_code: {res['error_code']}, error_msg: {res['error_msg']}" - ) - else: - logger.info(f"{filename}: {res['message']}") diff --git a/paddleformers/peft/prefix/utils.py b/paddleformers/peft/prefix/utils.py deleted file mode 100644 index 48fd7c9b4f1..00000000000 --- a/paddleformers/peft/prefix/utils.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - - -def bloom_postprocess_past_key_value(past_key_values): - # (layer_num, bs, head_num/tensor_parallel_degree, prefixlen, head_dim)*2 - keys, values = paddle.transpose(past_key_values, perm=[2, 0, 1, 3, 4]).split(2) - # keys: [layer_num, bs, head_num/tensor_parallel_degree, head_dim, prefixlen] - # value: [layer_num, bs, head_num/tensor_parallel_degree, prefixlen, head_dim] - # keys, values = past_key_values[0].transpose([0, 1, 2, 4, 3]), past_key_values[1] - return tuple(zip(keys, values)) - - -def chatglm_postprocess_past_key_value(past_key_values): - # (layer_num, prefixlen, bs, head_num/tensor_parallel_degree, head_dim)*2 - keys, values = paddle.transpose(past_key_values, perm=[2, 1, 0, 3, 4]).split(2) - - return tuple(zip(keys, values)) - - -def llama_postprocess_past_key_value(past_key_values): - # (layer_num, bs, head_num/tensor_parallel_degree, prefixlen, head_dim)*2 - keys, values = paddle.transpose(past_key_values, perm=[2, 0, 3, 1, 4]).split(2) - - return tuple(zip(keys, values)) - - -def mistral_postprocess_past_key_value(past_key_values): - # (layer_num, bs, head_num/tensor_parallel_degree, prefixlen, head_dim)*2 - keys, values = paddle.transpose(past_key_values, perm=[2, 0, 3, 1, 4]).split(2) - - return tuple(zip(keys, values)) - - -def qwen_postprocess_past_key_value(past_key_values): - # (layer_num, bs, prefixlen, head_num/tensor_parallel_degree, head_dim)*2 - keys, values = paddle.transpose(past_key_values, perm=[2, 0, 1, 3, 4]).split(2) - - return tuple(zip(keys, values)) diff --git a/paddleformers/peft/reft/__init__.py b/paddleformers/peft/reft/__init__.py deleted file mode 100644 index 0f8b4f51d4b..00000000000 --- a/paddleformers/peft/reft/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .interventions import ( - LoreftIntervention, - LowRankRotateLayer, - TinyIntervention, - intervention_mapping, -) -from .modeling_utils import ReftDataCollator -from .predict import do_predict -from .reft_config import ReFTConfig -from .reft_model import ReFTModel diff --git a/paddleformers/peft/reft/interventions.py b/paddleformers/peft/reft/interventions.py deleted file mode 100644 index 030a90cd00d..00000000000 --- a/paddleformers/peft/reft/interventions.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math - -import paddle -import paddle.nn as nn -from paddle import ParamAttr - - -def linear_act(x): - return x - - -ACT2FN = { - "linear": linear_act, - "relu": nn.ReLU(), -} - - -# A linear transformation with orthogonal initialization. -class LowRankRotateLayer(nn.Layer): - def __init__(self, n, m): - super().__init__() - self.weight = self.create_parameter( - shape=[n, m], - attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Orthogonal()), - is_bias=False, - ) - - def forward(self, x): - return paddle.matmul(x.astype(self.weight.dtype), self.weight) - - -# existing methods LoReFT(h) = h + R^T(Wh + b − Rh) -class LoreftIntervention(nn.Layer): - def __init__(self, **kwargs): - super(LoreftIntervention, self).__init__() - rotate_layer = LowRankRotateLayer(kwargs["embed_dim"], kwargs["low_rank_dimension"]) - self.rotate_layer = rotate_layer - self.learned_source = nn.Linear( - kwargs["embed_dim"], - kwargs["low_rank_dimension"], - weight_attr=ParamAttr(initializer=nn.initializer.Orthogonal()), - ) - self.data_type = kwargs["dtype"] - self.learned_source = self.learned_source.astype(self.data_type) - self.dropout = nn.Dropout(kwargs["dropout"] if "dropout" in kwargs else 0.0) - self.act_fn = ( - ACT2FN["linear"] if "act_fn" not in kwargs or kwargs["act_fn"] is None else ACT2FN[kwargs["act_fn"]] - ) - - def forward( - self, - base, - ): - rotated_base = self.rotate_layer(base) - output = base + paddle.matmul( - ( - self.act_fn( - self.learned_source( - base, - ) - ) - - rotated_base - ), - self.rotate_layer.weight.T, - ) - return self.dropout(output.astype(base.dtype)) - - def load_state_dict(self, state_dict, *args, **kwargs): - self.learned_source.weight.data = state_dict["learned_source.weight"].astype(self.data_type) - self.learned_source.bias.data = state_dict["learned_source.bias"].astype(self.data_type) - overload_w = state_dict["rotate_layer.weight"].astype(self.data_type) - overload_w_width = overload_w.shape[-1] - with paddle.no_grad(): - self.rotate_layer.weight[:, :overload_w_width] = paddle.to_tensor(overload_w) - return - - -# our proposed method -class TinyIntervention(nn.Layer): - def __init__(self, **kwargs): - super(TinyIntervention, self).__init__() - self.rank = kwargs["low_rank_dimension"] - self.hidden_size = kwargs["embed_dim"] - dropout = 0.0 - if dropout > 0.0: - self.dropout = nn.Dropout(p=dropout) - else: - self.dropout = lambda x: x - self.scaling = 1 - # Actual trainable parameters - self.param_A = self.create_parameter( - shape=[self.hidden_size, self.rank], - dtype=self._dtype, - is_bias=False, - default_initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu"), - ) - self.param_B = self.create_parameter( - shape=[self.rank, self.hidden_size], - dtype=self._dtype, - is_bias=False, - default_initializer=nn.initializer.Constant(value=0.0), - ) - self.param_a = self.create_parameter( - shape=[self.rank], - dtype=self._dtype, - is_bias=False, - default_initializer=nn.initializer.Constant(value=1), - ) - self.param_b = self.create_parameter( - shape=[self.hidden_size], - dtype=self._dtype, - 
is_bias=False, - default_initializer=nn.initializer.Constant(value=1), - ) - self.param_A.stop_gradient = False - self.param_B.stop_gradient = False - - def forward( - self, - base, - ): - diag_b = paddle.diag(self.param_b) - diag_a = paddle.diag(self.param_a) - result = (self.dropout(base) @ self.param_A @ diag_a @ self.param_B @ diag_b) * self.scaling - return self.dropout(base + result.astype(base.dtype)) - - def load_state_dict(self, state_dict): - self.param_A.set_value(state_dict["param_A"]) - self.param_B.set_value(state_dict["param_B"]) - self.param_a.set_value(state_dict["param_a"]) - self.param_b.set_value(state_dict["param_b"]) - - -intervention_mapping = {"LoreftIntervention": LoreftIntervention, "TinyIntervention": TinyIntervention} diff --git a/paddleformers/peft/reft/modeling_utils.py b/paddleformers/peft/reft/modeling_utils.py deleted file mode 100644 index ba9269d5e05..00000000000 --- a/paddleformers/peft/reft/modeling_utils.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import importlib -import logging -import os -import random -from dataclasses import dataclass -from typing import Dict, Sequence - -import numpy as np -import paddle -from paddle import nn - - -def getattr_for_paddle_module(model, parameter_name): - """Recursively fetch the model based on the name.""" - current_module = model - for param in parameter_name.split("."): - if "[" in param: - current_module = getattr(current_module, param.split("[")[0])[int(param.split("[")[-1].strip("]"))] - else: - current_module = getattr(current_module, param) - return current_module - - -def get_module_hook(model, representation) -> nn.Layer: - """Render the intervening module with a hook.""" - hook_type = "register_forward_post_hook" - parameter_name = f"model.layers[{representation['layer']}]" - module = getattr_for_paddle_module(model, parameter_name) - module_hook = getattr(module, hook_type) - return module_hook - - -class HandlerList: - """General class to set hooks and set off hooks.""" - - def __init__(self, handlers): - self.handlers = handlers - - def __len__(self): - return len(self.handlers) - - def remove(self): - for handler in self.handlers: - handler.remove() - - def extend(self, new_handlers): - self.handlers.extend(new_handlers.handlers) - return self - - -# gather hidden states on intervention locations -def gather_neurons(tensor_input, unit_locations_as_list): - unit_locations = paddle.to_tensor(unit_locations_as_list, place=tensor_input.place) - tensor_output = paddle.take_along_axis( - tensor_input, - axis=1, - indices=unit_locations.reshape([*unit_locations.shape, *(1,) * (len(tensor_input.shape) - 2)]).expand( - [-1, -1, *tensor_input.shape[2:]] - ), - ) - return tensor_output - - -# Replace selected neurons in `tensor_input` by `replacing_tensor_input`. 
-def scatter_neurons( - tensor_input, - replacing_tensor_input, - unit_locations_as_list, -): - unit_locations = paddle.to_tensor( - unit_locations_as_list, - place=tensor_input.place, - ) - - # [1,1,4096] - meta_component = paddle.arange(tensor_input.shape[-1]).unsqueeze(axis=0).unsqueeze(axis=0) - - start_index, end_index = ( - meta_component.min().tolist(), - meta_component.max().tolist() + 1, - ) - # 4096 - # last_dim = meta_component.shape[-1] - # 0, 1, 2, ..., batch_size-1 - _batch_idx = paddle.arange(tensor_input.shape[0]).unsqueeze(1) - tensor_input[_batch_idx, unit_locations, start_index:end_index] = replacing_tensor_input - return tensor_input - - -# do intervention -def do_intervention( - base_representation, - intervention, -): - """Do the actual intervention.""" - # base_representation: hidden states of the selected tokens gathered from the hidden states, f7+l7: batch_size, 14, hidden_size - # intervention: the intervention module - # flatten - # original_base_shape = base_representation.shape - # if len(original_base_shape) == 2 or intervention.keep_last_dim: - # base_representation_f = base_representation - # intervened_representation = intervention( - # base_representation_f, - # ) - intervened_representation = intervention( - base_representation, - ) - return intervened_representation - - -# Introducing corresponding classes based on strings -def get_type_from_string(type_str): - """Helper function to convert string to type""" - # Remove <class ' and '> from the string - type_str = type_str.replace("<class '", "").replace("'>", "") - - # Split the string into module and class name - module_name, class_name = type_str.rsplit(".", 1) - - # Import the module - if not module_name.startswith("paddleformers"): - module_name = f"paddleformers.peft.reft.{module_name}" - module = importlib.import_module(module_name) - - # Get the class - cls = getattr(module, class_name) - - return cls - - -def create_directory(path): - """Create directory if it does not exist""" - if not os.path.exists(path): - os.makedirs(path) - logging.info(f"Directory '{path}' created successfully.") - else: - logging.info(f"Directory '{path}' already exists.") - - -def set_seed(seed: int): - random.seed(seed) - np.random.seed(seed) - paddle.seed(seed) - - -def count_parameters(model): - """Count parameters of a model that require gradients""" - return int(sum(p.numel() for p in model.parameters() if not p.stop_gradient)) - - -@dataclass -class ReftDataCollator(object): - """Collate examples for ReFT.""" - - def __init__(self, data_collator): - self.data_collator = data_collator - - def __call__(self, instances: Sequence[Dict]) -> Dict[str, paddle.Tensor]: - batch_inputs = self.data_collator(instances) - max_seq_length = batch_inputs["input_ids"].shape[-1] - batch_inputs["intervention_locations"] = batch_inputs["intervention_locations"][..., :max_seq_length] - return batch_inputs diff --git a/paddleformers/peft/reft/predict.py b/paddleformers/peft/reft/predict.py deleted file mode 100644 index e0ab29b1d68..00000000000 --- a/paddleformers/peft/reft/predict.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import logging - -import paddle -from paddle.io import DataLoader, Dataset -from tqdm import tqdm - -from ...data import DataCollatorForSeq2Seq -from ...transformers import AutoTokenizer -from .modeling_utils import ReftDataCollator - -device = "gpu" if paddle.is_compiled_with_cuda() else "cpu" - - -def make_data_collator(tokenizer, model, max_length): - data_collator_fn = DataCollatorForSeq2Seq( - tokenizer=tokenizer, - model=model, - label_pad_token_id=-100, - padding="longest", - max_length=max_length, - ) - return ReftDataCollator(data_collator=data_collator_fn) - - -def make_dataloader( - dataset: Dataset, batch_size: int, collate_fn: DataCollatorForSeq2Seq, shuffle: bool -) -> DataLoader: - return DataLoader(dataset, shuffle=shuffle, batch_size=batch_size, collate_fn=collate_fn) - - -def do_predict( - intervenable, - tokenizer: AutoTokenizer, - eval_dataset: Dataset, - batch_size: int = 4, - data_collator=None, - greedy_decoding=True, - temperature=None, - top_p=None, - top_k=None, - max_new_tokens=32, - do_sample=False, - predict_path=None, - num_beams=4, - max_length=2048, -): - # switch the tokenizer mode first for generation tasks - tokenizer.padding_side = "left" # switch padding side for collator - if greedy_decoding: - num_beams = 1 - data_collator = make_data_collator(tokenizer, intervenable.model, max_length) - eval_dataloader = make_dataloader(eval_dataset, batch_size, data_collator, shuffle=False) - generations = [] - eval_iterator = tqdm(eval_dataloader, position=0, leave=True) - with paddle.no_grad(): - for step, inputs in enumerate(eval_iterator): - for k, v in inputs.items(): - if v is not None and isinstance(v, paddle.Tensor): - inputs[k] = v.to(device) - - # [layers, batch_size, positions] - intervention_locations = paddle.transpose(inputs["intervention_locations"], perm=[1, 0, 2]) - # get left padding count, [batch_size], and add to locations - left_padding = (inputs["input_ids"] == tokenizer.bos_token_id).nonzero(as_tuple=True)[1] - - if left_padding.numel() > 0: - left_padding = left_padding.reshape([1, -1, 1]).to(device) # [1, batch_size, 1] - intervention_locations += left_padding - # intervention_locations -= 1 # offset for the sink padding - else: - logging.info("Warning: No BOS token found, skipping left padding adjustment.") - - # repeat each batch by num_beams times in intervention locations - # -> [layers, batch_size * num_beams, positions] - intervention_locations = intervention_locations.repeat_interleave(num_beams, axis=1).tolist() - - # set generation args depending on task - generation_args = { - "base": { - "input_ids": inputs["input_ids"], - "attention_mask": inputs["attention_mask"], - }, - "unit_locations": intervention_locations, - "intervene_on_prompt": True, - "eos_token_id": tokenizer.eos_token_id, - "early_stopping": True, - "max_new_tokens": max_new_tokens, - "do_sample": do_sample, - } - # override generation args if necessary - if temperature is not None: - generation_args["temperature"] = temperature - if top_p is not None: - generation_args["top_p"] = top_p - if top_k is not None: - generation_args["top_k"] = top_k - - # generate with intervention on prompt - _, steered_response = intervenable.generate(**generation_args) - # detokenize in batch - actual_preds = tokenizer.batch_decode(steered_response[0], skip_special_tokens=True) - - for inputs_id, label, pred in zip(inputs["input_ids"], inputs["labels"], 
actual_preds): - filtered_labels = label[label != -100] - generations += [ - { - "src": tokenizer.decode(inputs_id, skip_special_tokens=True), - "trg": tokenizer.decode(filtered_labels, skip_special_tokens=True), - "pred": pred, - } - ] - - if predict_path is not None: - with open(predict_path, "w") as json_file: - json.dump(generations, json_file, indent=4, ensure_ascii=False) - - return generations diff --git a/paddleformers/peft/reft/reft_config.py b/paddleformers/peft/reft/reft_config.py deleted file mode 100644 index fda6d092c7c..00000000000 --- a/paddleformers/peft/reft/reft_config.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os - -from .modeling_utils import get_type_from_string - - -class ReFTConfig: - def __init__( - self, - representations, - intervention_params=None, - position=None, - intervention_types=None, - sorted_keys=None, - intervention_dimensions=None, - **kwargs, - ): - if not isinstance(representations, list): - representations = [representations] - - self.representations = representations - self.intervention_types = intervention_types - overwrite_intervention_types = [] - for reprs in self.representations: - if reprs["intervention"] is not None: - overwrite_intervention_types += [type(reprs["intervention"])] - - self.intervention_types = overwrite_intervention_types - self.sorted_keys = sorted_keys - self.intervention_dimensions = intervention_dimensions - self.intervention_params = intervention_params - self.position = position - - def to_dict(self): - return { - "representations": self.representations, - "intervention_types": self.intervention_types, - "sorted_keys": self.sorted_keys, - } - - @staticmethod - def from_pretrained(load_directory): - saved_config = json.load(open(os.path.join(load_directory, "config.json"), "r")) - for representation, intervention_type in zip( - saved_config["representations"], saved_config["intervention_types"] - ): - representation["intervention"] = get_type_from_string(intervention_type)( - **saved_config["intervention_params"] - ) - reft_config = ReFTConfig( - representations=saved_config["representations"], - intervention_params=saved_config["intervention_params"], - ) - return reft_config - - def save_pretrained(self, save_directory): - config_dict = {} - config_dict["representations"] = [ - { - "layer": repr["layer"], - "component": repr["component"], - "low_rank_dimension": repr["low_rank_dimension"], - } - for repr in self.representations - ] - - config_dict["intervention_params"] = self.intervention_params - config_dict["intervention_types"] = [repr(intervention_type) for intervention_type in self.intervention_types] - config_dict["position"] = self.position - with open(os.path.join(save_directory, "config.json"), "w") as f: - json.dump(config_dict, f, indent=4) diff --git a/paddleformers/peft/reft/reft_model.py b/paddleformers/peft/reft/reft_model.py deleted file mode 100644 index df866e8287a..00000000000 
--- a/paddleformers/peft/reft/reft_model.py +++ /dev/null @@ -1,365 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import json -import logging -import os -import types -from typing import List, Optional - -import paddle -from paddle import nn - -from .modeling_utils import ( - HandlerList, - count_parameters, - create_directory, - do_intervention, - gather_neurons, - get_module_hook, - scatter_neurons, -) -from .reft_config import ReFTConfig - - -class ReFTModel(nn.Layer): - """ - config: ReFTConfig - """ - - def __init__(self, config, model, **kwargs): - super().__init__() - self.config = config - self.intervention_types = config.intervention_types - self.representations = {} - self.interventions = {} - _original_key_order = [] - # for generate - self._key_setter_call_counter = {} - for i, representation in enumerate(config.representations): - _key = f'layer.{representation["layer"]}' - if representation["intervention"] is not None: - intervention = representation["intervention"] - - module_hook = get_module_hook(model, representation) - self.representations[_key] = representation - self.interventions[_key] = (intervention, module_hook) - _original_key_order += [_key] - - # usually, it's a one time call per - # hook unless model generates. 
- self._key_setter_call_counter[_key] = 0 - - self.sorted_keys = _original_key_order - self.model = model - self.model_config = model.config - self.disable_model_gradients() - self.trainable_model_parameters = {} - - def forward( - self, - **base, - ): - unit_locations = base["intervention_locations"].transpose([1, 0, 2]).tolist() - self._reset_hook_count() - try: - # intervene, register hook after decoder block - set_handlers_to_remove = self._wait_for_forward_with_intervention(unit_locations) - # run intervened forward - del base["intervention_locations"] - counterfactual_outputs = self.model(**base) - set_handlers_to_remove.remove() - except Exception as e: - raise e - self._reset_hook_count() - return counterfactual_outputs - - def generate( - self, - base, - unit_locations: Optional[List] = None, - intervene_on_prompt: bool = False, - output_original_output: Optional[bool] = False, - **kwargs, - ): - self._reset_hook_count() - self._intervene_on_prompt = intervene_on_prompt - base_outputs = None - if output_original_output or True: - # returning un-intervened output - base_outputs = self.model.generate(**base, **kwargs) - set_handlers_to_remove = None - try: - # intervene, register hook after decoder block - set_handlers_to_remove = self._wait_for_forward_with_intervention( - unit_locations, - ) - # run intervened generate - counterfactual_outputs = self.model.generate(**base, **kwargs) - set_handlers_to_remove.remove() - except Exception as e: - raise e - self._reset_hook_count() - return base_outputs, counterfactual_outputs - - def _wait_for_forward_with_intervention( - self, - unit_locations, - ): - all_set_handlers = HandlerList([]) - for key_id, key in enumerate(self.sorted_keys): - set_handlers = self._intervention_setter(key, unit_locations[key_id]) - all_set_handlers.extend(set_handlers) - return all_set_handlers - - def _intervention_setter( - self, - key, - unit_locations_base, - ) -> HandlerList: - """ - Create a list of setter handlers that will set activations - """ - handlers = [] - intervention, module_hook = self.interventions[key] - - def hook_callback( - model, - inputs, - outputs, - ): - is_prompt = self._key_setter_call_counter[key] == 0 - if is_prompt: - self._key_setter_call_counter[key] += 1 - if not is_prompt: - return - - selected_output = self._gather_intervention_output(outputs, key, unit_locations_base) - - if not isinstance(self.interventions[key][0], types.FunctionType): - intervened_representation = do_intervention( - selected_output, - intervention, - ) - if intervened_representation is None: - return - - if isinstance(outputs, tuple): - _ = self._scatter_intervention_output( - outputs[0], - intervened_representation, - key, - unit_locations_base, - ) - else: - _ = self._scatter_intervention_output( - outputs, - intervened_representation, - key, - unit_locations_base, - ) - - handlers.append( - module_hook( - hook_callback, - ) - ) - - return HandlerList(handlers) - - def _gather_intervention_output(self, output, representations_key, unit_locations) -> paddle.Tensor: - """ - Gather intervening activations from the output based on indices - """ - if isinstance(output, tuple): - original_output = output[0].clone() - else: - original_output = output.clone() - if unit_locations is None: - return original_output - - # gather based on intervention locations - selected_output = gather_neurons( - original_output, - unit_locations, - ) - return selected_output - - def _scatter_intervention_output( - self, - output, - intervened_representation, - 
representations_key, - unit_locations, - ) -> paddle.Tensor: - """ - Scatter in the intervened activations in the output - """ - # data structure casting - if isinstance(output, tuple): - original_output = output[0] - else: - original_output = output - # for non-sequence-based models, we simply replace - # all the activations. - if unit_locations is None: - original_output[:] = intervened_representation[:] - return original_output - - # component = self.representations[representations_key].component - # unit = self.representations[representations_key].unit - - # scatter in-place - _ = scatter_neurons( - original_output, - intervened_representation, - unit_locations, - ) - - return original_output - - def save_pretrained(self, save_directory, **kwargs): - create_directory(save_directory) - saving_config = copy.deepcopy(self.config) - saving_config.sorted_keys = self.sorted_keys - saving_config.intervention_types = [] - saving_config.intervention_dimensions = [] - - for k, v in self.interventions.items(): - intervention = v[0] - saving_config.intervention_types += [(type(intervention))] - binary_filename = f"intkey_{k}.bin" - # save intervention binary file - logging.info(f"Saving trainable intervention to {binary_filename}.") - paddle.save( - intervention.state_dict(), - os.path.join(save_directory, binary_filename), - ) - - saving_config.save_pretrained(save_directory) - - @staticmethod - def from_pretrained( - load_directory, - model, - ): - """ - Load interventions from disk - """ - reft_config = ReFTConfig.from_pretrained( - load_directory=load_directory, - ) - intervenable = ReFTModel(reft_config, model) - intervenable.disable_model_gradients() - - # load binary files - for i, (k, v) in enumerate(intervenable.interventions.items()): - intervention = v[0] - binary_filename = f"intkey_{k}.bin" - saved_state_dict = paddle.load(os.path.join(load_directory, binary_filename)) - intervention.load_state_dict(saved_state_dict) - return intervenable - - def train(self): - self.model.train() - - def eval(self): - self.model.eval() - - def count_parameters(self, include_model=False): - total_parameters = 0 - for k, v in self.interventions.items(): - total_parameters += count_parameters(v[0]) - if include_model: - total_parameters += sum(p.numel() for p in self.model.parameters() if p.requires_grad) - return total_parameters - - def print_trainable_parameters(self): - trainable_intervention_parameters = 0 - for k, v in self.interventions.items(): - trainable_intervention_parameters += count_parameters(v[0]) - - trainable_model_parameters = int(sum(p.numel() for p in self.model.parameters() if not p.stop_gradient)) - - all_model_parameters = int(sum(p.numel() for p in self.model.parameters())) - - total_trainable_parameters = trainable_intervention_parameters + trainable_model_parameters - - logging.info("trainable_intervention_parameters:", trainable_intervention_parameters) - logging.info("trainable_model_parameters:", trainable_model_parameters) - logging.info("all_model_parameters:", all_model_parameters) - logging.info("total_trainable_parameters:", total_trainable_parameters) - logging.info( - f"trainable intervention params: {trainable_intervention_parameters:,d} || trainable model params: {trainable_model_parameters:,d}\n" - f"model params: {all_model_parameters:,d} || trainable%: {100 * total_trainable_parameters / all_model_parameters}" - ) - - def _reset_hook_count(self): - """ - Reset the hook count before any generate call - """ - self._key_setter_call_counter = 
dict.fromkeys(self._key_setter_call_counter, 0) - - def __str__(self): - attr_dict = { - "model_type": str(self.model_type), - "intervention_types": str(self.intervention_types), - "alignabls": self.sorted_keys, - } - return json.dumps(attr_dict, indent=4) - - def get_trainable_parameters(self): - """ - Return trainable params as key value pairs - """ - ret_params = [] - for k, v in self.interventions.items(): - ret_params += [p for p in v[0].parameters()] - for p in self.model.parameters(): - if p.requires_grad: - ret_params += [p] - return ret_params - - def named_parameters(self, recurse=True, include_sublayers=True): - """ - The above, but for HuggingFace. - """ - ret_params = [] - for k, v in self.interventions.items(): - ret_params += [(k + "." + n, p) for n, p in v[0].named_parameters()] - for n, p in self.model.named_parameters(): - if not p.stop_gradient: - ret_params += [("model." + n, p)] - return ret_params - - def enable_model_gradients(self): - """ - Enable gradient in the model - """ - # Unfreeze all model weights - self.model.train() - for param in self.model.parameters(): - param.stop_gradient = False - self.model_has_grad = True - - def disable_model_gradients(self): - """ - Disable gradient in the model - """ - # Freeze all model weights - self.model.eval() - for param in self.model.parameters(): - param.stop_gradient = True - self.model_has_grad = False diff --git a/paddleformers/peft/vera/__init__.py b/paddleformers/peft/vera/__init__.py deleted file mode 100644 index 2ba6e86f900..00000000000 --- a/paddleformers/peft/vera/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .vera_config import VeRAConfig -from .vera_layers import VeRALinear -from .vera_model import VeRAModel diff --git a/paddleformers/peft/vera/vera_config.py b/paddleformers/peft/vera/vera_config.py deleted file mode 100644 index 76f0d3a73bb..00000000000 --- a/paddleformers/peft/vera/vera_config.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -from dataclasses import asdict, dataclass, field -from typing import List, Optional, Union - -from ...utils.env import VERA_CONFIG_NAME - - -@dataclass -class VeRAConfig: - """ - This is the configuration class to store the configuration of a [`VeRAModel`]. 
- Args: - r (`int`): vera attention dimension - target_modules (`Union[List[str],str]`): The names of the modules to apply vera to. - trainable_modules (`List[str]`): The names of the modules to train when applying vera. - vera_alpha (`float`): The alpha parameter for vera scaling. - vera_dropout (`float`): The dropout probability for vera layers. - """ - - r: int = field(default=8, metadata={"help": "vera attention dimension"}) - target_modules: Optional[Union[List[str], str]] = field( - default=None, - metadata={ - "help": "List of module names or regex expression of the module names to replace with vera." - "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " - }, - ) - trainable_modules: Optional[List[str]] = field( - default=None, - metadata={ - "help": "List of module names or regex expression of the module names to train when applying with vera." - "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " - }, - ) - vera_alpha: int = field(default=8, metadata={"help": "vera alpha"}) - vera_dropout: float = field(default=0.0, metadata={"help": "vera dropout"}) - trainable_bias: Optional[str] = field( - default=None, metadata={"help": "Define trainable bias parameters for the vera model."} - ) - tensor_parallel_degree: int = field(default=-1, metadata={"help": "1 for not use tensor parallel"}) - dtype: Optional[str] = field(default=None, metadata={"help": "The data type of tensor"}) - head_dim: Optional[int] = field( - default=None, - metadata={ - "help": "The model multi head dimension.Only for veraMergedLinear and ColumnParallelveraMergedLinear." - }, - ) - do_qat: bool = field(default=False, metadata={"help": "Whether the vera model would do quant-aware training"}) - base_model_name_or_path: Optional[str] = field( - default=None, metadata={"help": "The name of the base model to use."} - ) - pissa_init: bool = field(default=False, metadata={"help": "Whether the vera weight initialized by pissa"}) - - @property - def __dict__(self): - return asdict(self) - - def to_dict(self): - return self.__dict__ - - def save_pretrained(self, save_directory): - r""" - This method saves the configuration of your adapter model in a directory. - Args: - save_directory (`str`): - The directory where the configuration will be saved. - """ - if os.path.isfile(save_directory): - raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") - - os.makedirs(save_directory, exist_ok=True) - - output_dict = self.__dict__ - output_path = os.path.join(save_directory, VERA_CONFIG_NAME) - - # save it - with open(output_path, "w") as writer: - writer.write(json.dumps(output_dict, indent=2, sort_keys=True)) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" - This method loads the configuration of your adapter model from a directory. - Args: - pretrained_model_name_or_path (`str`): - The directory or the hub-id where the configuration is saved. - **kwargs: - Additional keyword arguments passed along to the child class initialization. 
- """ - if os.path.isfile(os.path.join(pretrained_model_name_or_path, VERA_CONFIG_NAME)): - config_file = os.path.join(pretrained_model_name_or_path, VERA_CONFIG_NAME) - else: - raise ValueError(f"Can't find vera_config.json at '{pretrained_model_name_or_path}'") - - loaded_attributes = cls.from_json_file(config_file) - - config = cls(**kwargs) - - for key, value in loaded_attributes.items(): - if hasattr(config, key): - setattr(config, key, value) - - return config - - @classmethod - def from_json_file(cls, path_json_file): - r""" - Loads a configuration file from a json file. - Args: - path_json_file (`str`): - The path to the json file. - """ - with open(path_json_file, "r") as file: - json_object = json.load(file) - - return json_object diff --git a/paddleformers/peft/vera/vera_layers.py b/paddleformers/peft/vera/vera_layers.py deleted file mode 100644 index 8bf478503ba..00000000000 --- a/paddleformers/peft/vera/vera_layers.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - - -class VeRALinear(nn.Linear): - # VeRA implemented in a dense layer - def __init__( - self, - base_linear_module: paddle.nn.layer.common.Linear, - in_features: int, - out_features: int, - r: int = 0, - vera_alpha: int = 1, - vera_dropout: float = 0.0, - pissa_init: bool = False, - **kwargs - ): - nn.Linear.__init__(self, in_features, out_features, **kwargs) - self.weight.set_value(base_linear_module.weight) - - if not isinstance(r, int) or r <= 0: - raise ValueError("Vora rank r should be a positive integer") - self.r = r - self.vera_alpha = vera_alpha - # Optional dropout - if vera_dropout > 0.0: - self.vera_dropout = nn.Dropout(p=vera_dropout) - else: - self.vera_dropout = lambda x: x - # Mark the weight as unmerged - self.merged = False - - if pissa_init: - assert self.vera_alpha == self.r, "pissa method requires vera_alpha=r, scaling=1" - self.scaling = 1.0 - self.vera_A = self.create_parameter( - shape=[in_features, r], - dtype=self._dtype, - is_bias=False, - ) - self.vera_B = self.create_parameter( - shape=[r, out_features], - dtype=self._dtype, - is_bias=False, - ) - self.pissa_init(r) - - else: - # Actual trainable parameters - self.vera_A = self.create_parameter( - shape=[in_features, r], - dtype=self._dtype, - is_bias=False, - default_initializer=nn.initializer.KaimingUniform( - negative_slope=math.sqrt(5), nonlinearity="leaky_relu" - ), - ) - self.vera_B = self.create_parameter( - shape=[r, out_features], - dtype=self._dtype, - is_bias=False, - default_initializer=nn.initializer.Constant(value=0.0), - ) - self.scaling = self.vera_alpha / self.r - - self.vera_b = self.create_parameter( - shape=[out_features], - dtype=self._dtype, - is_bias=False, - default_initializer=nn.initializer.Constant(value=1.0), - ) - - self.vera_d = self.create_parameter( - shape=[r], - dtype=self._dtype, - is_bias=False, - 
default_initializer=nn.initializer.Constant(value=1.0), - ) - - # Freezing the pre-trained weight matrix and bias vector - self.weight.stop_gradient = True - - def pissa_init(self, r): - weight = self.weight - dtype = weight.dtype - - if dtype != paddle.float32: - weight = weight.astype(paddle.float32) - - U, S, Vh = paddle.linalg.svd(weight.data, full_matrices=False) - - Ur = U[:, :r] - Sr = S[:r] - Vhr = Vh[:r] - - vera_A = Ur @ paddle.diag(paddle.sqrt(Sr)) - vera_B = paddle.diag(paddle.sqrt(Sr)) @ Vhr - - self.vera_A.set_value(vera_A.astype(dtype)) - self.vera_B.set_value(vera_B.astype(dtype)) - res = weight.data - vera_A @ vera_B - weight = res.astype(dtype) - self.weight.set_value(weight) - - def merge(self): - if not self.merged: - diag_b = paddle.diag(self.vera_b) - diag_d = paddle.diag(self.vera_d) - new_weight = self.weight + self.vera_A @ diag_d @ self.vera_B @ diag_b * self.scaling - self.weight.set_value(new_weight) - self.merged = True - - def unmerge(self): - if self.merged: - diag_b = paddle.diag(self.vera_b) - diag_d = paddle.diag(self.vera_d) - new_weight = self.weight - self.vera_A @ diag_d @ self.vera_B @ diag_b * self.scaling - self.weight.set_value(new_weight) - self.merged = False - - def forward(self, input: paddle.Tensor, *args, **kwargs): - result = F.linear(x=input, weight=self.weight, bias=self.bias, name=self.name) - if not self.merged: - # result += (self.vera_dropout(input) @ self.vera_A @ self.vera_B) * self.scaling - diag_b = paddle.diag(self.vera_b) - diag_d = paddle.diag(self.vera_d) - result += (self.vera_dropout(input) @ self.vera_A @ diag_d @ self.vera_B @ diag_b) * self.scaling - return result - - def extra_repr(self): - name = f", name={self.name}" if self.name else "" - return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" diff --git a/paddleformers/peft/vera/vera_model.py b/paddleformers/peft/vera/vera_model.py deleted file mode 100644 index 6c76ec62cf6..00000000000 --- a/paddleformers/peft/vera/vera_model.py +++ /dev/null @@ -1,284 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
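[Reviewer aside on the removed VeRALinear above: its forward pass adds (dropout(x) @ vera_A @ diag(vera_d) @ vera_B @ diag(vera_b)) * scaling on top of the frozen base weight, and merge() folds the same low-rank product into the weight once. The snippet below is a minimal sketch of that equivalence using plain paddle ops; the shapes and values are illustrative assumptions, not taken from the patch.]

import paddle

# Illustrative sizes, assumed for this sketch only.
in_features, out_features, r, alpha = 8, 6, 4, 4
scaling = alpha / r

x = paddle.randn([2, in_features])
weight = paddle.randn([in_features, out_features])  # frozen base weight, paddle Linear layout [in, out]
vera_A = paddle.randn([in_features, r])
vera_B = paddle.randn([r, out_features])
vera_d = paddle.rand([r])             # per-rank scaling vector (trainable in VeRA)
vera_b = paddle.rand([out_features])  # per-output scaling vector (trainable in VeRA)

# Unmerged path, mirroring VeRALinear.forward: base matmul plus the scaled low-rank correction.
unmerged = x @ weight + (x @ vera_A @ paddle.diag(vera_d) @ vera_B @ paddle.diag(vera_b)) * scaling

# Merged path, mirroring VeRALinear.merge: fold the correction into the weight once.
merged_weight = weight + (vera_A @ paddle.diag(vera_d) @ vera_B @ paddle.diag(vera_b)) * scaling
merged = x @ merged_weight

print(bool(paddle.allclose(unmerged, merged, atol=1e-5)))  # True, the two paths agree up to float error

[The same product is subtracted in unmerge(), which is why merging is reversible in the removed code.]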
- -import copy -import os -import re -from collections import OrderedDict -from typing import Dict, Union - -import numpy as np -import paddle -import paddle.nn as nn -from paddle.distributed.fleet.meta_parallel import PipelineLayer - -from ...transformers.model_utils import PretrainedModel, _add_variant, dtype_guard -from ...utils.env import VERA_WEIGHTS_NAME -from ...utils.log import logger -from .vera_config import VeRAConfig -from .vera_layers import VeRALinear - - -class VeRAModel(nn.Layer): - restore_layer_map: Dict[nn.Layer, nn.Layer] = { - VeRALinear: nn.Linear, - } - - def __init__(self, model, vera_config: VeRAConfig) -> None: - super().__init__() - self.quantized = False - self.vera_config = vera_config - if self.vera_config.dtype is None: - self.vera_config.dtype = paddle.get_default_dtype() - with dtype_guard(self.vera_config.dtype): - self.model = self.get_vera_model(model, vera_config) - self.is_pipelinemodel = False - if issubclass(type(self.model), PipelineLayer): - raise NotImplementedError("vera don't support pipeline parallel now") - if vera_config.tensor_parallel_degree > 1: - raise NotImplementedError("vera don't support tensor parallel now") - self.forward = self.model.forward - - @classmethod - def from_pretrained(cls, model, vera_path, **kwargs): - vera_config = kwargs.pop("vera_config", None) - # init vera config & vera model - if not isinstance(vera_config, VeRAConfig): - vera_config = VeRAConfig.from_pretrained(vera_path) - # define a new variable to conserve original vera_config.tensor_parallel_degree value which will update while initializing vera model - vera_config_tensor_parallel_degree = vera_config.tensor_parallel_degree - vera_model = cls(model, vera_config) - - vera_weight_name = VERA_WEIGHTS_NAME - - # load and set vera weight parameter - vera_weight_path = os.path.join(vera_path, vera_weight_name) - logger.info(f"vera weight path is {vera_weight_path}") - if os.path.exists(vera_weight_path): - # load vera weight parameter - logger.info("vera_weight_path existed, loading vera weight parameter") - - vera_state_dict = paddle.load(vera_weight_path, return_numpy=True) - logger.info(f"Loading the VeRA weights from {vera_weight_path}") - - if ( - vera_config_tensor_parallel_degree > 1 - and vera_config_tensor_parallel_degree != model.config.tensor_parallel_degree - ): - raise NotImplementedError( - f"{vera_config_tensor_parallel_degree} is not equal to {model.config.tensor_parallel_degree}. Please merge VeRA weights first." - ) - - # set vera state dict - vera_model.set_state_dict(vera_state_dict) - else: - logger.error(f"VeRA weights not found under {vera_path}, creating VeRA weights from scratch") - - return vera_model - - def set_state_dict(self, state_dict): - import warnings - - warnings.filterwarnings( - action="ignore", message=".*Skip loading for.*", category=Warning, lineno=0, append=False - ) - self.model.set_state_dict(state_dict) - logger.info("Load vera weight successfully") - - def save_pretrained(self, save_directory: str, merge_tensor_parallel: bool = False, **kwargs): - - logger.info("save vera pretrained") - save_model_config = kwargs.get("save_model_config", True) - - if self.is_pipelinemodel: - self.model._single_to_pp_mapping = None - if self.quantized and merge_tensor_parallel and self.vera_config.tensor_parallel_degree > 1: - merge_tensor_parallel = False - logger.warning( - "Quantized strategy does not support merge_tensor_parallel. Set merge_tensor_parallel to False." 
- ) - if self.is_pipelinemodel and merge_tensor_parallel and self.vera_config.tensor_parallel_degree > 1: - merge_tensor_parallel = False - logger.warning( - "Pipeline parallelism does not support merge_tensor_parallel. Set merge_tensor_parallel to False." - ) - - variant = kwargs.get("variant", None) - is_main_process = kwargs.get("is_main_process", paddle.distributed.get_rank() == 0) - - assert not os.path.isfile( - save_directory - ), f"Saving directory ({save_directory}) should be a directory, not a file" - os.makedirs(save_directory, exist_ok=True) - - vera_config_to_save = VeRAConfig(**self.vera_config.to_dict()) - - logger.info(f"vera config to save is {vera_config_to_save}") - - trainable_state_dict = self.get_trainable_state_dict() - - # save vera weight - vera_weight_name = _add_variant(VERA_WEIGHTS_NAME, variant) - weight_filename = os.path.join(save_directory, vera_weight_name) - paddle.save(trainable_state_dict, weight_filename) - - # save vera config - if is_main_process: - vera_config_to_save.save_pretrained(save_directory) - if save_model_config: - model_config_to_save = copy.deepcopy(self.model.config) - if merge_tensor_parallel: - model_config_to_save.tensor_parallel_degree = -1 - model_config_to_save.save_pretrained(save_directory) - - def _find_and_replace_module(self, model, module_name, vera_config, enable_vera): - parent_module = model - attribute_chain = module_name.split(".") - for name in attribute_chain[:-1]: - parent_module = getattr(parent_module, name) - module = getattr(parent_module, attribute_chain[-1]) - vera_module = None - if enable_vera is None: - if isinstance(module, nn.Linear): - vera_module = VeRALinear( - # pass the base linear module - base_linear_module=module, - in_features=module.weight.shape[0], - out_features=module.weight.shape[1], - r=vera_config.r, - vera_alpha=vera_config.vera_alpha, - vera_dropout=vera_config.vera_dropout, - bias_attr=False if module.bias is None else None, - pissa_init=vera_config.pissa_init, - ) - - if vera_module is None: - raise ValueError( - f"VeRA strategy only supports paddle.nn.Linear or paddle.distributed.fleet.meta_parallel.ColumnParallelLinear. 
{module}({module_name}) is not supported。" - ) - - if module.bias is not None: - vera_module.bias = module.bias - - setattr(parent_module, attribute_chain[-1], vera_module) - - def _find_and_restore_module(self, module_name): - parent_module = self.model - attribute_chain = module_name.split(".") - for name in attribute_chain[:-1]: - parent_module = getattr(parent_module, name) - module = getattr(parent_module, attribute_chain[-1]) - original_model_class = self.restore_layer_map[module.__class__] - original_module = original_model_class(in_features=module.weight.shape[0], out_features=module.weight.shape[1]) - original_module.weight = module.weight - if module.bias is not None: - original_module.bias = module.bias - setattr(parent_module, attribute_chain[-1], original_module) - - def get_trainable_state_dict(self): - trainable_state_dict = OrderedDict() - for name, weight in self.model.state_dict().items(): - # get vera parameter - if not weight.stop_gradient: - trainable_state_dict[name] = weight - return trainable_state_dict - - def print_trainable_parameters(self) -> None: - freeze_numel = 0 - trainable_numel = 0 - for _, weight in self.model.state_dict().items(): - if weight.stop_gradient: - freeze_numel += np.prod(weight.shape) - else: - trainable_numel += np.prod(weight.shape) - logger.debug( - f"Frozen parameters: {freeze_numel:.2e} || Trainable parameters:{trainable_numel:.2e} || Total parameters:{freeze_numel+trainable_numel:.2e}|| Trainable:{trainable_numel / (freeze_numel+trainable_numel):.2%}" - ) - - def mark_only_vera_as_trainable(self, notfreezeB=False) -> None: - for _, layer in self.model.named_sublayers(): - if isinstance(layer, VeRALinear): - for name, weight in layer.state_dict().items(): - if self.vera_config.trainable_bias in ["vera", "all"] and "bias" in name: - weight.stop_gradient = False - elif "vera" in name: - # notfreezeB=True, vera_b, vera_d, vera_B is trainable - # notfreezeB=False, vera_b, vera_d is trainable - if "vera_b" in name or "vera_d" in name: - weight.stop_gradient = False - elif "vera_B" in name and notfreezeB: - weight.stop_gradient = False - else: - weight.stop_gradient = True - else: - weight.stop_gradient = True - else: - for name, weight in layer.state_dict().items(): - if self.vera_config.trainable_bias == "all" and "bias" in name: - weight.stop_gradient = False - else: - weight.stop_gradient = True - if self.vera_config.trainable_modules is not None: - for name, weight in self.model.state_dict().items(): - if any( - re.fullmatch(trainable_module, name) for trainable_module in self.vera_config.trainable_modules - ): - weight.stop_gradient = False - - def get_vera_model(self, model: Union[PretrainedModel, nn.Layer], vera_config: VeRAConfig): - - if vera_config.target_modules is None: - return model - elif isinstance(vera_config.target_modules, str): - target_modules = [vera_config.target_modules] - enable_vera_list = [None] - else: - target_modules = vera_config.target_modules - enable_vera_list = [None for _ in range(len(target_modules))] - - for target_module, enable_vera in zip(target_modules, enable_vera_list): - for i in model.named_sublayers(): - module_name = i[0] - if re.fullmatch(target_module, module_name): - self._find_and_replace_module(model, module_name, vera_config, enable_vera) - return model - - def restore_original_model(self): - for layer_name, layer in self.model.named_sublayers(): - if isinstance(layer, VeRALinear): - self._find_and_restore_module(layer_name) - else: - raise NotImplementedError(f"{layer} restoration is not 
supported yet.") - return self.model - - def __getattr__(self, name: str): - """Forward missing attributes to the wrapped module.""" - try: - return super().__getattr__(name) # defer to nn.Layer's logic - except AttributeError: - return getattr(self.model, name) - - def train(self): - self.training = True - self.model.training = True - for layer in self.model.sublayers(): - layer.training = True - layer.train() - - def eval(self): - self.training = False - self.model.training = False - for layer in self.model.sublayers(): - layer.training = False - layer.eval() diff --git a/paddleformers/trainer/trainer_utils.py b/paddleformers/trainer/trainer_utils.py index 42a84635396..42e46183c24 100644 --- a/paddleformers/trainer/trainer_utils.py +++ b/paddleformers/trainer/trainer_utils.py @@ -30,6 +30,7 @@ import random import threading import time +from collections import namedtuple from contextlib import contextmanager from enum import Enum from pathlib import Path @@ -53,7 +54,7 @@ from safetensors.paddle import save_file from transformers.tokenization_utils_base import BatchEncoding -from ..ops import Topology +# from ..ops import Topology from ..trainer.argparser import strtobool from ..transformers.model_utils import _parse_size from ..utils.env import PREFIX_CHECKPOINT_DIR, _re_checkpoint # noqa for compatibility @@ -99,6 +100,74 @@ def log_trainer_start(): os.environ["MAIN_PROCESS_STARTED"] = "1" +GroupInfo = namedtuple("GroupInfo", ["size", "rank", "world"]) + + +class Topology: + def __init__( + self, + device_rank, + world_size, + dp_degree=None, + pp_degree=1, + sharding_degree=1, + mp_degree=1, + sep_degree=1, + order=["dp", "pp", "sharding", "mp", "sep"], + ): + assert set(order) == {"dp", "pp", "sharding", "mp", "sep"}, f"Illegal order : {order}" + self.order = order + + degree_map = { + "dp": dp_degree, + "pp": pp_degree, + "sharding": sharding_degree, + "mp": mp_degree, + "sep": sep_degree, + } + shape = [degree_map[key] for key in self.order] + + arr = np.arange(0, dp_degree * pp_degree * sharding_degree * mp_degree * sep_degree).reshape(shape) + ranks = [rank[0] for rank in np.where(arr == device_rank)] + + self.world = GroupInfo(size=world_size, rank=device_rank, world=list(range(0, world_size))) + worlds = [] + for i in range(len(ranks)): + indexes = tuple(ranks[:i] + [slice(None)] + ranks[(i + 1) :]) + worlds.append(arr[indexes]) + + for i, key in enumerate(self.order): + if key == "dp": + self.dp_info = GroupInfo(size=len(worlds[i]), rank=ranks[i], world=worlds[i].tolist()) + elif key == "pp": + self.pp_info = GroupInfo(size=len(worlds[i]), rank=ranks[i], world=worlds[i].tolist()) + elif key == "sharding": + self.sharding_info = GroupInfo(size=len(worlds[i]), rank=ranks[i], world=worlds[i].tolist()) + elif key == "mp": + self.mp_info = GroupInfo(size=len(worlds[i]), rank=ranks[i], world=worlds[i].tolist()) + elif key == "sep": + self.sep_info = GroupInfo(size=len(worlds[i]), rank=ranks[i], world=worlds[i].tolist()) + + self.is_last = self.pp_info.rank == self.pp_info.size - 1 + + data_arr = np.arange(0, dp_degree * sharding_degree).reshape([dp_degree, sharding_degree]) + for i, key in enumerate(self.order): + if key != "dp" and key != "sharding": + data_arr = np.expand_dims(data_arr, axis=i).repeat(degree_map[key], axis=i) + + self.data_info = GroupInfo( + size=int(self.dp_info.size * self.sharding_info.size), + rank=int(self.dp_info.rank * self.sharding_info.size + self.sharding_info.rank), + world=data_arr.reshape(-1).tolist(), + ) + + assert self.data_info.world[device_rank] 
== self.data_info.rank, "Data rank calculate error!" + self.data_inner_times = self.world.size // self.data_info.size + + def __repr__(self): + return f"dp_info:\n\t {self.dp_info}, \npp_info:\n\t {self.pp_info}, \nsharding_info:\n\t {self.sharding_info}, \nmp_info:\n\t {self.mp_info}, \nsep_info:\n\t {self.sep_info}, \ndata_info:\n\t {self.data_info}, \norder:\n\t {self.order}" + + def _get_distributed_seeds(seed: int = 1234, topo: Topology = None): """ Get the seeds from distributed environment strategy. diff --git a/paddleformers/transformers/__init__.py b/paddleformers/transformers/__init__.py index 1f65543564d..1f4d83668cb 100644 --- a/paddleformers/transformers/__init__.py +++ b/paddleformers/transformers/__init__.py @@ -62,25 +62,6 @@ "dispatching", "MoEFlexTokenLayer", ], - "bert.modeling": [ - "BertForSequenceClassification", - "BertPretrainingHeads", - "BertForMaskedLM", - "BertForPretraining", - "BertPretrainedModel", - "BertForTokenClassification", - "BertForMultipleChoice", - "BertModel", - "BertPretrainingCriterion", - "BertForQuestionAnswering", - ], - "bert.tokenizer": ["BertTokenizer"], - "bert.tokenizer_fast": ["BertTokenizerFast"], - "bert.configuration": [ - "BERT_PRETRAINED_INIT_CONFIGURATION", - "BertConfig", - "BERT_PRETRAINED_RESOURCE_FILES_MAP", - ], "auto.configuration": ["AutoConfig"], "auto.image_processing": ["AutoImageProcessor", "IMAGE_PROCESSOR_MAPPING"], "auto.modeling": [ @@ -195,7 +176,6 @@ "ernie4_5_moe_vl.tokenizer": ["Ernie4_5_VLTokenizer"], "ernie4_5_moe_vl.image_processor": ["Ernie4_5_VLImageProcessor"], "ernie4_5_moe_vl.processor": ["Ernie4_5_VLProcessor"], - "export": ["export_model"], "gpt_oss.configuration": ["GptOssConfig"], "gpt_oss.modeling": ["GptOssModel", "GptOssForCausalLM", "GptOssForCausalLMPipe"], "gemma3_text.configuration": ["Gemma3Config", "Gemma3TextConfig"], @@ -302,7 +282,6 @@ "Qwen3NextForCausalLMPipe", "Qwen3NextPretrainingCriterion", ], - "bert": [], "llama": [], "qwen2": [], "qwen3": [], @@ -351,7 +330,6 @@ from .tensor_parallel_utils import parallel_matmul, fused_head_and_loss_fn from .moe_gate import * from .moe_layer import * - from .export import export_model with suppress(Exception): from paddle.distributed.fleet.utils.sequence_parallel_utils import ( @@ -365,11 +343,6 @@ register_sequence_parallel_allreduce_hooks, ) - # isort: split - from .bert.modeling import * - from .bert.tokenizer import * - from .bert.configuration import * - # isort: split from .auto.configuration import * from .auto.image_processing import * diff --git a/paddleformers/transformers/auto/configuration.py b/paddleformers/transformers/auto/configuration.py index ed5bf5a5dfc..7e2677847f6 100644 --- a/paddleformers/transformers/auto/configuration.py +++ b/paddleformers/transformers/auto/configuration.py @@ -33,8 +33,6 @@ CONFIG_MAPPING_NAMES = OrderedDict( [ - ("bert", "BertConfig"), - ("deepseek_v2", "DeepseekV2Config"), ("deepseek_v3", "DeepseekV3Config"), ("ernie4_5", "Ernie4_5Config"), ("ernie4_5_moe", "Ernie4_5_MoeConfig"), @@ -59,7 +57,6 @@ MODEL_NAMES_MAPPING = OrderedDict( # Base model mapping [ - ("bert", "Bert"), ("deepseek_v2", "DeepseekV2"), ("deepseek_v3", "DeepseekV3"), ("ernie4_5", "Ernie4_5"), diff --git a/paddleformers/transformers/auto/modeling.py b/paddleformers/transformers/auto/modeling.py index 4d740f40d0a..d1b64ded055 100644 --- a/paddleformers/transformers/auto/modeling.py +++ b/paddleformers/transformers/auto/modeling.py @@ -52,8 +52,6 @@ MAPPING_NAMES = OrderedDict( [ - ("Bert", "bert"), - ("DeepseekV2", "deepseek_v2"), 
("DeepseekV3", "deepseek_v3"), ("Ernie4_5", "ernie4_5"), ("Ernie4_5_Moe", "ernie4_5_moe"), diff --git a/paddleformers/transformers/bert/__init__.py b/paddleformers/transformers/bert/__init__.py deleted file mode 100644 index 847c3bdee0f..00000000000 --- a/paddleformers/transformers/bert/__init__.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import sys -from typing import TYPE_CHECKING - -from ...utils.lazy_import import _LazyModule - -import_structure = { - "tokenizer_utils_fast": ["PretrainedTokenizerFast"], - "tokenizer": ["BasicTokenizer", "BertTokenizer", "WordpieceTokenizer"], - "tokenizer_fast": ["BertTokenizerFast"], - "modeling": [ - "BertForSequenceClassification", - "BertPretrainingHeads", - "BertForMaskedLM", - "BertForPretraining", - "BertPretrainedModel", - "BertForTokenClassification", - "BertForMultipleChoice", - "BertModel", - "BertPretrainingCriterion", - "BertForQuestionAnswering", - ], - "configuration": ["BERT_PRETRAINED_INIT_CONFIGURATION", "BertConfig", "BERT_PRETRAINED_RESOURCE_FILES_MAP"], -} - -if TYPE_CHECKING: - from .configuration import * - from .modeling import * - from .tokenizer import * - from .tokenizer_fast import * -else: - sys.modules[__name__] = _LazyModule( - __name__, - globals()["__file__"], - import_structure, - module_spec=__spec__, - ) diff --git a/paddleformers/transformers/bert/configuration.py b/paddleformers/transformers/bert/configuration.py deleted file mode 100644 index 7ebe62fde19..00000000000 --- a/paddleformers/transformers/bert/configuration.py +++ /dev/null @@ -1,407 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" BERT model configuration""" -from __future__ import annotations - -from typing import Dict - -from ..configuration_utils import PretrainedConfig - -__all__ = ["BERT_PRETRAINED_INIT_CONFIGURATION", "BertConfig", "BERT_PRETRAINED_RESOURCE_FILES_MAP"] - -BERT_PRETRAINED_INIT_CONFIGURATION = { - "bert-base-uncased": { - "vocab_size": 30522, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-large-uncased": { - "vocab_size": 30522, - "hidden_size": 1024, - "num_hidden_layers": 24, - "num_attention_heads": 16, - "intermediate_size": 4096, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-base-multilingual-uncased": { - "vocab_size": 105879, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-base-cased": { - "vocab_size": 28996, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-base-chinese": { - "vocab_size": 21128, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-base-multilingual-cased": { - "vocab_size": 119547, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-large-cased": { - "vocab_size": 28996, - "hidden_size": 1024, - "num_hidden_layers": 24, - "num_attention_heads": 16, - "intermediate_size": 4096, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-wwm-chinese": { - "vocab_size": 21128, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "bert-wwm-ext-chinese": { - "vocab_size": 21128, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - 
"pad_token_id": 0, - }, - "macbert-base-chinese": { - "vocab_size": 21128, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "macbert-large-chinese": { - "vocab_size": 21128, - "hidden_size": 1024, - "num_hidden_layers": 24, - "num_attention_heads": 16, - "intermediate_size": 4096, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "simbert-base-chinese": { - "vocab_size": 13685, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "initializer_range": 0.02, - "pad_token_id": 0, - }, - "uer/chinese-roberta-base": { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 768, - "initializer_range": 0.02, - "intermediate_size": 3072, - "max_position_embeddings": 512, - "num_attention_heads": 12, - "num_hidden_layers": 12, - "type_vocab_size": 2, - "vocab_size": 21128, - "pad_token_id": 0, - }, - "uer/chinese-roberta-medium": { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 512, - "initializer_range": 0.02, - "intermediate_size": 2048, - "max_position_embeddings": 512, - "num_attention_heads": 8, - "num_hidden_layers": 8, - "type_vocab_size": 2, - "vocab_size": 21128, - "pad_token_id": 0, - }, - "uer/chinese-roberta-6l-768h": { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 768, - "initializer_range": 0.02, - "intermediate_size": 3072, - "max_position_embeddings": 512, - "num_attention_heads": 12, - "num_hidden_layers": 6, - "type_vocab_size": 2, - "vocab_size": 21128, - "pad_token_id": 0, - }, - "uer/chinese-roberta-small": { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 512, - "initializer_range": 0.02, - "intermediate_size": 2048, - "max_position_embeddings": 512, - "num_attention_heads": 8, - "num_hidden_layers": 4, - "type_vocab_size": 2, - "vocab_size": 21128, - "pad_token_id": 0, - }, - "uer/chinese-roberta-mini": { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 256, - "initializer_range": 0.02, - "intermediate_size": 1024, - "max_position_embeddings": 512, - "num_attention_heads": 4, - "num_hidden_layers": 4, - "type_vocab_size": 2, - "vocab_size": 21128, - "pad_token_id": 0, - }, - "uer/chinese-roberta-tiny": { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 128, - "initializer_range": 0.02, - "intermediate_size": 512, - "max_position_embeddings": 512, - "num_attention_heads": 2, - "num_hidden_layers": 2, - "type_vocab_size": 2, - "vocab_size": 21128, - "pad_token_id": 0, - }, -} - -BERT_PRETRAINED_RESOURCE_FILES_MAP = { - "model_state": { - "bert-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/bert-base-uncased.pdparams", - "bert-large-uncased": 
"https://bj.bcebos.com/paddlenlp/models/transformers/bert-large-uncased.pdparams", - "bert-base-multilingual-uncased": "http://bj.bcebos.com/paddlenlp/models/transformers/bert-base-multilingual-uncased.pdparams", - "bert-base-cased": "http://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-base-cased.pdparams", - "bert-base-chinese": "http://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-base-chinese.pdparams", - "bert-base-multilingual-cased": "http://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-base-multilingual-cased.pdparams", - "bert-large-cased": "http://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-large-cased.pdparams", - "bert-wwm-chinese": "http://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-wwm-chinese.pdparams", - "bert-wwm-ext-chinese": "http://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-wwm-ext-chinese.pdparams", - "macbert-base-chinese": "https://bj.bcebos.com/paddlenlp/models/transformers/macbert/macbert-base-chinese.pdparams", - "macbert-large-chinese": "https://bj.bcebos.com/paddlenlp/models/transformers/macbert/macbert-large-chinese.pdparams", - "simbert-base-chinese": "https://bj.bcebos.com/paddlenlp/models/transformers/simbert/simbert-base-chinese-v1.pdparams", - "uer/chinese-roberta-base": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_base.pdparams", - "uer/chinese-roberta-medium": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_medium.pdparams", - "uer/chinese-roberta-6l-768h": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_6l_768h.pdparams", - "uer/chinese-roberta-small": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_small.pdparams", - "uer/chinese-roberta-mini": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_mini.pdparams", - "uer/chinese-roberta-tiny": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_tiny.pdparams", - } -} - - -class BertConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`BertModel`] or a [`TFBertModel`]. It is used to - instantiate a BERT model according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the BERT - bert-base-uncased architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 30522): - Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. - hidden_size (`int`, *optional*, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - num_hidden_layers (`int`, *optional*, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - intermediate_size (`int`, *optional*, defaults to 3072): - Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. - hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"silu"` and `"gelu_new"` are supported. 
- hidden_dropout_prob (`float`, *optional*, defaults to 0.1): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): - The dropout ratio for the attention probabilities. - max_position_embeddings (`int`, *optional*, defaults to 512): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - classifier_dropout (`float`, *optional*): - The dropout ratio for the classification head. 
- - Examples: - - ```python - >>> from paddleformers.transformers import BertModel, BertConfig - - >>> # Initializing a BERT bert-base-uncased style configuration - >>> configuration = BertConfig() - - >>> # Initializing a model from the bert-base-uncased style configuration - >>> model = BertModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "bert" - attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} - pretrained_init_configuration = BERT_PRETRAINED_INIT_CONFIGURATION - - def __init__( - self, - vocab_size: int = 30522, - hidden_size: int = 768, - num_hidden_layers: int = 12, - num_attention_heads: int = 12, - intermediate_size: int = 3072, - hidden_act: str = "gelu", - hidden_dropout_prob: float = 0.1, - attention_probs_dropout_prob: float = 0.1, - max_position_embeddings: int = 512, - type_vocab_size: int = 16, - initializer_range: float = 0.02, - pad_token_id: int = 0, - pool_act: str = "tanh", - fuse: bool = False, - layer_norm_eps=1e-12, - use_cache=False, - **kwargs - ): - super().__init__(pad_token_id=pad_token_id, **kwargs) - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.pool_act = pool_act - self.fuse = fuse - - self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache diff --git a/paddleformers/transformers/bert/modeling.py b/paddleformers/transformers/bert/modeling.py deleted file mode 100644 index 19403474250..00000000000 --- a/paddleformers/transformers/bert/modeling.py +++ /dev/null @@ -1,1420 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from __future__ import annotations - -import warnings -from typing import Optional, Tuple - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import Tensor -from paddle.nn import Layer - -try: - from paddle.incubate.nn import FusedTransformerEncoderLayer -except ImportError: - FusedTransformerEncoderLayer = None -from dataclasses import dataclass - -from ...utils.converter import StateDictNameMapping, init_name_mappings -from ...utils.env import CONFIG_NAME -from ..model_outputs import ( - BaseModelOutputWithPoolingAndCrossAttentions, - MaskedLMOutput, - ModelOutput, - MultipleChoiceModelOutput, - QuestionAnsweringModelOutput, - SequenceClassifierOutput, - TokenClassifierOutput, -) -from ..model_utils import PretrainedModel, register_base_model -from ..transposed_linear import TransposedLinear -from .configuration import ( - BERT_PRETRAINED_INIT_CONFIGURATION, - BERT_PRETRAINED_RESOURCE_FILES_MAP, - BertConfig, -) - -__all__ = [ - "BertModel", - "BertPretrainedModel", - "BertForPretraining", - "BertPretrainingCriterion", - "BertPretrainingHeads", - "BertForSequenceClassification", - "BertForTokenClassification", - "BertForQuestionAnswering", - "BertForMultipleChoice", - "BertForMaskedLM", -] - - -class BertEmbeddings(Layer): - """ - Include embeddings from word, position and token_type embeddings - """ - - def __init__(self, config: BertConfig): - super(BertEmbeddings, self).__init__() - - self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) - self.layer_norm = nn.LayerNorm(config.hidden_size) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward( - self, - input_ids: Tensor, - token_type_ids: Optional[Tensor] = None, - position_ids: Optional[Tensor] = None, - past_key_values_length: Optional[int] = None, - ): - - if position_ids is None: - ones = paddle.ones_like(input_ids, dtype="int64") - seq_length = paddle.cumsum(ones, axis=-1) - - position_ids = seq_length - ones - if past_key_values_length is not None: - position_ids += past_key_values_length - position_ids.stop_gradient = True - if token_type_ids is None: - token_type_ids = paddle.zeros_like(input_ids, dtype="int64") - - input_embedings = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = input_embedings + position_embeddings + token_type_embeddings - embeddings = self.layer_norm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class BertPooler(Layer): - """ - Pool the result of BertEncoder. - """ - - def __init__(self, config: BertConfig): - """init the bert pooler with config & args/kwargs - - Args: - config (BertConfig): BertConfig instance. Defaults to None. - """ - super(BertPooler, self).__init__() - - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - self.pool_act = config.pool_act - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. 
- first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - if self.pool_act == "tanh": - pooled_output = self.activation(pooled_output) - return pooled_output - - -class BertPretrainedModel(PretrainedModel): - """ - An abstract class for pretrained BERT models. It provides BERT related - `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, - `pretrained_init_configuration`, `base_model_prefix` for downloading and - loading pretrained models. - See :class:`~paddleformers.transformers.model_utils.PretrainedModel` for more details. - """ - - model_config_file = CONFIG_NAME - config_class = BertConfig - resource_files_names = {"model_state": "model_state.pdparams"} - base_model_prefix = "bert" - - pretrained_init_configuration = BERT_PRETRAINED_INIT_CONFIGURATION - pretrained_resource_files_map = BERT_PRETRAINED_RESOURCE_FILES_MAP - - @classmethod - def _get_name_mappings(cls, config: BertConfig) -> list[StateDictNameMapping]: - mappings: list[StateDictNameMapping] = [] - model_mappings = [ - "embeddings.word_embeddings.weight", - "embeddings.position_embeddings.weight", - "embeddings.token_type_embeddings.weight", - ["embeddings.LayerNorm.weight", "embeddings.layer_norm.weight"], - ["embeddings.LayerNorm.bias", "embeddings.layer_norm.bias"], - ["pooler.dense.weight", None, "transpose"], - "pooler.dense.bias", - # for TokenClassification - ] - for layer_index in range(config.num_hidden_layers): - layer_mappings = [ - [ - f"encoder.layer.{layer_index}.attention.self.query.weight", - f"encoder.layers.{layer_index}.self_attn.q_proj.weight", - "transpose", - ], - [ - f"encoder.layer.{layer_index}.attention.self.query.bias", - f"encoder.layers.{layer_index}.self_attn.q_proj.bias", - ], - [ - f"encoder.layer.{layer_index}.attention.self.key.weight", - f"encoder.layers.{layer_index}.self_attn.k_proj.weight", - "transpose", - ], - [ - f"encoder.layer.{layer_index}.attention.self.key.bias", - f"encoder.layers.{layer_index}.self_attn.k_proj.bias", - ], - [ - f"encoder.layer.{layer_index}.attention.self.value.weight", - f"encoder.layers.{layer_index}.self_attn.v_proj.weight", - "transpose", - ], - [ - f"encoder.layer.{layer_index}.attention.self.value.bias", - f"encoder.layers.{layer_index}.self_attn.v_proj.bias", - ], - [ - f"encoder.layer.{layer_index}.attention.output.dense.weight", - f"encoder.layers.{layer_index}.self_attn.out_proj.weight", - "transpose", - ], - [ - f"encoder.layer.{layer_index}.attention.output.dense.bias", - f"encoder.layers.{layer_index}.self_attn.out_proj.bias", - ], - [ - f"encoder.layer.{layer_index}.intermediate.dense.weight", - f"encoder.layers.{layer_index}.linear1.weight", - "transpose", - ], - [f"encoder.layer.{layer_index}.intermediate.dense.bias", f"encoder.layers.{layer_index}.linear1.bias"], - [ - f"encoder.layer.{layer_index}.attention.output.LayerNorm.weight", - f"encoder.layers.{layer_index}.norm1.weight", - ], - [ - f"encoder.layer.{layer_index}.attention.output.LayerNorm.bias", - f"encoder.layers.{layer_index}.norm1.bias", - ], - [ - f"encoder.layer.{layer_index}.output.dense.weight", - f"encoder.layers.{layer_index}.linear2.weight", - "transpose", - ], - [f"encoder.layer.{layer_index}.output.dense.bias", f"encoder.layers.{layer_index}.linear2.bias"], - [f"encoder.layer.{layer_index}.output.LayerNorm.weight", f"encoder.layers.{layer_index}.norm2.weight"], - [f"encoder.layer.{layer_index}.output.LayerNorm.bias", f"encoder.layers.{layer_index}.norm2.bias"], - ] - model_mappings.extend(layer_mappings) - - 
init_name_mappings(model_mappings) - - # base-model prefix "BertModel" - if "BertModel" not in config.architectures: - for mapping in model_mappings: - mapping[0] = "bert." + mapping[0] - mapping[1] = "bert." + mapping[1] - - # downstream mappings - if "BertForQuestionAnswering" in config.architectures: - model_mappings.extend( - [["qa_outputs.weight", "classifier.weight", "transpose"], ["qa_outputs.bias", "classifier.bias"]] - ) - if ( - "BertForMultipleChoice" in config.architectures - or "BertForSequenceClassification" in config.architectures - or "BertForTokenClassification" in config.architectures - ): - model_mappings.extend([["classifier.weight", "classifier.weight", "transpose"]]) - - mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] - return mappings - - def _init_weights(self, layer): - """Initialization hook""" - if isinstance(layer, (nn.Linear, nn.Embedding)): - # In the dygraph mode, use the `set_value` to reset the parameter directly, - # and reset the `state_dict` to update parameter in static mode. - if isinstance(layer.weight, paddle.Tensor): - layer.weight.set_value( - paddle.tensor.normal( - mean=0.0, - std=self.config.initializer_range, - shape=layer.weight.shape, - ) - ) - - elif isinstance(layer, nn.LayerNorm): - layer._epsilon = self.config.layer_norm_eps - - -@register_base_model -class BertModel(BertPretrainedModel): - """ - The bare BERT Model transformer outputting raw hidden-states. - - This model inherits from :class:`~paddleformers.transformers.model_utils.PretrainedModel`. - Refer to the superclass documentation for the generic methods. - - This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer - and refer to the Paddle documentation for all matter related to general usage and behavior. - - Args: - config (:class:`BertConfig`): - An instance of BertConfig used to construct BertModel. - """ - - def __init__(self, config: BertConfig): - super(BertModel, self).__init__(config) - - self.pad_token_id = config.pad_token_id - self.initializer_range = config.initializer_range - self.embeddings = BertEmbeddings(config) - if config.fuse and FusedTransformerEncoderLayer is None: - warnings.warn( - "FusedTransformerEncoderLayer is not supported by the running Paddle. " - "The flag fuse_transformer will be ignored. 
Try Paddle >= 2.3.0" - ) - self.fuse = config.fuse and FusedTransformerEncoderLayer is not None - if self.fuse: - self.encoder = nn.LayerList( - [ - FusedTransformerEncoderLayer( - config.hidden_size, - config.num_attention_heads, - config.intermediate_size, - dropout_rate=config.hidden_dropout_prob, - activation=config.hidden_act, - attn_dropout_rate=config.attention_probs_dropout_prob, - act_dropout_rate=0.0, - ) - for _ in range(config.num_hidden_layers) - ] - ) - else: - encoder_layer = nn.TransformerEncoderLayer( - config.hidden_size, - config.num_attention_heads, - config.intermediate_size, - dropout=config.hidden_dropout_prob, - activation=config.hidden_act, - attn_dropout=config.attention_probs_dropout_prob, - act_dropout=0, - ) - self.encoder = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) - self.pooler = BertPooler(config) - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def forward( - self, - input_ids: Tensor, - token_type_ids: Optional[Tensor] = None, - position_ids: Optional[Tensor] = None, - attention_mask: Optional[Tensor] = None, - past_key_values: Optional[Tuple[Tuple[Tensor]]] = None, - use_cache: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - r""" - The BertModel forward method, overrides the `__call__()` special method. - - Args: - input_ids (Tensor): - Indices of input sequence tokens in the vocabulary. They are - numerical representations of tokens that build the input sequence. - Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. - token_type_ids (Tensor, optional): - Segment token indices to indicate different portions of the inputs. - Selected in the range ``[0, type_vocab_size - 1]``. - If `type_vocab_size` is 2, which means the inputs have two portions. - Indices can either be 0 or 1: - - - 0 corresponds to a *sentence A* token, - - 1 corresponds to a *sentence B* token. - - Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. - Defaults to `None`, which means we don't add segment embeddings. - position_ids(Tensor, optional): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, - max_position_embeddings - 1]``. - Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. - attention_mask (Tensor, optional): - Mask used in multi-head attention to avoid performing attention on to some unwanted positions, - usually the paddings or the subsequent positions. - Its data type can be int, float and bool. - When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. - When the data type is int, the `masked` tokens have `0` values and the others have `1` values. - When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. - It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. - Defaults to `None`, which means nothing needed to be prevented attention to. - past_key_values (tuple(tuple(Tensor)), optional): - The length of tuple equals to the number of layers, and each inner - tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`) - which contains precomputed key and value hidden states of the attention blocks. 
- If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `input_ids` of shape `(batch_size, sequence_length)`. - use_cache (`bool`, optional): - If set to `True`, `past_key_values` key value states are returned. - Defaults to `None`. - output_hidden_states (bool, optional): - Whether to return the hidden states of all layers. - Defaults to `None`. - output_attentions (bool, optional): - Whether to return the attentions tensors of all attention layers. - Defaults to `None`. - return_dict (bool, optional): - Whether to return a :class:`~paddleformers.transformers.model_outputs.ModelOutput` object. If `False`, the output - will be a tuple of tensors. Defaults to `None`. - - Returns: - An instance of :class:`~paddleformers.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if - `return_dict=True`. Otherwise it returns a tuple of tensors corresponding - to ordered and not None (depending on the input arguments) fields of - :class:`~paddleformers.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. - - Example: - .. code-block:: - - import paddle - from paddleformers.transformers import BertModel, BertTokenizer - - tokenizer = BertTokenizer.from_pretrained('bert-wwm-chinese') - model = BertModel.from_pretrained('bert-wwm-chinese') - - inputs = tokenizer("欢迎使用百度飞桨!") - inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} - output = model(**inputs) - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - use_cache = use_cache if use_cache is not None else self.config.use_cache - - past_key_values_length = None - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - if attention_mask is None: - attention_mask = paddle.unsqueeze( - (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2] - ) - if past_key_values is not None: - batch_size = past_key_values[0][0].shape[0] - past_mask = paddle.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) - attention_mask = paddle.cat([past_mask, attention_mask], axis=-1) - else: - if attention_mask.ndim == 2: - # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length] - attention_mask = attention_mask.unsqueeze(axis=[1, 2]).astype(paddle.get_default_dtype()) - attention_mask = (1.0 - attention_mask) * -1e4 - - embedding_output = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - past_key_values_length=past_key_values_length, - ) - if self.fuse: - assert not output_attentions, "Not support attentions output currently." - assert past_key_values is None, "Not support past_key_values currently." 
- hidden_states = embedding_output - all_hidden_states = [] if output_hidden_states else None - for layer in self.encoder: - hidden_states = layer(hidden_states, attention_mask) - if output_hidden_states: - all_hidden_states.append(hidden_states) - pooled_output = self.pooler(hidden_states) - - if return_dict: - return BaseModelOutputWithPoolingAndCrossAttentions( - last_hidden_state=hidden_states, pooler_output=pooled_output, hidden_states=all_hidden_states - ) - else: - return ( - (hidden_states, pooled_output, all_hidden_states) - if output_hidden_states - else (hidden_states, pooled_output) - ) - else: - self.encoder._use_cache = use_cache # To be consistent with HF - encoder_outputs = self.encoder( - embedding_output, - src_mask=attention_mask, - cache=past_key_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - if isinstance(encoder_outputs, type(embedding_output)): - sequence_output = encoder_outputs - pooled_output = self.pooler(sequence_output) - return (sequence_output, pooled_output) - else: - sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - -class BertForQuestionAnswering(BertPretrainedModel): - """ - Bert Model with a linear layer on top of the hidden-states output to compute `span_start_logits` - and `span_end_logits`, designed for question-answering tasks like SQuAD. - - Args: - config (:class:`BertConfig`): - An instance of BertConfig used to construct BertForQuestionAnswering. - """ - - def __init__(self, config: BertConfig): - super(BertForQuestionAnswering, self).__init__(config) - self.bert = BertModel(config) - self.dropout = nn.Dropout( - config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob - ) - self.classifier = nn.Linear(config.hidden_size, 2) - - def forward( - self, - input_ids: Tensor, - token_type_ids: Optional[Tensor] = None, - position_ids: Optional[Tensor] = None, - attention_mask: Optional[Tensor] = None, - start_positions: Optional[Tensor] = None, - end_positions: Optional[Tensor] = None, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - r""" - The BertForQuestionAnswering forward method, overrides the __call__() special method. - - Args: - input_ids (Tensor): - See :class:`BertModel`. - token_type_ids (Tensor, optional): - See :class:`BertModel`. - position_ids(Tensor, optional): - See :class:`BertModel`. - attention_mask (Tensor, optional): - See :class:`BertModel`. - start_positions (Tensor of shape `(batch_size,)`, optional): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. - end_positions (Tensor of shape `(batch_size,)`, optional): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). 
Position outside of the sequence - are not taken into account for computing the loss. - output_hidden_states (bool, optional): - Whether to return the hidden states of all layers. - Defaults to `None`. - output_attentions (bool, optional): - Whether to return the attentions tensors of all attention layers. - Defaults to `None`. - return_dict (bool, optional): - Whether to return a :class:`~paddleformers.transformers.model_outputs.QuestionAnsweringModelOutput` object. If - `False`, the output will be a tuple of tensors. Defaults to `None`. - - Returns: - An instance of :class:`~paddleformers.transformers.model_outputs.QuestionAnsweringModelOutput` if `return_dict=True`. - Otherwise it returns a tuple of tensors corresponding to ordered and - not None (depending on the input arguments) fields of :class:`~paddleformers.transformers.model_outputs.QuestionAnsweringModelOutput`. - - Example: - .. code-block:: - - import paddle - from paddleformers.transformers.bert.modeling import BertForQuestionAnswering - from paddleformers.transformers.bert.tokenizer import BertTokenizer - - tokenizer = BertTokenizer.from_pretrained('bert-base-cased') - model = BertForQuestionAnswering.from_pretrained('bert-base-cased') - - inputs = tokenizer("Welcome to use PaddlePaddle and PaddleFormers!") - inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} - outputs = model(**inputs) - - start_logits = outputs[0] - end_logits = outputs[1] - """ - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.bert( - input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.classifier(sequence_output) - logits = paddle.transpose(logits, perm=[2, 0, 1]) - start_logits, end_logits = paddle.unstack(x=logits, axis=0) - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if start_positions.ndim > 1: - start_positions = start_positions.squeeze(-1) - if start_positions.ndim > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.shape[1] - start_positions = start_positions.clip(0, ignored_index) - end_positions = end_positions.clip(0, ignored_index) - - loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -class BertForSequenceClassification(BertPretrainedModel): - """ - Bert Model with a linear layer on top of the output layer, - designed for sequence classification/regression tasks like GLUE tasks. - - Args: - config (:class:`BertConfig`): - An instance of BertConfig used to construct BertForSequenceClassification. 
- """ - - def __init__(self, config: BertConfig): - super(BertForSequenceClassification, self).__init__(config) - - self.bert = BertModel(config) - self.num_labels = config.num_labels - self.dropout = nn.Dropout( - config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob - ) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - def forward( - self, - input_ids: Tensor, - token_type_ids: Optional[Tensor] = None, - position_ids: Optional[Tensor] = None, - attention_mask: Optional[Tensor] = None, - labels: Optional[Tensor] = None, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - r""" - The BertForSequenceClassification forward method, overrides the __call__() special method. - - Args: - input_ids (Tensor): - See :class:`BertModel`. - token_type_ids (Tensor, optional): - See :class:`BertModel`. - position_ids(Tensor, optional): - See :class:`BertModel`. - attention_mask (Tensor, optional): - See :class:`BertModel`. - labels (Tensor of shape `(batch_size,)`, optional): - Labels for computing the sequence classification/regression loss. - Indices should be in `[0, ..., num_labels - 1]`. If `num_labels == 1` - a regression loss is computed (Mean-Square loss), If `num_labels > 1` - a classification loss is computed (Cross-Entropy). - output_hidden_states (bool, optional): - Whether to return the hidden states of all layers. - Defaults to `None`. - output_attentions (bool, optional): - Whether to return the attentions tensors of all attention layers. - Defaults to `None`. - return_dict (bool, optional): - Whether to return a :class:`~paddleformers.transformers.model_outputs.SequenceClassifierOutput` object. If - `False`, the output will be a tuple of tensors. Defaults to `None`. - - Returns: - An instance of :class:`~paddleformers.transformers.model_outputs.SequenceClassifierOutput` if `return_dict=True`. - Otherwise it returns a tuple of tensors corresponding to ordered and - not None (depending on the input arguments) fields of :class:`~paddleformers.transformers.model_outputs.SequenceClassifierOutput`. - - Example: - .. 
code-block:: - - import paddle - from paddleformers.transformers.bert.modeling import BertForSequenceClassification - from paddleformers.transformers.bert.tokenizer import BertTokenizer - - tokenizer = BertTokenizer.from_pretrained('bert-base-cased') - model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2) - - inputs = tokenizer("Welcome to use PaddlePaddle and PaddleFormers!") - inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} - - logits = model(**inputs) - print(logits.shape) - # [1, 2] - - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( - input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = paddle.nn.MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = paddle.nn.CrossEntropyLoss() - loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) - elif self.config.problem_type == "multi_label_classification": - loss_fct = paddle.nn.BCEWithLogitsLoss() - loss = loss_fct(logits, labels) - - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) - - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -class BertForTokenClassification(BertPretrainedModel): - """ - Bert Model with a linear layer on top of the hidden-states output layer, - designed for token classification tasks like NER tasks. - - Args: - config (:class:`BertConfig`): - An instance of BertConfig used to construct BertForTokenClassification. - """ - - def __init__(self, config: BertConfig): - super().__init__(config) - - self.bert = BertModel(config) - self.num_labels = config.num_labels - self.dropout = nn.Dropout( - config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob - ) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - def forward( - self, - input_ids: Tensor, - token_type_ids: Optional[Tensor] = None, - position_ids: Optional[Tensor] = None, - attention_mask: Optional[Tensor] = None, - labels: Optional[Tensor] = None, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - r""" - The BertForTokenClassification forward method, overrides the __call__() special method. - - Args: - input_ids (Tensor): - See :class:`BertModel`. - token_type_ids (Tensor, optional): - See :class:`BertModel`. - position_ids(Tensor, optional): - See :class:`BertModel`. - attention_mask (list, optional): - See :class:`BertModel`. 
- labels (Tensor of shape `(batch_size, sequence_length)`, optional): - Labels for computing the token classification loss. Indices should be in `[0, ..., num_labels - 1]`. - output_hidden_states (bool, optional): - Whether to return the hidden states of all layers. - Defaults to `None`. - output_attentions (bool, optional): - Whether to return the attentions tensors of all attention layers. - Defaults to `None`. - return_dict (bool, optional): - Whether to return a :class:`~paddleformers.transformers.model_outputs.TokenClassifierOutput` object. If - `False`, the output will be a tuple of tensors. Defaults to `None`. - - Returns: - An instance of :class:`~paddleformers.transformers.model_outputs.TokenClassifierOutput` if `return_dict=True`. - Otherwise it returns a tuple of tensors corresponding to ordered and - not None (depending on the input arguments) fields of :class:`~paddleformers.transformers.model_outputs.TokenClassifierOutput`. - - Example: - .. code-block:: - - import paddle - from paddleformers.transformers.bert.modeling import BertForTokenClassification - from paddleformers.transformers.bert.tokenizer import BertTokenizer - - tokenizer = BertTokenizer.from_pretrained('bert-base-cased') - model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=2) - - inputs = tokenizer("Welcome to use PaddlePaddle and PaddleFormers!") - inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} - - logits = model(**inputs) - print(logits.shape) - # [1, 13, 2] - - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.bert( - input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - loss_fct = paddle.nn.CrossEntropyLoss() - loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -class BertLMPredictionHead(Layer): - """ - Bert Model with a `language modeling` head on top for CLM fine-tuning. 
- """ - - def __init__(self, config: BertConfig): - super(BertLMPredictionHead, self).__init__() - - self.transform = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = getattr(nn.functional, config.hidden_act) - self.layer_norm = nn.LayerNorm(config.hidden_size) - self.decoder = TransposedLinear(config.hidden_size, config.vocab_size) - # link bias to load pretrained weights - self.decoder_bias = self.decoder.bias - - def forward(self, hidden_states, masked_positions=None): - if masked_positions is not None: - hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) - hidden_states = paddle.tensor.gather(hidden_states, masked_positions) - # gather masked tokens might be more quick - hidden_states = self.transform(hidden_states) - hidden_states = self.activation(hidden_states) - hidden_states = self.layer_norm(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -class BertPretrainingHeads(Layer): - """ - Perform language modeling task and next sentence classification task. - - Args: - config (:class:`BertConfig`): - An instance of BertConfig used to construct BertForPretraining. - - """ - - def __init__(self, config: BertConfig): - super(BertPretrainingHeads, self).__init__() - self.predictions = BertLMPredictionHead(config) - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, sequence_output, pooled_output, masked_positions=None): - """ - Args: - sequence_output(Tensor): - Sequence of hidden-states at the last layer of the model. - It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. - pooled_output(Tensor): - The output of first token (`[CLS]`) in sequence. - We "pool" the model by simply taking the hidden state corresponding to the first token. - Its data type should be float32 and its shape is [batch_size, hidden_size]. - masked_positions(Tensor, optional): - A tensor indicates positions to be masked in the position embedding. - Its data type should be int64 and its shape is [batch_size, mask_token_num]. - `mask_token_num` is the number of masked tokens. It should be no bigger than `sequence_length`. - Defaults to `None`, which means we output hidden-states of all tokens in masked token prediction. - - Returns: - tuple: Returns tuple (``prediction_scores``, ``seq_relationship_score``). - - With the fields: - - - `prediction_scores` (Tensor): - The scores of masked token prediction. Its data type should be float32. - If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. - Otherwise, its shape is [batch_size, mask_token_num, vocab_size]. - - - `seq_relationship_score` (Tensor): - The scores of next sentence prediction. - Its data type should be float32 and its shape is [batch_size, 2]. - - """ - prediction_scores = self.predictions(sequence_output, masked_positions) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - - -@dataclass -class BertForPreTrainingOutput(ModelOutput): - """ - Output type of [`BertForPreTraining`]. - - Args: - loss (*optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): - Total loss as the sum of the masked language modeling loss and the next sequence prediction - (classification) loss. - prediction_logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
- seq_relationship_logits (`paddle.Tensor` of shape `(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation - before SoftMax). - hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - loss: Optional[paddle.Tensor] = None - prediction_logits: paddle.Tensor = None - seq_relationship_logits: paddle.Tensor = None - hidden_states: Optional[Tuple[paddle.Tensor]] = None - attentions: Optional[Tuple[paddle.Tensor]] = None - - -class BertForPretraining(BertPretrainedModel): - """ - Bert Model with pretraining tasks on top. - - Args: - config (:class:`BertConfig`): - An instance of BertConfig used to construct BertForPretraining. - - """ - - def __init__(self, config: BertConfig): - super(BertForPretraining, self).__init__(config) - self.bert = BertModel(config) - self.cls = BertPretrainingHeads(config) - self.tie_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def forward( - self, - input_ids: Tensor, - token_type_ids: Optional[Tensor] = None, - position_ids: Optional[Tensor] = None, - attention_mask: Optional[Tensor] = None, - masked_positions: Optional[Tensor] = None, - labels: Optional[Tensor] = None, - next_sentence_label: Optional[Tensor] = None, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - r""" - - Args: - input_ids (Tensor): - See :class:`BertModel`. - token_type_ids (Tensor, optional): - See :class:`BertModel`. - position_ids (Tensor, optional): - See :class:`BertModel`. - attention_mask (Tensor, optional): - See :class:`BertModel`. - masked_positions(Tensor, optional): - See :class:`BertPretrainingHeads`. - labels (Tensor of shape `(batch_size, sequence_length)`, optional): - Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., - vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), - the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. - next_sentence_label (Tensor of shape `(batch_size,)`, optional): - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence - pair (see `input_ids` docstring) Indices should be in `[0, 1]`: - - - 0 indicates sequence B is a continuation of sequence A, - - 1 indicates sequence B is a random sequence. - output_hidden_states (bool, optional): - Whether to return the hidden states of all layers. - Defaults to `None`. - output_attentions (bool, optional): - Whether to return the attentions tensors of all attention layers. - Defaults to `None`. - return_dict (bool, optional): - Whether to return a :class:`~paddleformers.transformers.bert.BertForPreTrainingOutput` object. 
If - `False`, the output will be a tuple of tensors. Defaults to `None`. - - Returns: - An instance of :class:`~paddleformers.transformers.bert.BertForPreTrainingOutput` if `return_dict=True`. - Otherwise it returns a tuple of tensors corresponding to ordered and - not None (depending on the input arguments) fields of :class:`~paddleformers.transformers.bert.BertForPreTrainingOutput`. - - """ - with paddle.static.amp.fp16_guard(): - outputs = self.bert( - input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output, pooled_output = outputs[:2] - prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output, masked_positions) - - total_loss = None - if labels is not None and next_sentence_label is not None: - loss_fct = paddle.nn.CrossEntropyLoss() - masked_lm_loss = loss_fct( - prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) - ) - next_sentence_loss = loss_fct( - seq_relationship_score.reshape((-1, 2)), next_sentence_label.reshape((-1,)) - ) - total_loss = masked_lm_loss + next_sentence_loss - if not return_dict: - output = (prediction_scores, seq_relationship_score) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - - return BertForPreTrainingOutput( - loss=total_loss, - prediction_logits=prediction_scores, - seq_relationship_logits=seq_relationship_score, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -class BertPretrainingCriterion(paddle.nn.Layer): - """ - - Args: - vocab_size(int): - Vocabulary size of `inputs_ids` in `BertModel`. Defines the number of different tokens that can - be represented by the `inputs_ids` passed when calling `BertModel`. - - """ - - def __init__(self, vocab_size): - super(BertPretrainingCriterion, self).__init__() - # CrossEntropyLoss is expensive since the inner reshape (copy) - self.loss_fn = paddle.nn.loss.CrossEntropyLoss(ignore_index=-1) - self.vocab_size = vocab_size - - def forward( - self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale - ): - """ - Args: - prediction_scores(Tensor): - The scores of masked token prediction. Its data type should be float32. - If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. - Otherwise, its shape is [batch_size, mask_token_num, vocab_size] - seq_relationship_score(Tensor): - The scores of next sentence prediction. Its data type should be float32 and - its shape is [batch_size, 2] - masked_lm_labels(Tensor): - The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`. - Its data type should be int64. If `masked_positions` is None, its shape is [batch_size, sequence_length, 1]. - Otherwise, its shape is [batch_size, mask_token_num, 1] - next_sentence_labels(Tensor): - The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels` - is equal to `seq_relation_labels`. Its data type should be int64 and - its shape is [batch_size, 1] - masked_lm_scale(Tensor or int): - The scale of masked tokens. Used for the normalization of masked language modeling loss. - If it is a `Tensor`, its data type should be int64 and its shape is equal to `prediction_scores`. 
- - Returns: - Tensor: The pretraining loss, equals to the sum of `masked_lm_loss` plus the mean of `next_sentence_loss`. - Its data type should be float32 and its shape is [1]. - - - """ - with paddle.static.amp.fp16_guard(): - masked_lm_loss = F.cross_entropy(prediction_scores, masked_lm_labels, reduction="none", ignore_index=-1) - masked_lm_loss = masked_lm_loss / masked_lm_scale - next_sentence_loss = F.cross_entropy(seq_relationship_score, next_sentence_labels, reduction="none") - return paddle.sum(masked_lm_loss) + paddle.mean(next_sentence_loss) - - -class BertForMultipleChoice(BertPretrainedModel): - """ - Bert Model with a linear layer on top of the hidden-states output layer, - designed for multiple choice tasks like RocStories/SWAG tasks. - - Args: - config (:class:`BertConfig`): - An instance of BertConfig used to construct BertForMultipleChoice. - - Examples: - >>> model = BertForMultipleChoice(config, dropout=0.1) - >>> # or - >>> config.hidden_dropout_prob = 0.1 - >>> model = BertForMultipleChoice(config) - """ - - def __init__(self, config: BertConfig): - super(BertForMultipleChoice, self).__init__(config) - - self.bert = BertModel(config) - self.num_choices = config.num_choices - self.dropout = nn.Dropout( - config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob - ) - self.classifier = nn.Linear(config.hidden_size, 1) - - def forward( - self, - input_ids: Tensor, - token_type_ids: Optional[Tensor] = None, - position_ids: Optional[Tensor] = None, - attention_mask: Optional[Tensor] = None, - labels: Optional[Tensor] = None, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - r""" - The BertForMultipleChoice forward method, overrides the __call__() special method. - - Args: - input_ids (Tensor): - See :class:`BertModel` and shape as [batch_size, num_choice, sequence_length]. - token_type_ids(Tensor, optional): - See :class:`BertModel` and shape as [batch_size, num_choice, sequence_length]. - position_ids(Tensor, optional): - See :class:`BertModel` and shape as [batch_size, num_choice, sequence_length]. - attention_mask (list, optional): - See :class:`BertModel` and shape as [batch_size, num_choice, sequence_length]. - labels (Tensor of shape `(batch_size, )`, optional): - Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., - num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See - `input_ids` above) - output_hidden_states (bool, optional): - Whether to return the hidden states of all layers. - Defaults to `None`. - output_attentions (bool, optional): - Whether to return the attentions tensors of all attention layers. - Defaults to `None`. - return_dict (bool, optional): - Whether to return a :class:`~paddleformers.transformers.model_outputs.MultipleChoiceModelOutput` object. If - `False`, the output will be a tuple of tensors. Defaults to `None`. - - Returns: - An instance of :class:`~paddleformers.transformers.model_outputs.MultipleChoiceModelOutput` if `return_dict=True`. - Otherwise it returns a tuple of tensors corresponding to ordered and - not None (depending on the input arguments) fields of :class:`~paddleformers.transformers.model_outputs.MultipleChoiceModelOutput`. - - Example: - .. 
code-block:: - - import paddle - from paddleformers.transformers import BertForMultipleChoice, BertTokenizer - from paddleformers.data import Pad, Dict - - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForMultipleChoice.from_pretrained('bert-base-uncased', num_choices=2) - - data = [ - { - "question": "how do you turn on an ipad screen?", - "answer1": "press the volume button.", - "answer2": "press the lock button.", - "label": 1, - }, - { - "question": "how do you indent something?", - "answer1": "leave a space before starting the writing", - "answer2": "press the spacebar", - "label": 0, - }, - ] - - text = [] - text_pair = [] - for d in data: - text.append(d["question"]) - text_pair.append(d["answer1"]) - text.append(d["question"]) - text_pair.append(d["answer2"]) - - inputs = tokenizer(text, text_pair) - batchify_fn = lambda samples, fn=Dict( - { - "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), # input_ids - "token_type_ids": Pad( - axis=0, pad_val=tokenizer.pad_token_type_id - ), # token_type_ids - } - ): fn(samples) - inputs = batchify_fn(inputs) - - reshaped_logits = model( - input_ids=paddle.to_tensor(inputs[0], dtype="int64"), - token_type_ids=paddle.to_tensor(inputs[1], dtype="int64"), - ) - print(reshaped_logits.shape) - # [2, 2] - - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # input_ids: [bs, num_choice, seq_l] - input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] - - if position_ids is not None: - position_ids = position_ids.reshape(shape=(-1, position_ids.shape[-1])) - if token_type_ids is not None: - token_type_ids = token_type_ids.reshape(shape=(-1, token_type_ids.shape[-1])) - - if attention_mask is not None: - attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1])) - - outputs = self.bert( - input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output) - - logits = self.classifier(pooled_output) # logits: (bs*num_choice,1) - reshaped_logits = logits.reshape(shape=(-1, self.num_choices)) # logits: (bs, num_choice) - - loss = None - if labels is not None: - loss_fct = paddle.nn.CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) - - return MultipleChoiceModelOutput( - loss=loss, - logits=reshaped_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -class BertOnlyMLMHead(nn.Layer): - def __init__(self, config: BertConfig): - super().__init__() - self.predictions = BertLMPredictionHead(config=config) - - def forward(self, sequence_output, masked_positions=None): - prediction_scores = self.predictions(sequence_output, masked_positions) - return prediction_scores - - -class BertForMaskedLM(BertPretrainedModel): - """ - Bert Model with a `masked language modeling` head on top. - - Args: - config (:class:`BertConfig`): - An instance of BertConfig used to construct BertForMaskedLM. 
- - """ - - def __init__(self, config: BertConfig): - super(BertForMaskedLM, self).__init__(config) - self.bert = BertModel(config) - - self.cls = BertOnlyMLMHead(config=config) - self.tie_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def forward( - self, - input_ids: Tensor, - token_type_ids: Optional[Tensor] = None, - position_ids: Optional[Tensor] = None, - attention_mask: Optional[Tensor] = None, - masked_positions: Optional[Tensor] = None, - labels: Optional[Tensor] = None, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - r""" - - Args: - input_ids (Tensor): - See :class:`BertModel`. - token_type_ids (Tensor, optional): - See :class:`BertModel`. - position_ids (Tensor, optional): - See :class:`BertModel`. - attention_mask (Tensor, optional): - See :class:`BertModel`. - labels (Tensor of shape `(batch_size, sequence_length)`, optional): - Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., - vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the - loss is only computed for the tokens with labels in `[0, ..., vocab_size]` - output_hidden_states (bool, optional): - Whether to return the hidden states of all layers. - Defaults to `None`. - output_attentions (bool, optional): - Whether to return the attentions tensors of all attention layers. - Defaults to `None`. - return_dict (bool, optional): - Whether to return a :class:`~paddleformers.transformers.model_outputs.MaskedLMOutput` object. If - `False`, the output will be a tuple of tensors. Defaults to `None`. - - Returns: - An instance of :class:`~paddleformers.transformers.model_outputs.MaskedLMOutput` if `return_dict=True`. - Otherwise it returns a tuple of tensors corresponding to ordered and - not None (depending on the input arguments) fields of :class:`~paddleformers.transformers.model_outputs.MaskedLMOutput`. - - Example: - .. 
code-block:: - - import paddle - from paddleformers.transformers import BertForMaskedLM, BertTokenizer - - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForMaskedLM.from_pretrained('bert-base-uncased') - - inputs = tokenizer("Welcome to use PaddlePaddle and PaddleFormers!") - inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} - - logits = model(**inputs) - print(logits.shape) - # [1, 13, 30522] - - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.bert( - input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output, masked_positions=masked_positions) - - masked_lm_loss = None - if labels is not None: - loss_fct = paddle.nn.CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct( - prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) - ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ( - ((masked_lm_loss,) + output) - if masked_lm_loss is not None - else (output[0] if len(output) == 1 else output) - ) - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/paddleformers/transformers/bert/modeling.pyi b/paddleformers/transformers/bert/modeling.pyi deleted file mode 100644 index c1a831a4d58..00000000000 --- a/paddleformers/transformers/bert/modeling.pyi +++ /dev/null @@ -1,347 +0,0 @@ -from typing import Dict, Optional, Tuple, Union, overload - -import paddle -import paddle.nn as nn -from _typeshed import Incomplete -from paddle import Tensor -from paddle.nn import Embedding, Layer, Linear - -from ..model_outputs import ModelOutput -from ..model_utils import PretrainedModel -from .configuration import BertConfig - -class BertEmbeddings(Layer): - word_embeddings: Embedding - position_embeddings: Embedding - token_type_embeddings: Embedding - layer_norm: Layer - dropout: float - def __init__(self, config: BertConfig) -> None: ... - def forward( - self, - input_ids: Tensor, - token_type_ids: Tensor | None = ..., - position_ids: Tensor | None = ..., - past_key_values_length: int = 0, - ): ... - -class BertPooler(Layer): - dense: Linear - activation: Layer - pool_act: Layer - def __init__(self, config: BertConfig) -> None: ... - def forward(self, hidden_states): ... - -class BertPretrainedModel(PretrainedModel): - model_config_file: str - config_class: Incomplete - resource_files_names: Dict[str, str] - base_model_prefix: str - pretrained_init_configuration: Dict[str, dict] - pretrained_resource_files_map: Dict[str, str] - def init_weights(self, layer) -> None: ... - -class BertModel(BertPretrainedModel): - pad_token_id: int - initializer_range: float - embeddings: Embedding - fuse: bool - encoder: nn.TransformerDecoder - pooler: BertPooler - - def __init__(self, config: BertConfig) -> None: ... - def get_input_embeddings(self): ... - def set_input_embeddings(self, value) -> None: ... 
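# Illustrative sketch (toy shapes, not part of the removed sources): the
# BertForMaskedLM.forward above relies on the CrossEntropyLoss convention that
# label -100 is ignored, so the loss only covers the masked positions.
import paddle

vocab_size = 8
prediction_scores = paddle.randn([2, 4, vocab_size])               # [batch, seq_len, vocab]
labels = paddle.to_tensor([[-100, 3, -100, 5], [1, -100, -100, -100]])

loss_fct = paddle.nn.CrossEntropyLoss()                            # default ignore_index is -100
masked_lm_loss = loss_fct(prediction_scores.reshape((-1, vocab_size)), labels.reshape((-1,)))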
- def forward( - self, - input_ids, - token_type_ids: Tensor | None = ..., - position_ids: Tensor | None = ..., - attention_mask: Tensor | None = ..., - past_key_values: Tensor | None = ..., - use_cache: Tensor | None = ..., - output_hidden_states: bool = ..., - output_attentions: bool = ..., - return_dict: bool = ..., - ): ... - @staticmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: str, - cache_dir: str | None = None, - config: Optional[BertConfig] = None, - *args, - **kwargs - ) -> BertModel: ... - -class BertForQuestionAnswering(BertPretrainedModel): - bert: BertModel - dropout: nn.Dropout - classifier: Linear - def __init__(self, config: BertConfig): ... - def forward( - self, - input_ids, - token_type_ids: Tensor | None = ..., - position_ids: Tensor | None = ..., - attention_mask: Tensor | None = ..., - start_positions: Tensor | None = ..., - end_positions: Tensor | None = ..., - output_hidden_states: bool = ..., - output_attentions: bool = ..., - return_dict: bool = ..., - ): ... - def __call__( - self, - input_ids, - token_type_ids: Tensor | None = ..., - position_ids: Tensor | None = ..., - attention_mask: Tensor | None = ..., - start_positions: Tensor | None = ..., - end_positions: Tensor | None = ..., - output_hidden_states: bool = ..., - output_attentions: bool = ..., - return_dict: bool = ..., - ): ... - @staticmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: str, - cache_dir: str | None = None, - classifier_dropout: float | None = None, - config: Optional[BertConfig] = None, - *args, - **kwargs - ) -> BertForQuestionAnswering: ... - -class BertForSequenceClassification(BertPretrainedModel): - bert: BertModel - num_labels: int - dropout: nn.Dropout - classifier: Linear - def __init__(self, config: BertConfig): ... - def forward( - self, - input_ids: Tensor, - token_type_ids: Tensor | None = ..., - position_ids: Tensor | None = ..., - attention_mask: Tensor | None = ..., - labels: Tensor | None = ..., - output_hidden_states: bool = ..., - output_attentions: bool = ..., - return_dict: bool = ..., - ): ... - def __call__( - self, - input_ids: Tensor, - token_type_ids: Tensor | None = ..., - position_ids: Tensor | None = ..., - attention_mask: Tensor | None = ..., - labels: Tensor | None = ..., - output_hidden_states: bool = ..., - output_attentions: bool = ..., - return_dict: bool = ..., - ): ... - @staticmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: str, - cache_dir: str | None = None, - num_labels: int | None = 2, - classifier_dropout: float | None = None, - config: Optional[BertConfig] = None, - *args, - **kwargs - ) -> BertForSequenceClassification: ... - -class BertForTokenClassification(BertPretrainedModel): - bert: BertModel - num_labels: int - dropout: nn.Dropout - classifier: Linear - def __init__(self, config: BertConfig): ... - def forward( - self, - input_ids, - token_type_ids: Tensor | None = ..., - position_ids: Tensor | None = ..., - attention_mask: Tensor | None = ..., - labels: Tensor | None = ..., - output_hidden_states: bool = ..., - output_attentions: bool = ..., - return_dict: bool = ..., - ): ... - def __call__( - self, - input_ids, - token_type_ids: Tensor | None = ..., - position_ids: Tensor | None = ..., - attention_mask: Tensor | None = ..., - labels: Tensor | None = ..., - output_hidden_states: bool = ..., - output_attentions: bool = ..., - return_dict: bool = ..., - ): ... 
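# Generic illustration only (not the removed implementation): a question-answering
# head of this shape typically emits per-token start/end logits, and a span is
# recovered by argmax over each; values below are toy numbers.
import paddle

start_logits = paddle.to_tensor([[0.1, 2.3, 0.2, 0.1, 0.0, 0.0]])  # [batch=1, seq_len=6]
end_logits = paddle.to_tensor([[0.0, 0.1, 0.3, 2.7, 0.1, 0.0]])

start_idx = paddle.argmax(start_logits, axis=-1)[0].item()         # predicted span start
end_idx = paddle.argmax(end_logits, axis=-1)[0].item()             # predicted span end (inclusive)
answer_positions = list(range(start_idx, end_idx + 1))             # token positions of the answer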
- @staticmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: str, - cache_dir: str | None = None, - num_labels: int | None = 2, - classifier_dropout: float | None = None, - config: Optional[BertConfig] = None, - *args, - **kwargs - ) -> BertForTokenClassification: ... - -class BertLMPredictionHead(Layer): - transform: Incomplete - activation: Incomplete - layer_norm: nn.LayerNorm - decoder_weight: paddle.ParamAttr - decoder_bias: paddle.ParamAttr - def __init__(self, config: BertConfig, embedding_weights: Tensor | None = ...) -> None: ... - def forward(self, hidden_states, masked_positions: Tensor | None = ...): ... - -class BertPretrainingHeads(Layer): - predictions: Incomplete - seq_relationship: Incomplete - def __init__(self, config: BertConfig, embedding_weights: Tensor | None = ...) -> None: ... - def forward(self, sequence_output, pooled_output, masked_positions: Tensor | None = ...): ... - -class BertForPreTrainingOutput(ModelOutput): - loss: Optional[paddle.Tensor] - prediction_logits: paddle.Tensor - seq_relationship_logits: paddle.Tensor - hidden_states: Optional[Tuple[paddle.Tensor]] - attentions: Optional[Tuple[paddle.Tensor]] - def __init__( - self, - loss: Tensor | None, - prediction_logits: Tensor | None, - seq_relationship_logits: Tensor | None, - hidden_states: Tensor | None, - attentions: Tensor | None, - ) -> None: ... - -class BertForPretraining(BertPretrainedModel): - bert: BertModel - cls: Incomplete - def __init__(self, config: BertConfig) -> None: ... - def forward( - self, - input_ids, - token_type_ids: Tensor | None = ..., - position_ids: Tensor | None = ..., - attention_mask: Tensor | None = ..., - masked_positions: Tensor | None = ..., - labels: Tensor | None = ..., - next_sentence_label: Tensor | None = ..., - output_hidden_states: bool = ..., - output_attentions: bool = ..., - return_dict: bool = ..., - ): ... - def __call__( - self, - input_ids, - token_type_ids: Tensor | None = ..., - position_ids: Tensor | None = ..., - attention_mask: Tensor | None = ..., - masked_positions: Tensor | None = ..., - labels: Tensor | None = ..., - next_sentence_label: Tensor | None = ..., - output_hidden_states: bool = ..., - output_attentions: bool = ..., - return_dict: bool = ..., - ): ... - @staticmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: str, - cache_dir: str | None = None, - config: Optional[BertConfig] = None, - *args, - **kwargs - ) -> BertForQuestionAnswering: ... - -class BertPretrainingCriterion(paddle.nn.Layer): - loss_fn: nn.Layer - vocab_size: int - def __init__(self, vocab_size) -> None: ... - def forward( - self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale - ): ... - def __call__( - self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale - ): ... - -class BertForMultipleChoice(BertPretrainedModel): - bert: BertModel - num_choices: int - dropout: nn.Dropout - classifier: Linear - @overload - def __init__(self, config: BertConfig) -> None: ... - def forward( - self, - input_ids, - token_type_ids: Tensor | None = ..., - position_ids: Tensor | None = ..., - attention_mask: Tensor | None = ..., - labels: Tensor | None = ..., - output_hidden_states: bool = ..., - output_attentions: bool = ..., - return_dict: bool = ..., - ): ... 
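# Illustrative sketch of the flatten-and-restore trick used by the removed
# BertForMultipleChoice.forward: choices are folded into the batch dimension
# before encoding and unfolded again for classification (toy sizes below).
import paddle

batch_size, num_choices, seq_len, hidden = 2, 2, 5, 8
input_ids = paddle.randint(0, 100, [batch_size, num_choices, seq_len])

flat_input_ids = input_ids.reshape((-1, input_ids.shape[-1]))      # [batch*num_choices, seq_len]
# ... encoder + pooler would map each flattened row to one pooled vector ...
pooled_output = paddle.randn([batch_size * num_choices, hidden])
logits = paddle.nn.Linear(hidden, 1)(pooled_output)                # [batch*num_choices, 1]
reshaped_logits = logits.reshape((-1, num_choices))                # [batch, num_choices]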
- @staticmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: str, - cache_dir: str | None = None, - num_choices: int | None = 2, - classifier_dropout: float | None = None, - config: Optional[BertConfig] = None, - *args, - **kwargs - ) -> BertForMultipleChoice: ... - -class BertOnlyMLMHead(nn.Layer): - predictions: BertLMPredictionHead - def __init__(self, config: BertConfig, embedding_weights: Tensor | None = ...) -> None: ... - def forward(self, sequence_output, masked_positions: Tensor | None = ...): ... - -class BertForMaskedLM(BertPretrainedModel): - bert: BertModel - cls: BertOnlyMLMHead - def __init__(self, config: BertConfig) -> None: ... - def forward( - self, - input_ids, - token_type_ids: Tensor | None = ..., - position_ids: Tensor | None = ..., - attention_mask: Tensor | None = ..., - labels: Tensor | None = ..., - output_hidden_states: bool = ..., - output_attentions: bool = ..., - return_dict: bool = ..., - ): ... - def __call__( - self, - input_ids, - token_type_ids: Tensor | None = ..., - position_ids: Tensor | None = ..., - attention_mask: Tensor | None = ..., - labels: Tensor | None = ..., - output_hidden_states: bool = ..., - output_attentions: bool = ..., - return_dict: bool = ..., - ): ... - @staticmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: str, - cache_dir: str | None = None, - config: Optional[BertConfig] = None, - *args, - **kwargs - ) -> BertForMaskedLM: ... diff --git a/paddleformers/transformers/bert/tokenizer.py b/paddleformers/transformers/bert/tokenizer.py deleted file mode 100644 index 0b59d076bca..00000000000 --- a/paddleformers/transformers/bert/tokenizer.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import transformers as hf - -from ..tokenizer_utils import warp_tokenizer - -BertTokenizer = warp_tokenizer(hf.BertTokenizer) diff --git a/paddleformers/transformers/bert/tokenizer_fast.py b/paddleformers/transformers/bert/tokenizer_fast.py deleted file mode 100644 index 4fd6ac5dfcc..00000000000 --- a/paddleformers/transformers/bert/tokenizer_fast.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
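# The removed tokenizer modules are one-line delegations to the Hugging Face
# classes, so equivalent behaviour remains available from transformers directly.
# Minimal sketch; the checkpoint name is only an example.
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
encoded = tokenizer("Welcome to use PaddlePaddle and PaddleFormers!")
print(encoded["input_ids"])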
-import transformers as hf - -from ..tokenizer_utils import warp_tokenizer - -BertTokenizerFast = warp_tokenizer(hf.BertTokenizerFast) diff --git a/paddleformers/transformers/export.py b/paddleformers/transformers/export.py deleted file mode 100644 index 46c957ab9c1..00000000000 --- a/paddleformers/transformers/export.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import List, Optional, Tuple - -import paddle - -from ..utils.log import logger -from .model_utils import PretrainedModel, unwrap_model - -__all__ = ["export_model"] - - -def export_model( - model: "PretrainedModel", input_spec=None, path: Optional[str] = None, model_format: Optional[str] = "paddle" -) -> Tuple[List[str], List[str]]: - """ - Export paddle inference model or onnx model. - - Args: - model ([`PretrainedModel`]: - The model to export. - input_spec (paddle.static.InputSpec, optional): - Describes the input of the saved model’s forward method, which can be described - by InputSpec or example Tensor. Default None. - path (Optional[str], optional): - Output dir to save the exported model. Defaults to None. - model_format (Optional[str], optional): - Export model format. There are two options: paddle or onnx, defaults to paddle. - - """ - if path is None: - path = "./" - logger.info("Export path is missing, set default path to current dir.") - - if issubclass(type(model), PretrainedModel): - model = unwrap_model(model) - model.eval() - - model_format = model_format.lower() - file_prefix = "model" - if model_format == "paddle": - # Convert to static graph with specific input description - model = paddle.jit.to_static(model, input_spec=input_spec) - # Save in static graph model. - save_path = os.path.join(path, file_prefix) - logger.info("Exporting inference model to %s" % save_path) - paddle.jit.save(model, save_path) - logger.info("Inference model exported.") - elif model_format == "onnx": - # Export ONNX model. - save_path = os.path.join(path, file_prefix) - logger.info("Exporting ONNX model to %s" % save_path) - paddle.onnx.export(model, save_path, input_spec=input_spec) - logger.info("ONNX model exported.") - else: - logger.info("This export format is not supported, please select paddle or onnx!") diff --git a/paddleformers/transformers/feature_extraction_sequence_utils.py b/paddleformers/transformers/feature_extraction_sequence_utils.py deleted file mode 100644 index f02a0e07b4a..00000000000 --- a/paddleformers/transformers/feature_extraction_sequence_utils.py +++ /dev/null @@ -1,365 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" - Sequence feature extraction class for common feature extractors to preprocess sequences. -""" -from typing import Dict, List, Optional, Union - -import numpy as np -import paddle - -from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin -from .tokenizer_utils_base import PaddingStrategy - - -class SequenceFeatureExtractor(FeatureExtractionMixin): - """ - This is a general feature extraction class for speech recognition. - - Args: - feature_size (`int`): - The feature dimension of the extracted features. - sampling_rate (`int`): - The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). - padding_value (`float`): - The value that is used to fill the padding values / vectors. - """ - - def __init__(self, feature_size: int, sampling_rate: int, padding_value: float, **kwargs): - self.feature_size = feature_size - self.sampling_rate = sampling_rate - self.padding_value = padding_value - - self.padding_side = kwargs.pop("padding_side", "right") - self.return_attention_mask = kwargs.pop("return_attention_mask", True) - - super().__init__(**kwargs) - - def pad( - self, - processed_features: Union[ - BatchFeature, - List[BatchFeature], - Dict[str, BatchFeature], - Dict[str, List[BatchFeature]], - List[Dict[str, BatchFeature]], - ], - padding: Union[bool, str, PaddingStrategy] = True, - max_length: Optional[int] = None, - truncation: bool = False, - pad_to_multiple_of: Optional[int] = None, - return_attention_mask: Optional[bool] = None, - return_tensors: Optional[str] = None, - ) -> BatchFeature: - """ - Pad input values / input vectors or a batch of input values / input vectors up to predefined length or to the - max sequence length in the batch. - - Padding side (left/right) padding values are defined at the feature extractor level (with `self.padding_side`, - `self.padding_value`) - - - - If the `processed_features` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the - result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of - PyTorch tensors, you will lose the specific device of your tensors however. - - - - Args: - processed_features ([`BatchFeature`], list of [`BatchFeature`], `Dict[str, List[float]]`, `Dict[str, List[List[float]]` or `List[Dict[str, List[float]]]`): - Processed inputs. Can represent one input ([`BatchFeature`] or `Dict[str, List[float]]`) or a batch of - input values / vectors (list of [`BatchFeature`], *Dict[str, List[List[float]]]* or *List[Dict[str, - List[float]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader - collate function. - - Instead of `List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), - see the note above for the return type. 
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: - - - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single - sequence if provided). - - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum - acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding length (see above). - truncation (`bool`): - Activates truncation to cut input sequences longer than `max_length` to `max_length`. - pad_to_multiple_of (`int`, *optional*): - If set will pad the sequence to a multiple of the provided value. - - This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability - `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. - return_attention_mask (`bool`, *optional*): - Whether to return the attention mask. If left to the default, will return the attention mask according - to the specific feature_extractor's default. - - [What are attention masks?](../glossary#attention-mask) - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors instead of list of python integers. Acceptable values are: - - `'pd'`: Return PaddlePaddle `paddle.Tensor` objects. - - `'np'`: Return Numpy `np.ndarray` objects. - """ - # If we have a list of dicts, let's convert it in a dict of lists - # We do this to allow using this method as a collate_fn function in PyTorch Dataloader - if isinstance(processed_features, (list, tuple)) and isinstance(processed_features[0], (dict, BatchFeature)): - processed_features = { - key: [example[key] for example in processed_features] for key in processed_features[0].keys() - } - - # The model's main input name, usually `input_values`, has be passed for padding - if self.model_input_names[0] not in processed_features: - raise ValueError( - "You should supply an instance of `transformers.BatchFeature` or list of `transformers.BatchFeature`" - f" to this method that includes {self.model_input_names[0]}, but you provided" - f" {list(processed_features.keys())}" - ) - - required_input = processed_features[self.model_input_names[0]] - return_attention_mask = ( - return_attention_mask if return_attention_mask is not None else self.return_attention_mask - ) - - if len(required_input) == 0: - if return_attention_mask: - processed_features["attention_mask"] = [] - return processed_features - - # If we have PyTorch/TF tensors or lists as inputs, we cast them as Numpy arrays - # and rebuild them afterwards if no return_tensors is specified - # Note that we lose the specific device the tensor may be on for PyTorch - - first_element = required_input[0] - if isinstance(first_element, (list, tuple)): - # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element. 
- index = 0 - while len(required_input[index]) == 0: - index += 1 - if index < len(required_input): - first_element = required_input[index][0] - - if return_tensors is None: - if isinstance(first_element, paddle.Tensor): - return_tensors = "pd" - elif isinstance(first_element, (int, float, list, tuple, np.ndarray)): - return_tensors = "np" - else: - raise ValueError( - f"type of {first_element} unknown: {type(first_element)}. " - "Should be one of a python, numpy, pytorch or tensorflow object." - ) - - for key, value in processed_features.items(): - if isinstance(value[0], (int, float)): - processed_features[key] = np.array(value) - else: - processed_features[key] = [np.array(v) for v in value] - - # Convert padding_strategy in PaddingStrategy - padding_strategy = self._get_padding_strategies(padding=padding, max_length=max_length) - - required_input = processed_features[self.model_input_names[0]] - - batch_size = len(required_input) - if not all(len(v) == batch_size for v in processed_features.values()): - raise ValueError("Some items in the output dictionary have a different batch size than others.") - - truncated_inputs = [] - for i in range(batch_size): - inputs = {k: v[i] for k, v in processed_features.items()} - # truncation - inputs_slice = self._truncate( - inputs, - max_length=max_length, - pad_to_multiple_of=pad_to_multiple_of, - truncation=truncation, - ) - truncated_inputs.append(inputs_slice) - - if padding_strategy == PaddingStrategy.LONGEST: - # make sure that `max_length` cannot be longer than the longest truncated length - max_length = max(len(input_slice[self.model_input_names[0]]) for input_slice in truncated_inputs) - padding_strategy = PaddingStrategy.MAX_LENGTH - - batch_outputs = {} - for i in range(batch_size): - # padding - outputs = self._pad( - truncated_inputs[i], - max_length=max_length, - padding_strategy=padding_strategy, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - ) - - for key, value in outputs.items(): - if key not in batch_outputs: - batch_outputs[key] = [] - if value.dtype is np.dtype(np.float64): - value = value.astype(np.float32) - batch_outputs[key].append(value) - - return BatchFeature(batch_outputs, tensor_type=return_tensors) - - def _pad( - self, - processed_features: Union[Dict[str, np.ndarray], BatchFeature], - max_length: Optional[int] = None, - padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, - pad_to_multiple_of: Optional[int] = None, - return_attention_mask: Optional[bool] = None, - ) -> dict: - """ - Pad inputs (on left/right and up to predefined length or max length in the batch) - - Args: - processed_features (`Union[Dict[str, np.ndarray], BatchFeature]`): - Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch - of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`) - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding length (see below) - padding_strategy (`PaddingStrategy`, *optional*, default to `PaddingStrategy.DO_NOT_PAD`): - PaddingStrategy to use for padding. 
- - - PaddingStrategy.LONGEST Pad to the longest sequence in the batch - - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) - - PaddingStrategy.DO_NOT_PAD: Do not pad - The feature_extractor padding sides are defined in self.padding_side: - - - 'left': pads on the left of the sequences - - 'right': pads on the right of the sequences - pad_to_multiple_of (`int`, *optional*): - Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to - enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs - which benefit from having sequence lengths be a multiple of 128. - return_attention_mask (`bool`, *optional*): - Set to False to avoid returning attention mask (default: set to model specifics) - """ - required_input = processed_features[self.model_input_names[0]] - - if padding_strategy == PaddingStrategy.LONGEST: - max_length = len(required_input) - - if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): - max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of - - needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) < max_length - - if return_attention_mask and "attention_mask" not in processed_features: - processed_features["attention_mask"] = np.ones(len(required_input), dtype=np.int32) - - if needs_to_be_padded: - difference = max_length - len(required_input) - if self.padding_side == "right": - if return_attention_mask: - processed_features["attention_mask"] = np.pad( - processed_features["attention_mask"], (0, difference) - ) - padding_shape = ((0, difference), (0, 0)) if self.feature_size > 1 else (0, difference) - processed_features[self.model_input_names[0]] = np.pad( - required_input, padding_shape, "constant", constant_values=self.padding_value - ) - elif self.padding_side == "left": - if return_attention_mask: - processed_features["attention_mask"] = np.pad( - processed_features["attention_mask"], (difference, 0) - ) - padding_shape = ((difference, 0), (0, 0)) if self.feature_size > 1 else (difference, 0) - processed_features[self.model_input_names[0]] = np.pad( - required_input, padding_shape, "constant", constant_values=self.padding_value - ) - else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) - - return processed_features - - def _truncate( - self, - processed_features: Union[Dict[str, np.ndarray], BatchFeature], - max_length: Optional[int] = None, - pad_to_multiple_of: Optional[int] = None, - truncation: Optional[bool] = None, - ): - """ - Truncate inputs to predefined length or max length in the batch - - Args: - processed_features(`Union[Dict[str, np.ndarray], BatchFeature]`): - Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch - of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`) - max_length (`int`, *optional*): - maximum length of the returned list and optionally padding length (see below) - pad_to_multiple_of (`int`, *optional*) : - Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to - enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs - which benefit from having sequence lengths be a multiple of 128. - truncation (`bool`, *optional*): - Activates truncation to cut input sequences longer than `max_length` to `max_length`. 
- """ - if not truncation: - return processed_features - elif truncation and max_length is None: - raise ValueError("When setting ``truncation=True``, make sure that ``max_length`` is defined.") - - required_input = processed_features[self.model_input_names[0]] - - # find `max_length` that fits `pad_to_multiple_of` - if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): - max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of - - needs_to_be_truncated = len(required_input) > max_length - - if needs_to_be_truncated: - processed_features[self.model_input_names[0]] = processed_features[self.model_input_names[0]][:max_length] - if "attention_mask" in processed_features: - processed_features["attention_mask"] = processed_features["attention_mask"][:max_length] - - return processed_features - - def _get_padding_strategies(self, padding=False, max_length=None): - """ - Find the correct padding strategy - """ - - # Get padding strategy - if padding is not False: - if padding is True: - padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch - elif not isinstance(padding, PaddingStrategy): - padding_strategy = PaddingStrategy(padding) - elif isinstance(padding, PaddingStrategy): - padding_strategy = padding - else: - padding_strategy = PaddingStrategy.DO_NOT_PAD - - # Set max length if needed - if max_length is None: - if padding_strategy == PaddingStrategy.MAX_LENGTH: - raise ValueError( - f"When setting ``padding={PaddingStrategy.MAX_LENGTH}``, make sure that max_length is defined" - ) - - # Test if we have a padding value - if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.padding_value is None): - raise ValueError( - "Asking to pad but the feature_extractor does not have a padding value. Please select a value to use" - " as `padding_value`. For example: `feature_extractor.padding_value = 0.0`." - ) - - return padding_strategy diff --git a/paddleformers/transformers/long_sequence_strategies/__init__.py b/paddleformers/transformers/long_sequence_strategies/__init__.py deleted file mode 100644 index 115830832a2..00000000000 --- a/paddleformers/transformers/long_sequence_strategies/__init__.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -from typing import TYPE_CHECKING - -from ...utils.lazy_import import _LazyModule - -import_structure = { - "long_sequence_strategies": ["LongSequenceStrategies"], - "attention_strategies": ["AttentionWithLinearBias"], - "embedding_strategies": [ - "RotaryEmbedding", - "LinearScalingRotaryEmbedding", - "NTKScalingRotaryEmbedding", - "DynamicNTKScalingRotaryEmbedding", - "YaRNScalingRotaryEmbedding", - ], -} - -if TYPE_CHECKING: - from .attention_strategies import * - from .embedding_strategies import * - from .long_sequence_strategies import * -else: - sys.modules[__name__] = _LazyModule( - __name__, - globals()["__file__"], - import_structure, - module_spec=__spec__, - ) diff --git a/paddleformers/transformers/long_sequence_strategies/attention_strategies.py b/paddleformers/transformers/long_sequence_strategies/attention_strategies.py deleted file mode 100755 index 3b19d19452e..00000000000 --- a/paddleformers/transformers/long_sequence_strategies/attention_strategies.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math - -import numpy as np -import paddle -from paddle import Tensor, nn - -__all__ = ["AttentionWithLinearBias"] - - -class AttentionWithLinearBias(nn.Layer): - def __init__(self, **init_args): - super().__init__() - - def _get_interleave(self, n): - def _get_interleave_power_of_2(n): - start = 2 ** (-(2 ** -(math.log2(n) - 3))) - return np.array([start * start**i for i in range(n)]).astype(np.float32) - - if math.log2(n).is_integer(): - return _get_interleave_power_of_2(n) - else: - closest_power_of_2 = 2 ** math.floor(math.log2(n)) - return ( - _get_interleave_power_of_2(closest_power_of_2) - + self._get_interleave(2 * closest_power_of_2)[0::2][: n - closest_power_of_2] - ) - - def forward(self, bool_attention_mask: Tensor, num_heads: int, dtype: paddle.dtype): - attention_mask = bool_attention_mask.astype("float32") - batch_size, seq_length = attention_mask.shape[0], attention_mask.shape[-1] - slopes = paddle.to_tensor(self._get_interleave(num_heads), dtype="float32") - with paddle.amp.auto_cast(enable=False): - alibi = slopes.unsqueeze(axis=[1, 2]) * paddle.arange(seq_length, dtype="float32").unsqueeze( - axis=[0, 1] - ).expand([num_heads, -1, -1]) - alibi = alibi.reshape(shape=(1, num_heads, 1, seq_length)).expand([batch_size, -1, -1, -1]) - return paddle.cast(alibi, dtype) diff --git a/paddleformers/transformers/long_sequence_strategies/embedding_strategies.py b/paddleformers/transformers/long_sequence_strategies/embedding_strategies.py deleted file mode 100755 index 2cb82c4dc68..00000000000 --- a/paddleformers/transformers/long_sequence_strategies/embedding_strategies.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
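# Quick numerical check (illustrative only) of the ALiBi slope schedule built by
# AttentionWithLinearBias._get_interleave above: for a power-of-two head count the
# slopes form a geometric sequence with ratio 2 ** (-8 / num_heads).
import math

import numpy as np

num_heads = 8
start = 2 ** (-(2 ** -(math.log2(num_heads) - 3)))                 # 0.5 for num_heads == 8
slopes = np.array([start * start**i for i in range(num_heads)], dtype=np.float32)
# slopes -> [0.5, 0.25, ..., 0.00390625]; each head's slope scales the 0..seq_len-1
# position offsets to form its additive attention bias.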
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math - -import paddle -from paddle import nn - -__all__ = [ - "RotaryEmbedding", - "LinearScalingRotaryEmbedding", - "NTKScalingRotaryEmbedding", - "DynamicNTKScalingRotaryEmbedding", - "YaRNScalingRotaryEmbedding", -] - - -class RotaryEmbedding(nn.Layer): - def __init__(self, **init_args): - super().__init__() - self.dim = init_args["dim"] - self.max_position_embeddings = init_args["max_position_embeddings"] - self.base = init_args["base"] - self.position_encoding_2d = init_args["position_encoding_2d"] if "position_encoding_2d" in init_args else False - if self.position_encoding_2d: - # [dim / 4]# 2D--Embedding - self.dim = self.dim / 2 - inv_freq = 1.0 / ( - self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype=paddle.float32) / self.dim) - ) - else: - # [dim / 2] - inv_freq = 1.0 / ( - self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype=paddle.float32) / self.dim) - ) - self.register_buffer("inv_freq", inv_freq) - self._set_cos_sin_cache(seq_len=self.max_position_embeddings) - - def _set_cos_sin_cache(self, seq_len): - self.max_seq_len_cached = seq_len - # [seq_len] - t = paddle.arange(seq_len, dtype=paddle.float32) - # [seq_len, dim/2] - with paddle.amp.auto_cast(enable=False): - freqs = paddle.outer(t.astype(self.inv_freq.dtype), self.inv_freq) - # [seq_len, dim] - emb = paddle.cat([freqs, freqs], axis=-1) - self.cos_cached = emb.cos()[:, :] - self.sin_cached = emb.sin()[:, :] - - def forward(self, seq_len=None, ntk_alpha=None): - - return self.cos_cached[:, :], self.sin_cached[:, :] - - -class LinearScalingRotaryEmbedding(RotaryEmbedding): - def __init__(self, **init_args): - self.scaling_factor = init_args["scaling_factor"] - super().__init__(**init_args) - - def _set_cos_sin_cache(self, seq_len): - self.max_seq_len_cached = seq_len - # [seq_len] - t = paddle.arange(seq_len, dtype=paddle.float32) - t = t / self.scaling_factor - # [seq_len, dim/2] - with paddle.amp.auto_cast(enable=False): - freqs = paddle.outer(t.astype(self.inv_freq.dtype), self.inv_freq) - # [seq_len, dim] - emb = paddle.cat([freqs, freqs], axis=-1) - self.cos_cached = emb.cos()[:, :] - self.sin_cached = emb.sin()[:, :] - - -class NTKScalingRotaryEmbedding(RotaryEmbedding): - """RotaryEmbedding extended with NTK scaling. https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/""" - - def __init__(self, **init_args): - init_args["base"] = init_args["base"] * init_args["scaling_factor"] ** ( - init_args["dim"] / (init_args["dim"] - 2) - ) - super().__init__(**init_args) - - -class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding): - """RotaryEmbedding extended with Dynamic NTK scaling. 
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/""" - - def __init__(self, **init_args): - self.scaling_factor = init_args["scaling_factor"] - self._seq_len_cached = 0 - super().__init__(**init_args) - - def _scale_cos_sin(self, seq_len, ntk_alpha=None): - # [seq_len] - t = paddle.arange(seq_len, dtype=paddle.float32) - if ntk_alpha is None: - ntk_alpha = (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - base = self.base * ntk_alpha ** (self.dim / (self.dim - 2)) - - # [seq_len, dim/2] - inv_freq = 1.0 / (base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype=paddle.float32) / self.dim)) - with paddle.amp.auto_cast(enable=False): - freqs = paddle.outer(t.astype(inv_freq.dtype), inv_freq) - # [seq_len, dim] - emb = paddle.cat([freqs, freqs], axis=-1) - self.cos_cached = emb.cos()[:, :] - self.sin_cached = emb.sin()[:, :] - - def forward(self, seq_len=None, ntk_alpha=None): - - if seq_len > self.max_position_embeddings: - self._scale_cos_sin(seq_len=seq_len, ntk_alpha=ntk_alpha) - - return self.cos_cached[:, :], self.sin_cached[:, :] - - -class YaRNScalingRotaryEmbedding(nn.Layer): - """RotaryEmbedding extended with YaRN scaling.""" - - def __init__( - self, - dim, - max_position_embeddings=2048, - base=10000, - scaling_factor=1, - original_max_position_embeddings=2048, - extrapolation_factor=1, - attn_factor=1, - beta_fast=32, - beta_slow=1, - ): - super().__init__() - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - self.scaling_factor = scaling_factor # scaling_factor - self.original_max_position_embeddings = original_max_position_embeddings - self.extrapolation_factor = extrapolation_factor - self.attn_factor = attn_factor - self.beta_fast = beta_fast - self.beta_slow = beta_slow - - self.yarn() - - self._set_cos_sin_cache(seq_len=self.max_position_embeddings) - - def _set_cos_sin_cache(self, seq_len): - self.max_seq_len_cached = seq_len - # [seq_len] - t = paddle.arange(seq_len, dtype=paddle.float32) - # [seq_len, dim/2] - with paddle.amp.auto_cast(enable=False): - freqs = paddle.outer(t.astype(self.inv_freq.dtype), self.inv_freq) - # [seq_len, dim] - emb = paddle.cat([freqs, freqs], axis=-1) - self.cos_cached = emb.cos()[:, :] * self.mscale - self.sin_cached = emb.sin()[:, :] * self.mscale - - def _scale_cos_sin(self, seq_len): - self.max_seq_len_cached = seq_len - - t = paddle.arange(self.max_seq_len_cached, dtype=self.inv_freq.dtype) - freqs = paddle.einsum("i,j->ij", t, self.inv_freq) - emb = paddle.cat((freqs, freqs), axis=-1) - - self.cos_cached = emb.cos()[:, :] * self.mscale - self.sin_cached = emb.sin()[:, :] * self.mscale - - def forward(self, seq_len=None, ntk_alpha=None): - if seq_len > self.max_seq_len_cached: - self._scale_cos_sin(seq_len=seq_len) - - return self.cos_cached[:, :], self.sin_cached[:, :] - - def yarn(self): - inv_freq = 1.0 / (self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype=paddle.float32) / self.dim)) - - low, high = self._yarn_find_correction_range( - self.beta_fast, self.beta_slow, self.dim, self.base, self.original_max_position_embeddings - ) - inv_freq_mask = ( - 1 - paddle.cast(self._yarn_linear_ramp_mask(low, high, self.dim // 2), dtype=paddle.float32) - ) * self.extrapolation_factor - - inv_freq = inv_freq / ((1 - inv_freq_mask) * self.scaling_factor + inv_freq_mask) - self.register_buffer("inv_freq", inv_freq) - self.mscale = self._yarn_get_mscale(self.scaling_factor) * self.attn_factor - - 
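# All rotary variants above share one recipe: inverse frequencies 1 / base**(2i/dim),
# an outer product with the position index, then duplicated cos/sin caches; the NTK,
# dynamic-NTK and YaRN classes only rescale `base` or `inv_freq` first. Numpy sketch
# of the base case with toy sizes:
import numpy as np

dim, base, seq_len = 8, 10000.0, 4
inv_freq = 1.0 / (base ** (np.arange(0, dim, 2, dtype=np.float32) / dim))  # [dim/2]
t = np.arange(seq_len, dtype=np.float32)                                   # positions
freqs = np.outer(t, inv_freq)                                              # [seq_len, dim/2]
emb = np.concatenate([freqs, freqs], axis=-1)                              # [seq_len, dim]
cos_cached, sin_cached = np.cos(emb), np.sin(emb)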
@classmethod - def _yarn_find_correction_dim(cls, num_rotations, dim, base=10000, max_position_embeddings=2048): - return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) - - @classmethod - def _yarn_find_correction_range(cls, low_rot, high_rot, dim, base=10000, max_position_embeddings=2048): - low = math.floor(cls._yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)) - high = math.ceil(cls._yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)) - return max(low, 0), min(high, dim - 1) # Clamp values just in case - - @classmethod - def _yarn_linear_ramp_mask(cls, low, high, dim): - if low == high: - high += 0.001 # Prevent singularity - - linear_func = (paddle.arange(dim, dtype=paddle.float32) - low) / (high - low) - ramp_func = paddle.clip(linear_func, 0, 1) - return ramp_func - - @classmethod - def _yarn_get_mscale(cls, scaling_factor=1): - if scaling_factor <= 1: - return 1.0 - return 0.1 * math.log(scaling_factor) + 1.0 diff --git a/paddleformers/transformers/long_sequence_strategies/long_sequence_strategies.py b/paddleformers/transformers/long_sequence_strategies/long_sequence_strategies.py deleted file mode 100644 index a33b981e709..00000000000 --- a/paddleformers/transformers/long_sequence_strategies/long_sequence_strategies.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import importlib - -all_strategy_types = ["embedding_strategies", "attention_strategies"] - - -class LongSequenceStrategies: - @classmethod - def build_long_sequence_strategy(cls, strategy_type=None, stratety_name=None, **init_args): - """ - - **init_args: head_dim, - max_position_embeddings, - rope_scaling_type, - rope_scaling_factor, - ... - - strategy_type: "None" ---------------走原始的built-in模块 - "embedding_strategies"、 - "attention_strategies" - ... - - stratety_name: "RotaryEmbedding"、 - "LinearScalingRotaryEmbedding"、 - "NTKScalingRotaryEmbedding"、 - "DynamicNTKScalingRotaryEmbedding"、 - "AttentionWithLinearBias" - ... - - """ - - """ - paddleformers.transformers.long_sequence_strategies.{strategy_type<->import_class)}.{stratety_name<->strategy_class)} - paddleformers.transformers.long_sequence_strategies.{embedding_strategies}.{RoPE,...} - paddleformers.transformers.long_sequence_strategies.{attention_strategies}.{ALiBi,...} - """ - try: - import_class = importlib.import_module( - f"paddleformers.transformers.long_sequence_strategies.{strategy_type}" - ) - except ModuleNotFoundError: - raise ModuleNotFoundError( - f"Wrong strategy type {strategy_type}. 
module only supports the following types: " - + ", ".join(m for m in all_strategy_types) - ) - try: - strategy_class = getattr(import_class, stratety_name) - except: - all_strategy_classes = import_class.__all__ - raise LookupError( - f"module '{import_class.__name__}' only supports the following classes: " - + ", ".join(m for m in all_strategy_classes) - ) - strategy_instance = strategy_class(**init_args) - return strategy_instance diff --git a/paddleformers/transformers/optimization.py b/paddleformers/transformers/optimization.py index 5c6b10d5100..018c06bff7e 100644 --- a/paddleformers/transformers/optimization.py +++ b/paddleformers/transformers/optimization.py @@ -14,13 +14,9 @@ import math -from paddle.optimizer.lr import LambdaDecay, LRScheduler +from paddle.optimizer.lr import LRScheduler __all__ = [ - "LinearDecayWithWarmup", - "ConstScheduleWithWarmup", - "CosineDecayWithWarmup", - "PolyDecayWithWarmup", "CosineAnnealingWithWarmupDecay", "LinearAnnealingWithWarmupDecay", ] @@ -73,232 +69,3 @@ def get_lr(self): decay_ratio = float(num_step_) / float(decay_step_) coeff = 1.0 - decay_ratio return self.min_lr + coeff * (self.max_lr - self.min_lr) - - -class LinearDecayWithWarmup(LambdaDecay): - """ - Creates a learning rate scheduler, which increases learning rate linearly - from 0 to given `learning_rate`, after this warmup period learning rate - would be decreased linearly from the base learning rate to 0. - - Args: - learning_rate (float): - The base learning rate. It is a python float number. - total_steps (int): - The number of training steps. - warmup (int or float): - If int, it means the number of steps for warmup. If float, it means - the proportion of warmup in total training steps. - last_epoch (int, optional): - The index of last epoch. It can be set to restart training. If - None, it means initial learning rate. - Defaults to -1. - verbose (bool, optional): - If True, prints a message to stdout for each update. - Defaults to False. - - Examples: - - .. code-block:: python - - from paddleformers.transformers import LinearDecayWithWarmup - lr, warmup_steps, max_steps = 0.1, 100, 1000 - lr_scheduler = LinearDecayWithWarmup(lr, max_steps, warmup_steps) - - """ - - def __init__(self, learning_rate, total_steps, warmup, last_epoch=-1, verbose=False): - warmup_steps = warmup if is_integer(warmup) else int(math.floor(warmup * total_steps)) - - def lr_lambda(current_step): - if current_step < warmup_steps: - return float(current_step) / float(max(1, warmup_steps)) - return max(0.0, float(total_steps - current_step) / float(max(1, total_steps - warmup_steps))) - - super(LinearDecayWithWarmup, self).__init__(learning_rate, lr_lambda, last_epoch, verbose) - - -class ConstScheduleWithWarmup(LambdaDecay): - """ - Creates a learning rate scheduler, which increases learning rate linearly - from 0 to given `learning_rate` during warmup periods and keeps learning - rate a constant after that. - - Args: - learning_rate (float): - The base learning rate. It is a python float number. - warmup (int or float): - If int, it means the number of steps for warmup. If float, it means - the proportion of warmup in total training steps. - total_steps (int, optional): - The number of training steps. If `warmup` is a float number, - `total_steps` must be provided. - Defaults to None. - last_epoch (int, optional): - The index of last epoch. It can be set to restart training. If - None, it means initial learning rate. - Defaults to -1. - - Examples: - - .. 
code-block:: python - - from paddleformers.transformers import ConstScheduleWithWarmup - lr, warmup_steps = 0.1, 100 - lr_scheduler = ConstScheduleWithWarmup(lr, warmup_steps) - - """ - - def __init__(self, learning_rate, warmup, total_steps=None, last_epoch=-1, verbose=False): - warmup_steps = warmup if is_integer(warmup) else int(math.floor(warmup * total_steps)) - if is_integer(warmup): - warmup_steps = warmup - elif total_steps: - warmup_steps = int(math.floor(warmup * total_steps)) - else: - raise ValueError( - "Please provide total steps if `warmup` is a float number , or provide integer for argument `warmup`." - ) - - def lr_lambda(current_step): - if current_step < warmup_steps: - return float(current_step) / float(max(1.0, warmup_steps)) - return 1.0 - - super(ConstScheduleWithWarmup, self).__init__(learning_rate, lr_lambda, last_epoch, verbose) - - -class CosineDecayWithWarmup(LambdaDecay): - """ - Creates a learning rate scheduler, which increases learning rate linearly - from 0 to given `learning_rate`, after this warmup period learning rate - would be decreased following the values of the cosine function. If - `with_hard_restarts` is True, the cosine function could have several hard - restarts. - - Args: - learning_rate (float): - The base learning rate. It is a python float number. - total_steps (int): - The number of training steps. - warmup (int or float): - If int, it means the number of steps for warmup. If float, it means - the proportion of warmup in total training steps. - with_hard_restarts (bool): - Whether cosine function has several hard restarts. - Defaults to False. - num_cycles (int or float, optional): - If `with_hard_restarts` is False, it means the number of waves in - cosine scheduler and should be an integer number and defaults to 1. - If `with_hard_restarts` is True, it means the number of hard - restarts to use and should be a float number and defaults to be 0.5. - Defaults to None. - last_epoch (int, optional): - The index of last epoch. It can be set to restart training. If - None, it means initial learning rate. - Defaults to -1. - - Examples: - - .. code-block:: python - - from paddleformers.transformers import CosineDecayWithWarmup - lr, warmup_steps, max_steps = 0.1, 100, 1000 - lr_scheduler = CosineDecayWithWarmup(lr, max_steps, warmup_steps) - - """ - - def __init__( - self, - learning_rate, - total_steps, - warmup, - with_hard_restarts=False, - num_cycles=None, - last_epoch=-1, - verbose=False, - ): - warmup_steps = warmup if is_integer(warmup) else int(math.floor(warmup * total_steps)) - # Input check - if num_cycles is not None: - assert ( - not with_hard_restarts - and isinstance(num_cycles, int) - or with_hard_restarts - and isinstance(num_cycles, float) - ), "`num_circles` should be an integer while `with_hard_restarts` is False, an float while `with_hard_restarts` is True." 
- else: - num_cycles = 1 if not with_hard_restarts else 0.5 - - def lr_lambda(current_step): - if current_step < warmup_steps: - return float(current_step) / float(max(1, warmup_steps)) - - progress = float(current_step - warmup_steps) / float(max(1, total_steps - warmup_steps)) - - if with_hard_restarts: - if progress >= 1.0: - return 0.0 - return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0)))) - - return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) - - super(CosineDecayWithWarmup, self).__init__(learning_rate, lr_lambda, last_epoch, verbose) - - -class PolyDecayWithWarmup(LambdaDecay): - """ - Creates a learning rate scheduler, which increases learning rate linearly - from 0 to given `lr_init`, after this warmup period learning rate would - be decreased as a polynomial decay from the base learning rate to the end - learning rate `lr_end`. - - Args: - learning_rate (float): - The base learning rate. It is a python float number. - total_steps (int): - The number of training steps. - warmup (int or float): - If int, it means the number of steps for warmup. If float, it means - the proportion of warmup in total training steps. - lr_end (float, optional): - The end learning rate. - Defaults to 1e-7. - power (float, optional): - Power factor. - Defaults to 1.0. - last_epoch (int, optional): - The index of last epoch. It can be set to restart training. If - None, it means initial learning rate. - Defaults to -1. - - Examples: - - .. code-block:: python - - from paddleformers.transformers import PolyDecayWithWarmup - lr, lr_end, warmup_steps, max_steps = 0.1, 1e-6, 100, 1000 - lr_scheduler = PolyDecayWithWarmup(lr, max_steps, warmup_steps, lr_end) - - """ - - def __init__(self, learning_rate, total_steps, warmup, lr_end=1e-7, power=1.0, last_epoch=-1, verbose=False): - lr_init = learning_rate - assert ( - lr_init > lr_end - ), f"`lr_end` must be be smaller than `learning_rate`. But `lr_end` is {lr_end} while `learning_rate` is {lr_init}." - warmup_steps = warmup if is_integer(warmup) else int(math.floor(warmup * total_steps)) - - def lr_lambda(current_step): - if current_step < warmup_steps: - return float(current_step) / float(max(1, warmup_steps)) - elif current_step > total_steps: - return lr_end / lr_init # it multiplies by lr_init equals to lr_end - else: - lr_range = lr_init - lr_end - decay_steps = total_steps - warmup_steps - pct_remaining = 1 - (current_step - warmup_steps) / decay_steps - decay = lr_range * pct_remaining**power + lr_end - return decay / lr_init # it multiplies by lr_init equals to decay - - super(PolyDecayWithWarmup, self).__init__(lr_init, lr_lambda, last_epoch, verbose) diff --git a/paddleformers/transformers/qwen/modeling.py b/paddleformers/transformers/qwen/modeling.py index dd06cc1ae61..b765aab0de8 100755 --- a/paddleformers/transformers/qwen/modeling.py +++ b/paddleformers/transformers/qwen/modeling.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import importlib import math import os import warnings @@ -50,7 +51,6 @@ def swiglu(x, y=None): from ...utils.log import logger from .. 
import linear_utils
 from ..linear_utils import Linear
-from ..long_sequence_strategies import LongSequenceStrategies
 from ..model_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ..model_utils import PretrainedModel
 from ..utils import caculate_llm_per_token_flops
@@ -89,6 +89,60 @@ def swiglu(x, y=None):
     fused_rotary_position_embedding = None
 
 
+all_strategy_types = ["embedding_strategies", "attention_strategies"]
+
+
+class LongSequenceStrategies:
+    @classmethod
+    def build_long_sequence_strategy(cls, strategy_type=None, stratety_name=None, **init_args):
+        """
+
+        **init_args: head_dim,
+                     max_position_embeddings,
+                     rope_scaling_type,
+                     rope_scaling_factor,
+                     ...
+
+        strategy_type: "None" --------------- use the original built-in module
+                       "embedding_strategies",
+                       "attention_strategies"
+                       ...
+
+        stratety_name: "RotaryEmbedding",
+                       "LinearScalingRotaryEmbedding",
+                       "NTKScalingRotaryEmbedding",
+                       "DynamicNTKScalingRotaryEmbedding",
+                       "AttentionWithLinearBias"
+                       ...
+
+        """
+
+        """
+        paddleformers.transformers.long_sequence_strategies.{strategy_type<->import_class)}.{stratety_name<->strategy_class)}
+        paddleformers.transformers.long_sequence_strategies.{embedding_strategies}.{RoPE,...}
+        paddleformers.transformers.long_sequence_strategies.{attention_strategies}.{ALiBi,...}
+        """
+        try:
+            import_class = importlib.import_module(
+                f"paddleformers.transformers.long_sequence_strategies.{strategy_type}"
+            )
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError(
+                f"Wrong strategy type {strategy_type}. The module only supports the following types: "
+                + ", ".join(m for m in all_strategy_types)
+            )
+        try:
+            strategy_class = getattr(import_class, stratety_name)
+        except AttributeError:
+            all_strategy_classes = import_class.__all__
+            raise LookupError(
+                f"Module '{import_class.__name__}' only supports the following classes: "
+                + ", ".join(m for m in all_strategy_classes)
+            )
+        strategy_instance = strategy_class(**init_args)
+        return strategy_instance
+
+
 def get_use_casual_mask():
     """Get the value of the 'USE_CASUAL_MASK' environment variable."""
     return os.getenv("USE_CASUAL_MASK", "False") == "True"
diff --git a/paddleformers/transformers/sentencepiece_model_pb2.py b/paddleformers/transformers/sentencepiece_model_pb2.py
deleted file mode 100644
index 2502772a9e0..00000000000
--- a/paddleformers/transformers/sentencepiece_model_pb2.py
+++ /dev/null
@@ -1,1534 +0,0 @@
-# -*- coding: utf-8 -*-
-# Generated by the protocol buffer compiler. DO NOT EDIT!
-# source: sentencepiece_model.proto
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Generated protocol buffer code.""" -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf import reflection as _reflection -from google.protobuf import symbol_database as _symbol_database - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - -DESCRIPTOR = _descriptor.FileDescriptor( - name="sentencepiece_model.proto", - package="sentencepiece", - syntax="proto2", - serialized_options=b"H\003", - create_key=_descriptor._internal_create_key, - serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece"\xdb\x0b\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05\x12\x16\n\tbos_piece\x18. 
\x01(\t:\x03\x12\x17\n\teos_piece\x18/ \x01(\t:\x04\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03', -) - -_TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor( - name="ModelType", - full_name="sentencepiece.TrainerSpec.ModelType", - filename=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - values=[ - _descriptor.EnumValueDescriptor( - name="UNIGRAM", - index=0, - number=1, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="BPE", - index=1, - number=2, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="WORD", - index=2, - number=3, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="CHAR", - index=3, - number=4, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - ], - containing_type=None, - serialized_options=None, - serialized_start=1480, - serialized_end=1533, -) -_sym_db.RegisterEnumDescriptor(_TRAINERSPEC_MODELTYPE) - -_MODELPROTO_SENTENCEPIECE_TYPE = _descriptor.EnumDescriptor( - name="Type", - full_name="sentencepiece.ModelProto.SentencePiece.Type", - filename=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - values=[ - _descriptor.EnumValueDescriptor( - name="NORMAL", - index=0, - number=1, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="UNKNOWN", - index=1, - number=2, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - 
_descriptor.EnumValueDescriptor( - name="CONTROL", - index=2, - number=3, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="USER_DEFINED", - index=3, - number=4, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="BYTE", - index=4, - number=6, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.EnumValueDescriptor( - name="UNUSED", - index=5, - number=5, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key, - ), - ], - containing_type=None, - serialized_options=None, - serialized_start=2286, - serialized_end=2370, -) -_sym_db.RegisterEnumDescriptor(_MODELPROTO_SENTENCEPIECE_TYPE) - -_TRAINERSPEC = _descriptor.Descriptor( - name="TrainerSpec", - full_name="sentencepiece.TrainerSpec", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="input", - full_name="sentencepiece.TrainerSpec.input", - index=0, - number=1, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="input_format", - full_name="sentencepiece.TrainerSpec.input_format", - index=1, - number=7, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="model_prefix", - full_name="sentencepiece.TrainerSpec.model_prefix", - index=2, - number=2, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="model_type", - full_name="sentencepiece.TrainerSpec.model_type", - index=3, - number=3, - type=14, - cpp_type=8, - label=1, - has_default_value=True, - default_value=1, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="vocab_size", - full_name="sentencepiece.TrainerSpec.vocab_size", - index=4, - number=4, - type=5, - cpp_type=1, - label=1, - has_default_value=True, - default_value=8000, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="accept_language", - full_name="sentencepiece.TrainerSpec.accept_language", - index=5, - number=5, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - 
create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="self_test_sample_size", - full_name="sentencepiece.TrainerSpec.self_test_sample_size", - index=6, - number=6, - type=5, - cpp_type=1, - label=1, - has_default_value=True, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="enable_differential_privacy", - full_name="sentencepiece.TrainerSpec.enable_differential_privacy", - index=7, - number=50, - type=8, - cpp_type=7, - label=1, - has_default_value=True, - default_value=False, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="differential_privacy_noise_level", - full_name="sentencepiece.TrainerSpec.differential_privacy_noise_level", - index=8, - number=51, - type=2, - cpp_type=6, - label=1, - has_default_value=True, - default_value=float(0), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="differential_privacy_clipping_threshold", - full_name="sentencepiece.TrainerSpec.differential_privacy_clipping_threshold", - index=9, - number=52, - type=4, - cpp_type=4, - label=1, - has_default_value=True, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="character_coverage", - full_name="sentencepiece.TrainerSpec.character_coverage", - index=10, - number=10, - type=2, - cpp_type=6, - label=1, - has_default_value=True, - default_value=float(0.9995), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="input_sentence_size", - full_name="sentencepiece.TrainerSpec.input_sentence_size", - index=11, - number=11, - type=4, - cpp_type=4, - label=1, - has_default_value=True, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="shuffle_input_sentence", - full_name="sentencepiece.TrainerSpec.shuffle_input_sentence", - index=12, - number=19, - type=8, - cpp_type=7, - label=1, - has_default_value=True, - default_value=True, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="mining_sentence_size", - full_name="sentencepiece.TrainerSpec.mining_sentence_size", - index=13, - number=12, - type=5, - cpp_type=1, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - 
serialized_options=b"\030\001", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="training_sentence_size", - full_name="sentencepiece.TrainerSpec.training_sentence_size", - index=14, - number=13, - type=5, - cpp_type=1, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=b"\030\001", - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="seed_sentencepiece_size", - full_name="sentencepiece.TrainerSpec.seed_sentencepiece_size", - index=15, - number=14, - type=5, - cpp_type=1, - label=1, - has_default_value=True, - default_value=1000000, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="shrinking_factor", - full_name="sentencepiece.TrainerSpec.shrinking_factor", - index=16, - number=15, - type=2, - cpp_type=6, - label=1, - has_default_value=True, - default_value=float(0.75), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="max_sentence_length", - full_name="sentencepiece.TrainerSpec.max_sentence_length", - index=17, - number=18, - type=5, - cpp_type=1, - label=1, - has_default_value=True, - default_value=4192, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="num_threads", - full_name="sentencepiece.TrainerSpec.num_threads", - index=18, - number=16, - type=5, - cpp_type=1, - label=1, - has_default_value=True, - default_value=16, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="num_sub_iterations", - full_name="sentencepiece.TrainerSpec.num_sub_iterations", - index=19, - number=17, - type=5, - cpp_type=1, - label=1, - has_default_value=True, - default_value=2, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="max_sentencepiece_length", - full_name="sentencepiece.TrainerSpec.max_sentencepiece_length", - index=20, - number=20, - type=5, - cpp_type=1, - label=1, - has_default_value=True, - default_value=16, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="split_by_unicode_script", - full_name="sentencepiece.TrainerSpec.split_by_unicode_script", - index=21, - number=21, - type=8, - cpp_type=7, - label=1, - has_default_value=True, - default_value=True, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - 
file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="split_by_number", - full_name="sentencepiece.TrainerSpec.split_by_number", - index=22, - number=23, - type=8, - cpp_type=7, - label=1, - has_default_value=True, - default_value=True, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="split_by_whitespace", - full_name="sentencepiece.TrainerSpec.split_by_whitespace", - index=23, - number=22, - type=8, - cpp_type=7, - label=1, - has_default_value=True, - default_value=True, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="treat_whitespace_as_suffix", - full_name="sentencepiece.TrainerSpec.treat_whitespace_as_suffix", - index=24, - number=24, - type=8, - cpp_type=7, - label=1, - has_default_value=True, - default_value=False, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="allow_whitespace_only_pieces", - full_name="sentencepiece.TrainerSpec.allow_whitespace_only_pieces", - index=25, - number=26, - type=8, - cpp_type=7, - label=1, - has_default_value=True, - default_value=False, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="split_digits", - full_name="sentencepiece.TrainerSpec.split_digits", - index=26, - number=25, - type=8, - cpp_type=7, - label=1, - has_default_value=True, - default_value=False, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="control_symbols", - full_name="sentencepiece.TrainerSpec.control_symbols", - index=27, - number=30, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="user_defined_symbols", - full_name="sentencepiece.TrainerSpec.user_defined_symbols", - index=28, - number=31, - type=9, - cpp_type=9, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="required_chars", - full_name="sentencepiece.TrainerSpec.required_chars", - index=29, - number=36, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - 
create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="byte_fallback", - full_name="sentencepiece.TrainerSpec.byte_fallback", - index=30, - number=35, - type=8, - cpp_type=7, - label=1, - has_default_value=True, - default_value=False, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="vocabulary_output_piece_score", - full_name="sentencepiece.TrainerSpec.vocabulary_output_piece_score", - index=31, - number=32, - type=8, - cpp_type=7, - label=1, - has_default_value=True, - default_value=True, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="hard_vocab_limit", - full_name="sentencepiece.TrainerSpec.hard_vocab_limit", - index=32, - number=33, - type=8, - cpp_type=7, - label=1, - has_default_value=True, - default_value=True, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="use_all_vocab", - full_name="sentencepiece.TrainerSpec.use_all_vocab", - index=33, - number=34, - type=8, - cpp_type=7, - label=1, - has_default_value=True, - default_value=False, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="unk_id", - full_name="sentencepiece.TrainerSpec.unk_id", - index=34, - number=40, - type=5, - cpp_type=1, - label=1, - has_default_value=True, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="bos_id", - full_name="sentencepiece.TrainerSpec.bos_id", - index=35, - number=41, - type=5, - cpp_type=1, - label=1, - has_default_value=True, - default_value=1, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="eos_id", - full_name="sentencepiece.TrainerSpec.eos_id", - index=36, - number=42, - type=5, - cpp_type=1, - label=1, - has_default_value=True, - default_value=2, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="pad_id", - full_name="sentencepiece.TrainerSpec.pad_id", - index=37, - number=43, - type=5, - cpp_type=1, - label=1, - has_default_value=True, - default_value=-1, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="unk_piece", - full_name="sentencepiece.TrainerSpec.unk_piece", - index=38, - number=45, - 
type=9, - cpp_type=9, - label=1, - has_default_value=True, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="bos_piece", - full_name="sentencepiece.TrainerSpec.bos_piece", - index=39, - number=46, - type=9, - cpp_type=9, - label=1, - has_default_value=True, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="eos_piece", - full_name="sentencepiece.TrainerSpec.eos_piece", - index=40, - number=47, - type=9, - cpp_type=9, - label=1, - has_default_value=True, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="pad_piece", - full_name="sentencepiece.TrainerSpec.pad_piece", - index=41, - number=48, - type=9, - cpp_type=9, - label=1, - has_default_value=True, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="unk_surface", - full_name="sentencepiece.TrainerSpec.unk_surface", - index=42, - number=44, - type=9, - cpp_type=9, - label=1, - has_default_value=True, - default_value=b" \342\201\207 ".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="train_extremely_large_corpus", - full_name="sentencepiece.TrainerSpec.train_extremely_large_corpus", - index=43, - number=49, - type=8, - cpp_type=7, - label=1, - has_default_value=True, - default_value=False, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[ - _TRAINERSPEC_MODELTYPE, - ], - serialized_options=None, - is_extendable=True, - syntax="proto2", - extension_ranges=[ - (200, 536870912), - ], - oneofs=[], - serialized_start=45, - serialized_end=1544, -) - -_NORMALIZERSPEC = _descriptor.Descriptor( - name="NormalizerSpec", - full_name="sentencepiece.NormalizerSpec", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="name", - full_name="sentencepiece.NormalizerSpec.name", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="precompiled_charsmap", - full_name="sentencepiece.NormalizerSpec.precompiled_charsmap", - 
index=1, - number=2, - type=12, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"", - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="add_dummy_prefix", - full_name="sentencepiece.NormalizerSpec.add_dummy_prefix", - index=2, - number=3, - type=8, - cpp_type=7, - label=1, - has_default_value=True, - default_value=True, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="remove_extra_whitespaces", - full_name="sentencepiece.NormalizerSpec.remove_extra_whitespaces", - index=3, - number=4, - type=8, - cpp_type=7, - label=1, - has_default_value=True, - default_value=True, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="escape_whitespaces", - full_name="sentencepiece.NormalizerSpec.escape_whitespaces", - index=4, - number=5, - type=8, - cpp_type=7, - label=1, - has_default_value=True, - default_value=True, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="normalization_rule_tsv", - full_name="sentencepiece.NormalizerSpec.normalization_rule_tsv", - index=5, - number=6, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=True, - syntax="proto2", - extension_ranges=[ - (200, 536870912), - ], - oneofs=[], - serialized_start=1547, - serialized_end=1756, -) - -_SELFTESTDATA_SAMPLE = _descriptor.Descriptor( - name="Sample", - full_name="sentencepiece.SelfTestData.Sample", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="input", - full_name="sentencepiece.SelfTestData.Sample.input", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="expected", - full_name="sentencepiece.SelfTestData.Sample.expected", - index=1, - number=2, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto2", - 
extension_ranges=[], - oneofs=[], - serialized_start=1827, - serialized_end=1868, -) - -_SELFTESTDATA = _descriptor.Descriptor( - name="SelfTestData", - full_name="sentencepiece.SelfTestData", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="samples", - full_name="sentencepiece.SelfTestData.samples", - index=0, - number=1, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[ - _SELFTESTDATA_SAMPLE, - ], - enum_types=[], - serialized_options=None, - is_extendable=True, - syntax="proto2", - extension_ranges=[ - (200, 536870912), - ], - oneofs=[], - serialized_start=1758, - serialized_end=1879, -) - -_MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor( - name="SentencePiece", - full_name="sentencepiece.ModelProto.SentencePiece", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="piece", - full_name="sentencepiece.ModelProto.SentencePiece.piece", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=b"".decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="score", - full_name="sentencepiece.ModelProto.SentencePiece.score", - index=1, - number=2, - type=2, - cpp_type=6, - label=1, - has_default_value=False, - default_value=float(0), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="type", - full_name="sentencepiece.ModelProto.SentencePiece.type", - index=2, - number=3, - type=14, - cpp_type=8, - label=1, - has_default_value=True, - default_value=1, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[], - enum_types=[ - _MODELPROTO_SENTENCEPIECE_TYPE, - ], - serialized_options=None, - is_extendable=True, - syntax="proto2", - extension_ranges=[ - (200, 536870912), - ], - oneofs=[], - serialized_start=2171, - serialized_end=2381, -) - -_MODELPROTO = _descriptor.Descriptor( - name="ModelProto", - full_name="sentencepiece.ModelProto", - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name="pieces", - full_name="sentencepiece.ModelProto.pieces", - index=0, - number=1, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="trainer_spec", - full_name="sentencepiece.ModelProto.trainer_spec", - index=1, - number=2, - 
type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="normalizer_spec", - full_name="sentencepiece.ModelProto.normalizer_spec", - index=2, - number=3, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="self_test_data", - full_name="sentencepiece.ModelProto.self_test_data", - index=3, - number=4, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - _descriptor.FieldDescriptor( - name="denormalizer_spec", - full_name="sentencepiece.ModelProto.denormalizer_spec", - index=4, - number=5, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - ), - ], - extensions=[], - nested_types=[ - _MODELPROTO_SENTENCEPIECE, - ], - enum_types=[], - serialized_options=None, - is_extendable=True, - syntax="proto2", - extension_ranges=[ - (200, 536870912), - ], - oneofs=[], - serialized_start=1882, - serialized_end=2392, -) - -_TRAINERSPEC.fields_by_name["model_type"].enum_type = _TRAINERSPEC_MODELTYPE -_TRAINERSPEC_MODELTYPE.containing_type = _TRAINERSPEC -_SELFTESTDATA_SAMPLE.containing_type = _SELFTESTDATA -_SELFTESTDATA.fields_by_name["samples"].message_type = _SELFTESTDATA_SAMPLE -_MODELPROTO_SENTENCEPIECE.fields_by_name["type"].enum_type = _MODELPROTO_SENTENCEPIECE_TYPE -_MODELPROTO_SENTENCEPIECE.containing_type = _MODELPROTO -_MODELPROTO_SENTENCEPIECE_TYPE.containing_type = _MODELPROTO_SENTENCEPIECE -_MODELPROTO.fields_by_name["pieces"].message_type = _MODELPROTO_SENTENCEPIECE -_MODELPROTO.fields_by_name["trainer_spec"].message_type = _TRAINERSPEC -_MODELPROTO.fields_by_name["normalizer_spec"].message_type = _NORMALIZERSPEC -_MODELPROTO.fields_by_name["self_test_data"].message_type = _SELFTESTDATA -_MODELPROTO.fields_by_name["denormalizer_spec"].message_type = _NORMALIZERSPEC -DESCRIPTOR.message_types_by_name["TrainerSpec"] = _TRAINERSPEC -DESCRIPTOR.message_types_by_name["NormalizerSpec"] = _NORMALIZERSPEC -DESCRIPTOR.message_types_by_name["SelfTestData"] = _SELFTESTDATA -DESCRIPTOR.message_types_by_name["ModelProto"] = _MODELPROTO -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -TrainerSpec = _reflection.GeneratedProtocolMessageType( - "TrainerSpec", - (_message.Message,), - { - "DESCRIPTOR": _TRAINERSPEC, - "__module__": "sentencepiece_model_pb2" - # @@protoc_insertion_point(class_scope:sentencepiece.TrainerSpec) - }, -) -_sym_db.RegisterMessage(TrainerSpec) - -NormalizerSpec = _reflection.GeneratedProtocolMessageType( - "NormalizerSpec", - (_message.Message,), - { - "DESCRIPTOR": _NORMALIZERSPEC, - "__module__": "sentencepiece_model_pb2" - # 
@@protoc_insertion_point(class_scope:sentencepiece.NormalizerSpec) - }, -) -_sym_db.RegisterMessage(NormalizerSpec) - -SelfTestData = _reflection.GeneratedProtocolMessageType( - "SelfTestData", - (_message.Message,), - { - "Sample": _reflection.GeneratedProtocolMessageType( - "Sample", - (_message.Message,), - { - "DESCRIPTOR": _SELFTESTDATA_SAMPLE, - "__module__": "sentencepiece_model_pb2" - # @@protoc_insertion_point(class_scope:sentencepiece.SelfTestData.Sample) - }, - ), - "DESCRIPTOR": _SELFTESTDATA, - "__module__": "sentencepiece_model_pb2" - # @@protoc_insertion_point(class_scope:sentencepiece.SelfTestData) - }, -) -_sym_db.RegisterMessage(SelfTestData) -_sym_db.RegisterMessage(SelfTestData.Sample) - -ModelProto = _reflection.GeneratedProtocolMessageType( - "ModelProto", - (_message.Message,), - { - "SentencePiece": _reflection.GeneratedProtocolMessageType( - "SentencePiece", - (_message.Message,), - { - "DESCRIPTOR": _MODELPROTO_SENTENCEPIECE, - "__module__": "sentencepiece_model_pb2" - # @@protoc_insertion_point(class_scope:sentencepiece.ModelProto.SentencePiece) - }, - ), - "DESCRIPTOR": _MODELPROTO, - "__module__": "sentencepiece_model_pb2" - # @@protoc_insertion_point(class_scope:sentencepiece.ModelProto) - }, -) -_sym_db.RegisterMessage(ModelProto) -_sym_db.RegisterMessage(ModelProto.SentencePiece) - -DESCRIPTOR._options = None -_TRAINERSPEC.fields_by_name["mining_sentence_size"]._options = None -_TRAINERSPEC.fields_by_name["training_sentence_size"]._options = None -# @@protoc_insertion_point(module_scope) diff --git a/paddleformers/transformers/transposed_linear.py b/paddleformers/transformers/transposed_linear.py deleted file mode 100644 index 2d136ad97eb..00000000000 --- a/paddleformers/transformers/transposed_linear.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle import nn -from paddle.nn import functional as F - - -class TransposedLinear(nn.Layer): - """ - Same as paddle.layer.Linear, except weight matrix is stored as [out_features, in_features] (same as torch), - instead of [in_features, out_features] - """ - - def __init__( - self, - in_features, - out_features, - weight_attr=None, - bias_attr=None, - name=None, - ): - super(TransposedLinear, self).__init__() - self._dtype = self._helper.get_default_dtype() - self._weight_attr = weight_attr - self._bias_attr = bias_attr - self.weight = self.create_parameter( - shape=[out_features, in_features], # regular linear has shape [in_features, out_features] - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False, - ) - self.bias = self.create_parameter( - shape=[out_features], - attr=self._bias_attr, - dtype=self._dtype, - is_bias=True, - ) - self.name = name - - def forward(self, input): - out = F.linear(x=input, weight=self.weight.T, bias=self.bias, name=self.name) - return out - - def extra_repr(self): - name_str = ", name={}".format(self.name) if self.name else "" - return "in_features={}, out_features={}, dtype={}{}".format( - self.weight.shape[1], self.weight.shape[0], self._dtype, name_str - ) diff --git a/tests/mergekit/test_merge_model.py b/tests/mergekit/test_merge_model.py index 0a8b1b2ea6c..16b5a913ed6 100644 --- a/tests/mergekit/test_merge_model.py +++ b/tests/mergekit/test_merge_model.py @@ -19,7 +19,7 @@ from parameterized import parameterized from paddleformers.mergekit import MergeConfig, MergeModel -from paddleformers.transformers import AutoModel +from paddleformers.transformers import AutoModelForCausalLM from tests.testing_utils import require_package @@ -27,7 +27,9 @@ class TestMergeModel(unittest.TestCase): @parameterized.expand([("slerp",), ("della",), ("dare_linear",), ("ties",)]) def test_merge_model_np(self, merge_method): with TemporaryDirectory() as tempdir: - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert", dtype="bfloat16") + model = AutoModelForCausalLM.from_pretrained( + "Paddleformers/tiny-random-qwen3", convert_from_hf=True, dtype="bfloat16" + ) pd_path = os.path.join(tempdir, "pd_model") model.save_pretrained(pd_path) safe_path = os.path.join(tempdir, "safe_model") @@ -71,7 +73,9 @@ def test_merge_model_np(self, merge_method): @parameterized.expand([("slerp",), ("della",), ("dare_linear",), ("ties",)]) def test_merge_model_pd(self, merge_method): with TemporaryDirectory() as tempdir: - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert", dtype="bfloat16") + model = AutoModelForCausalLM.from_pretrained( + "Paddleformers/tiny-random-qwen3", convert_from_hf=True, dtype="bfloat16" + ) pd_path = os.path.join(tempdir, "pd_model") model.save_pretrained(pd_path) safe_path = os.path.join(tempdir, "safe_model") diff --git a/tests/peft/test_lokr.py b/tests/peft/test_lokr.py deleted file mode 100644 index 84ab2606b62..00000000000 --- a/tests/peft/test_lokr.py +++ /dev/null @@ -1,264 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import os -import random -import re -import unittest -from tempfile import TemporaryDirectory - -import numpy as np -import paddle - -from paddleformers.peft.lokr import LoKrConfig, LoKrLinear, LoKrModel -from paddleformers.transformers import AutoModel, BertModel - -DEFAULT_LINEAR_TEST_CONFIG = { - "in_features": 4864, - "out_features": 4864, - "lokr_dim": 8, - "lokr_alpha": 8, - "factor": -1, - "decompose_both": False, -} -DEFAULT_MODEL_TEST_CONFIG = { - "base_model_name_or_path": "Paddleformers/tiny-random-bert", - "target_modules": [".*q_proj*.", ".*v_proj*."], - "lokr_alpha": 8, - "lokr_dim": 8, - "decompose_both": False, - "factor": -1, -} - -defaultTestLayer = LoKrLinear(**DEFAULT_LINEAR_TEST_CONFIG) - - -class TestLoKrLayer(unittest.TestCase): - def test_r_raise_exception(self): - with self.assertRaises(ValueError): - LoKrLinear(in_features=16, out_features=8, lokr_dim=0, lokr_alpha=8) - - def test_forward(self): - def myForward(): - input = paddle.randn([2, 4, DEFAULT_LINEAR_TEST_CONFIG["in_features"]], "float32") - self.assertEqual(defaultTestLayer.scale, 1.0) - output = defaultTestLayer(input) - self.assertEqual(output.shape, [2, 4, DEFAULT_LINEAR_TEST_CONFIG["out_features"]]) - - def randomForward(): - for _ in range(50): - inFeatureRand = random.randint(100, 200) - outFeatureRand = random.randint(100, 200) - decompose_both_rand = random.choice([True, False]) - factorRand = random.choice([-1, random.randint(2, min(inFeatureRand, outFeatureRand))]) - lokr_layer = LoKrLinear( - in_features=inFeatureRand, - out_features=outFeatureRand, - lokr_dim=8, - lokr_alpha=8, - factor=factorRand, - decompose_both=decompose_both_rand, - ) - input = paddle.randn([2, 4, inFeatureRand], "float32") - self.assertEqual(lokr_layer.scale, 1.0) - output = lokr_layer(input) - self.assertEqual(output.shape, [2, 4, outFeatureRand]) - - myForward() - randomForward() - - def test_train_eval(self): - def myTrainEval(): - x = paddle.randn([2, 4, DEFAULT_LINEAR_TEST_CONFIG["in_features"]], "float32") - defaultTestLayer.train() - train_result = defaultTestLayer(x) - train_weight = copy.deepcopy(defaultTestLayer.weight) # deep copy since this is a pointer - defaultTestLayer.eval() - eval_result = defaultTestLayer(x) - eval_weight = defaultTestLayer.weight - self.assertTrue(paddle.allclose(train_result, eval_result)) - self.assertTrue(paddle.allclose(train_weight, eval_weight)) - - def randomTrainEval(): - for _ in range(100): - inFeatureRand = random.randint(10, 50) - outFeatureRand = random.randint(10, 50) - decompose_both_rand = random.choice([True, False]) - factorRand = random.choice([-1, random.randint(2, min(inFeatureRand, outFeatureRand))]) - lokr_layer = LoKrLinear( - in_features=inFeatureRand, - out_features=outFeatureRand, - lokr_dim=8, - lokr_alpha=8, - factor=factorRand, - decompose_both=decompose_both_rand, - ) - x = paddle.randn([2, 4, inFeatureRand], "float32") - lokr_layer.train() - train_result = lokr_layer(x) - train_weight = copy.deepcopy(lokr_layer.weight) # deep copy since this is a pointer - lokr_layer.eval() - eval_result = lokr_layer(x) - eval_weight = lokr_layer.weight - self.assertTrue(paddle.allclose(train_result, eval_result)) - self.assertTrue(paddle.allclose(train_weight, eval_weight)) - - myTrainEval() - randomTrainEval() - - def test_save_load(self): - for _ in range(10): - with TemporaryDirectory() as tempdir: - weights_path = os.path.join(tempdir, 
"model.pdparams") - paddle.save(defaultTestLayer.state_dict(), weights_path) - new_lokr_layer = defaultTestLayer - state_dict = paddle.load(weights_path) - new_lokr_layer.set_dict(state_dict) - x = paddle.randn([2, 4, DEFAULT_LINEAR_TEST_CONFIG["in_features"]], "float32") - self.assertTrue(paddle.allclose(new_lokr_layer(x), defaultTestLayer(x))) # something goes wrong here - - def test_load_regular_linear(self): - for i in range(10): - with TemporaryDirectory() as tempdir: - inFeatureRand = random.randint(10, 30) - outFeatureRand = random.randint(10, 50) - regular_linear = paddle.nn.Linear(in_features=inFeatureRand, out_features=outFeatureRand) - weights_path = os.path.join(tempdir, "model.pdparams") - paddle.save(regular_linear.state_dict(), weights_path) - state_dict = paddle.load(weights_path) - lokr_layer = LoKrLinear( - in_features=inFeatureRand, - out_features=outFeatureRand, - lokr_dim=8, - lokr_alpha=8, - factor=-1, - decompose_both=False, - ) - lokr_layer.set_dict(state_dict) - x = paddle.randn([2, 4, inFeatureRand], "float32") - self.assertTrue(paddle.allclose(lokr_layer(x), regular_linear(x))) - - -class TestLoKrModel(unittest.TestCase): - def test_tp_raise_exception(self): - with self.assertRaises(NotImplementedError): - lokr_config = LoKrConfig(**DEFAULT_MODEL_TEST_CONFIG, tensor_parallel_degree=2) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") - lokr_model = LoKrModel(model, lokr_config) - lokr_model.eval() - - def test_lokr_model_restore(self): - lokr_config = LoKrConfig(**DEFAULT_MODEL_TEST_CONFIG) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") - input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) - model.eval() - original_results_1 = model(input_ids) - lokr_model = LoKrModel(model, lokr_config) - restored_model = lokr_model.restore_original_model() - restored_model.eval() - original_results_2 = restored_model(input_ids) - self.assertIsNotNone(original_results_1) - self.assertIsNotNone(original_results_2) - self.assertIsInstance(restored_model, BertModel) - self.assertTrue(paddle.allclose(original_results_1[0], original_results_2[0])) - - def test_lokr_model_constructor(self): - lokr_config = LoKrConfig(**DEFAULT_MODEL_TEST_CONFIG) - model = AutoModel.from_pretrained( - "Paddleformers/tiny-random-bert", - hidden_dropout_prob=0, - attention_probs_dropout_prob=0, - ) - lokr_model = LoKrModel(model, lokr_config) - for name, weight in lokr_model.state_dict().items(): - if any([re.fullmatch(target_module, name) for target_module in lokr_config.target_modules]): - # general rule of thumb: any weight in state_dict with name having "lokr" should enable training, vice versa. 
- if "lokr" in name: - self.assertFalse(weight.stop_gradient) - else: - self.assertTrue(weight.stop_gradient) - - def test_lokr_model_save_load(self): - with TemporaryDirectory() as tempdir: - input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) - lokr_config = LoKrConfig(**DEFAULT_MODEL_TEST_CONFIG) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") - lokr_model = LoKrModel(model, lokr_config) - lokr_model.eval() - original_results = lokr_model(input_ids) - lokr_model.save_pretrained(tempdir) - - loaded_lokr_model = LoKrModel.from_pretrained(model, tempdir) - loaded_lokr_model.eval() - loaded_results = loaded_lokr_model(input_ids) - self.assertTrue(paddle.allclose(original_results[0], loaded_results[0])) - - config_loaded_lokr_model = LoKrModel.from_pretrained(model, tempdir, lokr_config=lokr_config) - config_loaded_lokr_model.eval() - config_loaded_results = config_loaded_lokr_model(input_ids) - self.assertTrue(paddle.allclose(original_results[0], config_loaded_results[0])) - - -class TestLoKrConfig(unittest.TestCase): - def test_to_dict(self): - config = LoKrConfig() - expected_dict = { - "base_model_name_or_path": None, - "target_modules": None, - "trainable_modules": None, - "trainable_bias": None, - "lokr_dim": 8, - "factor": -1, - "decompose_both": False, - "lokr_alpha": 0.0, - "merge_weight": False, - "tensor_parallel_degree": -1, - "dtype": None, - } - self.assertEqual(config.to_dict(), expected_dict) - - def test_invalid_directory_save_pretrained(self): - config = LoKrConfig() - with TemporaryDirectory() as tempdir: - # Create a file instead of directory - invalid_dir = os.path.join(tempdir, "invalid_dir") - with open(invalid_dir, "w") as f: - f.write("This is a file, not a directory.") - with self.assertRaises(AssertionError): - config.save_pretrained(invalid_dir) - - def test_from_pretrained_not_found(self): - with TemporaryDirectory() as tempdir: - with self.assertRaises(ValueError): - LoKrConfig.from_pretrained(tempdir) # No config file in directory - - def test_scaling_property(self): - lokr_config = LoKrConfig(lokr_alpha=10, lokr_dim=2) - self.assertEqual(lokr_config.scaling, 5.0) - lokr_config = LoKrConfig(lokr_alpha=0, lokr_dim=8) - self.assertEqual(lokr_config.scaling, 0.0) - lokr_config = LoKrConfig(lokr_alpha=0, lokr_dim=0) - self.assertEqual(lokr_config.scaling, 1.0) - - def test_save_load(self): - with TemporaryDirectory() as tempdir: - lokr_config = LoKrConfig(**DEFAULT_MODEL_TEST_CONFIG) - lokr_config.save_pretrained(tempdir) - loaded_lokr_config = LoKrConfig.from_pretrained(tempdir) - self.assertEqual(lokr_config, loaded_lokr_config) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/peft/test_lora.py b/tests/peft/test_lora.py index fde2935a716..d9d10f1d00e 100644 --- a/tests/peft/test_lora.py +++ b/tests/peft/test_lora.py @@ -23,7 +23,11 @@ from parameterized import parameterized from paddleformers.peft.lora import LoRAConfig, LoRALinear, LoRAModel -from paddleformers.transformers import AutoModel, BertModel, Glm4MoeModel +from paddleformers.transformers import ( + AutoModelForCausalLM, + Glm4MoeModel, + Qwen3ForCausalLM, +) class TestLoraLayer(unittest.TestCase): @@ -89,7 +93,7 @@ def test_lora_model_restore(self): enable_lora_list=[None, [True, False]], head_dim=2, ) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") + model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) 
model.eval() original_results_1 = model(input_ids) @@ -99,7 +103,7 @@ def test_lora_model_restore(self): original_results_2 = restored_model(input_ids) self.assertIsNotNone(original_results_1) self.assertIsNotNone(original_results_2) - self.assertIsInstance(restored_model, BertModel) + self.assertIsInstance(restored_model, Qwen3ForCausalLM) self.assertTrue(paddle.allclose(original_results_1[0], original_results_2[0])) @parameterized.expand([(None,), ("all",), ("lora",)]) @@ -113,8 +117,9 @@ def test_lora_model_constructor(self, bias): head_dim=2, ) # turn off plm dropout for to test train vs test - model = AutoModel.from_pretrained( - "Paddleformers/tiny-random-bert", + model = AutoModelForCausalLM.from_pretrained( + "Paddleformers/tiny-random-qwen3", + convert_from_hf=True, hidden_dropout_prob=0, attention_probs_dropout_prob=0, ) @@ -150,7 +155,7 @@ def test_lora_model_save_load(self): r=4, lora_alpha=8, ) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") + model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) lora_model = LoRAModel(model, lora_config) lora_model.eval() original_results = lora_model(input_ids) @@ -173,7 +178,7 @@ def test_lora_module_raise_exception(self): lora_alpha=8, enable_lora_list=None, ) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") + model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) with self.assertRaises(ValueError): LoRAModel(model, lora_config) diff --git a/tests/peft/test_lorapro.py b/tests/peft/test_lorapro.py index b7c61174b78..befe2147285 100644 --- a/tests/peft/test_lorapro.py +++ b/tests/peft/test_lorapro.py @@ -26,7 +26,7 @@ from parameterized import parameterized from paddleformers.peft.lora import LoRAConfig, LoRALinear, LoRAModel -from paddleformers.transformers import AutoModel, BertModel +from paddleformers.transformers import AutoModelForCausalLM, Qwen3Model from paddleformers.utils.optimizer import AdamWLoRAPro @@ -100,7 +100,7 @@ def test_lorapro_model_restore(self): head_dim=2, lorapro=True, ) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") + model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) model.eval() original_results_1 = model(input_ids) @@ -110,7 +110,7 @@ def test_lorapro_model_restore(self): original_results_2 = restored_model(input_ids) self.assertIsNotNone(original_results_1) self.assertIsNotNone(original_results_2) - self.assertIsInstance(restored_model, BertModel) + self.assertIsInstance(restored_model, Qwen3Model) self.assertTrue(paddle.allclose(original_results_1[0], original_results_2[0])) @parameterized.expand([(None,), ("all",), ("lora",)]) @@ -125,8 +125,9 @@ def test_lorapro_model_constructor(self, bias): lorapro=True, ) # turn off plm dropout for to test train vs test - model = AutoModel.from_pretrained( - "Paddleformers/tiny-random-bert", + model = AutoModelForCausalLM.from_pretrained( + "Paddleformers/tiny-random-qwen3", + convert_from_hf=True, hidden_dropout_prob=0, attention_probs_dropout_prob=0, ) @@ -158,7 +159,7 @@ def test_lorapro_model_save_load(self): with TemporaryDirectory() as tempdir: input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) lorapro_config = LoRAConfig(target_modules=[".*q_proj.*", ".*v_proj.*"], r=4, lora_alpha=8, lorapro=True) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") + model = 
AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) lorapro_model = LoRAModel(model, lorapro_config) lorapro_model.eval() original_results = lorapro_model(input_ids) @@ -186,7 +187,7 @@ def test_lorapro_modes(self, x_mode): lorapro=True, ) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") + model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) lorapro_model = LoRAModel(model, lorapro_config) lorapro_model.mark_only_lora_as_trainable() @@ -218,7 +219,7 @@ def test_lorapro_module_raise_exception(self): lorapro_config = LoRAConfig( target_modules=[".*norm1.*"], r=4, lora_alpha=8, enable_lora_list=None, lorapro=True ) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") + model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) with self.assertRaises(ValueError): LoRAModel(model, lorapro_config) diff --git a/tests/peft/test_mora.py b/tests/peft/test_mora.py index 69173ddeb6f..cf1dba65c68 100644 --- a/tests/peft/test_mora.py +++ b/tests/peft/test_mora.py @@ -23,7 +23,7 @@ from parameterized import parameterized from paddleformers.peft.lora import LoRAConfig, LoRALinear, LoRAModel -from paddleformers.transformers import AutoModel, BertModel +from paddleformers.transformers import AutoModelForCausalLM, Qwen3Model class TestMoraLayer(unittest.TestCase): @@ -101,7 +101,7 @@ def test_mora_model_restore(self): head_dim=2, use_mora=True, ) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") + model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) model.eval() original_results_1 = model(input_ids) @@ -111,7 +111,7 @@ def test_mora_model_restore(self): original_results_2 = restored_model(input_ids) self.assertIsNotNone(original_results_1) self.assertIsNotNone(original_results_2) - self.assertIsInstance(restored_model, BertModel) + self.assertIsInstance(restored_model, Qwen3Model) self.assertTrue(paddle.allclose(original_results_1[0], original_results_2[0])) @parameterized.expand([(None,), ("all",), ("lora",)]) @@ -126,8 +126,9 @@ def test_mora_model_constructor(self, bias): use_mora=True, ) # turn off plm dropout for to test train vs test - model = AutoModel.from_pretrained( - "Paddleformers/tiny-random-bert", + model = AutoModelForCausalLM.from_pretrained( + "Paddleformers/tiny-random-qwen3", + convert_from_hf=True, hidden_dropout_prob=0, attention_probs_dropout_prob=0, ) @@ -159,7 +160,7 @@ def test_mora_model_save_load(self): with TemporaryDirectory() as tempdir: input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) mora_config = LoRAConfig(target_modules=[".*q_proj.*", ".*v_proj.*"], r=4, lora_alpha=8, use_mora=True) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") + model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) mora_model = LoRAModel(model, mora_config) mora_model.eval() original_results = mora_model(input_ids) @@ -177,7 +178,7 @@ def test_mora_model_save_load(self): def test_lora_module_raise_exception(self): mora_config = LoRAConfig(target_modules=[".*norm1.*"], r=4, lora_alpha=8, enable_lora_list=None, use_mora=True) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") + model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) with 
self.assertRaises(ValueError): LoRAModel(model, mora_config) diff --git a/tests/peft/test_mos_lora.py b/tests/peft/test_mos_lora.py index 377b4256618..1cf8d8425d2 100644 --- a/tests/peft/test_mos_lora.py +++ b/tests/peft/test_mos_lora.py @@ -23,7 +23,7 @@ from parameterized import parameterized from paddleformers.peft.lora import LoRAConfig, LoRALinear, LoRAModel -from paddleformers.transformers import AutoModel, BertModel +from paddleformers.transformers import AutoModelForCausalLM, Qwen3Model class TestMosLoraLayer(unittest.TestCase): @@ -104,7 +104,7 @@ def test_lora_model_restore(self): head_dim=2, lora_use_mixer=True, ) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") + model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) model.eval() original_results_1 = model(input_ids) @@ -114,7 +114,7 @@ def test_lora_model_restore(self): original_results_2 = restored_model(input_ids) self.assertIsNotNone(original_results_1) self.assertIsNotNone(original_results_2) - self.assertIsInstance(restored_model, BertModel) + self.assertIsInstance(restored_model, Qwen3Model) self.assertTrue(paddle.allclose(original_results_1[0], original_results_2[0])) def test_parallel_support(self): @@ -127,7 +127,7 @@ def test_parallel_support(self): lora_use_mixer=True, tensor_parallel_degree=2, ) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") + model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) model.eval() with self.assertRaises(NotImplementedError): LoRAModel(model, lora_config) @@ -144,8 +144,9 @@ def test_lora_model_constructor(self, bias): lora_use_mixer=True, ) # turn off plm dropout for to test train vs test - model = AutoModel.from_pretrained( - "Paddleformers/tiny-random-bert", + model = AutoModelForCausalLM.from_pretrained( + "Paddleformers/tiny-random-qwen3", + convert_from_hf=True, hidden_dropout_prob=0, attention_probs_dropout_prob=0, ) @@ -179,7 +180,7 @@ def test_lora_model_save_load(self): lora_config = LoRAConfig( target_modules=[".*q_proj.*", ".*v_proj.*"], r=4, lora_alpha=8, lora_use_mixer=True ) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") + model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) lora_model = LoRAModel(model, lora_config) lora_model.eval() original_results = lora_model(input_ids) @@ -199,7 +200,7 @@ def test_lora_module_raise_exception(self): lora_config = LoRAConfig( target_modules=[".*norm1.*"], r=4, lora_alpha=8, enable_lora_list=None, lora_use_mixer=True ) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") + model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) with self.assertRaises(ValueError): LoRAModel(model, lora_config) diff --git a/tests/peft/test_prefix.py b/tests/peft/test_prefix.py deleted file mode 100644 index 02c54fc85f8..00000000000 --- a/tests/peft/test_prefix.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -from tempfile import TemporaryDirectory - -import paddle - -from paddleformers.peft.prefix import ( - PrefixConfig, - PrefixModelForCausalLM, - llama_postprocess_past_key_value, -) -from paddleformers.transformers import LlamaConfig, LlamaForCausalLM - - -class TestPrefixModel(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.config = LlamaConfig( - vocab_size=200, - hidden_size=32, - intermediate_size=86, - num_hidden_layers=1, - num_attention_heads=1, - dtype="float32", - ) - - cls.model = LlamaForCausalLM(cls.config) - cls.prefix_config = PrefixConfig( - num_prefix_tokens=2, - num_attention_heads=cls.model.config.num_attention_heads, - num_hidden_layers=cls.model.config.num_hidden_layers, - hidden_size=cls.model.config.hidden_size, - prefix_projection_hidden_size=cls.model.config.hidden_size, - dtype="float32", - ) - cls.prefix_model = PrefixModelForCausalLM( - model=cls.model, - prefix_config=cls.prefix_config, - postprocess_past_key_value=llama_postprocess_past_key_value, - ) - - def test_prefix_config(self): - with TemporaryDirectory() as tempdir: - self.prefix_config.save_pretrained(tempdir) - loaded_prefix_config = PrefixConfig.from_pretrained(tempdir) - self.assertEqual(self.prefix_config, loaded_prefix_config) - - def test_prefix_model_save_load(self): - with TemporaryDirectory() as tempdir: - input_ids = paddle.randint(100, 200, [1, 20]) - self.prefix_model.eval() - self.prefix_model.save_pretrained(tempdir) - loaded_prefix_model = PrefixModelForCausalLM.from_pretrained( - self.model, tempdir, llama_postprocess_past_key_value - ) - loaded_prefix_model.eval() - - original_results = self.prefix_model(input_ids) - loaded_results = loaded_prefix_model(input_ids) - - self.assertIsNotNone(original_results) - self.assertEqual(original_results[0].shape, [1, 20, self.config.vocab_size]) - self.assertIsNotNone(loaded_results) - self.assertEqual(loaded_results[0].shape, [1, 20, self.config.vocab_size]) - self.assertTrue(paddle.allclose(original_results[0], loaded_results[0])) - - def test_prefix_model_attention_mask(self): - inputs = { - "input_ids": paddle.randint(100, 200, [1, 20]), - "attention_mask": paddle.ones([1, 20]), - "position_ids": paddle.arange(20).unsqueeze(0), - } - logits_2d = self.prefix_model(**inputs)[0] - inputs["attention_mask"] = paddle.tril(paddle.ones([1, 20, 20])) - logits_3d = self.prefix_model(**inputs)[0] - inputs["attention_mask"] = paddle.tril(paddle.ones([1, 1, 20, 20])) - logits_4d = self.prefix_model(**inputs)[0] - self.assertTrue(paddle.allclose(logits_2d, logits_3d)) - self.assertTrue(paddle.allclose(logits_3d, logits_4d)) - - def test_prefix_model_generate(self): - inputs = { - "input_ids": paddle.randint(100, 200, [1, 20]), - "attention_mask": paddle.ones([1, 20]), - "position_ids": paddle.arange(20).unsqueeze(0), - } - self.prefix_model.generate( - **inputs, - max_length=5, - decode_strategy="sampling", - temperature=1.0, - top_k=1, - top_p=1.0, - repetition_penalty=1.0, - ) diff --git a/tests/peft/test_quant_lora.py b/tests/peft/test_quant_lora.py index b103acf478a..de0994ad2ac 100644 --- 
a/tests/peft/test_quant_lora.py +++ b/tests/peft/test_quant_lora.py @@ -25,7 +25,7 @@ from paddleformers.peft.lora import LoRAConfig, LoRALinear, LoRAModel from paddleformers.peft.lora.lora_quant_layers import QuantedLoRALinear -from paddleformers.transformers import AutoModel +from paddleformers.transformers import AutoModelForCausalLM class TestQuantedLoraLayer(unittest.TestCase): @@ -102,7 +102,7 @@ def setUpClass(cls): r=4, lora_alpha=8, ) - cls.model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") + cls.model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) cls.lora_model = LoRAModel(cls.model, lora_config) cls.lora_model.mark_only_lora_as_trainable() # lora_B parameter is initialized to 0, therefore AB = 0 and W + AB = W diff --git a/tests/peft/test_reft.py b/tests/peft/test_reft.py deleted file mode 100644 index f8fc0b1b7a2..00000000000 --- a/tests/peft/test_reft.py +++ /dev/null @@ -1,247 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest - -import paddle - -# from llm.utils.data import convert_example_for_reft -from paddleformers.data import DataCollatorForSeq2Seq -from paddleformers.peft.reft import ( - LoreftIntervention, - LowRankRotateLayer, - ReFTConfig, - ReftDataCollator, - ReFTModel, - TinyIntervention, -) -from paddleformers.peft.reft.modeling_utils import ( - count_parameters, - get_type_from_string, - set_seed, -) -from paddleformers.transformers import AutoModelForCausalLM, AutoTokenizer - - -class TestReftDataCollator(unittest.TestCase): - def test_call(self): - model_name = "Paddleformers/tiny-random-llama" - tokenizer = AutoTokenizer.from_pretrained( - model_name, - model_max_length=512, - padding_side="right", - ) - tokenizer.pad_token_id = tokenizer.eos_token_id - model = AutoModelForCausalLM.from_pretrained(model_name) - data_collator = DataCollatorForSeq2Seq( - tokenizer=tokenizer, model=model, label_pad_token_id=-100, padding="longest" - ) - reft_data_collator = ReftDataCollator(data_collator) - instances = [ - { - "input_ids": [[1, 2, 3], [4, 5, 6]], - "intervention_locations": [[0, 1, 0], [1, 0, 1]], - }, - { - "input_ids": [[7, 8, 9], [10, 11, 12]], - "intervention_locations": [[1, 0, 1], [0, 1, 0]], - }, - ] - - batch_inputs = reft_data_collator(instances) - - self.assertIn("input_ids", batch_inputs) - self.assertIn("intervention_locations", batch_inputs) - self.assertIsInstance(batch_inputs["input_ids"], paddle.Tensor) - self.assertIsInstance(batch_inputs["intervention_locations"], paddle.Tensor) - - -class TestBasicUtils(unittest.TestCase): - def test_get_type_from_string(self): - class_str = "paddleformers.peft.reft.LoreftIntervention" - cls = get_type_from_string(class_str) - self.assertIsInstance(cls, type(LoreftIntervention)) - - def test_set_seed(self): - set_seed(42) - set_seed(66) - - def test_count_param(self): - model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-llama") - 
count_parameters(model) - - -class TestReftConfig(unittest.TestCase): - def test_reft_config(self): - layers = [0, 1, 2] - representations = [ - { - "layer": l, - "component": "block_output", - "low_rank_dimension": 4, - "intervention": LoreftIntervention( - embed_dim=768, - low_rank_dimension=4, - dropout=0.00, - dtype="float32", - act_fn="linear", - device="gpu", - add_bias=False, - ), - } - for l in layers - ] - reft_config = ReFTConfig(representations=representations) - reft_config.__str__() - - -class TestLoReftIntervention(unittest.TestCase): - def setUp(self): - self.kwargs = { - "embed_dim": 64, - "low_rank_dimension": 4, - "dtype": paddle.float32, - "dropout": 0.1, - "act_fn": "linear", - } - - def test_initialization(self): - intervention = LoreftIntervention(**self.kwargs) - self.assertIsInstance(intervention.rotate_layer, LowRankRotateLayer) - self.assertIsInstance(intervention.learned_source, paddle.nn.Linear) - self.assertEqual(intervention.dropout.p, self.kwargs["dropout"]) - - def test_forward(self): - base = paddle.randn([10, self.kwargs["embed_dim"]]) - intervention = LoreftIntervention(**self.kwargs) - output = intervention.forward(base) - self.assertEqual(output.shape, base.shape) - self.assertEqual(output.dtype, self.kwargs["dtype"]) - - def test_load_state_dict(self): - model = LoreftIntervention(**self.kwargs) - state_dict = { - "learned_source.weight": paddle.randn([64, 4]), - "learned_source.bias": paddle.zeros([4]), - "rotate_layer.weight": paddle.randn([64, 4]), - } - model.load_state_dict(state_dict) - self.assertTrue(paddle.allclose(model.learned_source.weight.data, state_dict["learned_source.weight"])) - self.assertTrue(paddle.allclose(model.learned_source.bias.data, state_dict["learned_source.bias"])) - self.assertTrue( - paddle.allclose( - model.rotate_layer.weight[:, : state_dict["rotate_layer.weight"].shape[-1]], - state_dict["rotate_layer.weight"], - ) - ) - - -class TestTinyIntervention(unittest.TestCase): - def setUp(self): - self.kwargs = { - "embed_dim": 768, - "low_rank_dimension": 4, - "dtype": paddle.float32, - "dropout": 0.1, - "act_fn": "relu", - } - - def test_initialization(self): - intervention = TinyIntervention(**self.kwargs) - self.assertEqual(intervention.rank, self.kwargs["low_rank_dimension"]) - self.assertEqual(intervention.hidden_size, self.kwargs["embed_dim"]) - self.assertEqual(intervention.param_A.shape, [self.kwargs["embed_dim"], self.kwargs["low_rank_dimension"]]) - self.assertEqual(intervention.param_B.shape, [self.kwargs["low_rank_dimension"], self.kwargs["embed_dim"]]) - self.assertEqual(intervention.param_a.shape, [self.kwargs["low_rank_dimension"]]) - self.assertEqual(intervention.param_b.shape, [self.kwargs["embed_dim"]]) - - def test_forward(self): - base = paddle.randn([10, self.kwargs["embed_dim"]]) - intervention = TinyIntervention(**self.kwargs) - output = intervention.forward(base) - self.assertEqual(output.shape, base.shape) - self.assertEqual(output.dtype, self.kwargs["dtype"]) - - def test_load_state_dict(self): - model = TinyIntervention(**self.kwargs) - state_dict = { - "param_A": paddle.randn([768, 4]), - "param_B": paddle.randn([4, 768]), - "param_a": paddle.randn([4]), - "param_b": paddle.randn([768]), - } - model.load_state_dict(state_dict) - self.assertTrue(paddle.allclose(model.param_A, state_dict["param_A"])) - self.assertTrue(paddle.allclose(model.param_B, state_dict["param_B"])) - self.assertTrue(paddle.allclose(model.param_a, state_dict["param_a"])) - self.assertTrue(paddle.allclose(model.param_b, 
state_dict["param_b"])) - - -class TestReftModel(unittest.TestCase): - def test_get_reft_model(self): - model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-llama") - layers = [0] - representations = [ - { - "layer": l, - "component": "block_output", - "low_rank_dimension": 4, - "intervention": LoreftIntervention( - embed_dim=768, - low_rank_dimension=4, - dropout=0.00, - dtype="float32", - act_fn="linear", - device="gpu", - add_bias=False, - ), - } - for l in layers - ] - reft_config = ReFTConfig(representations=representations) - reft_model = ReFTModel(reft_config, model) - reft_model.print_trainable_parameters() - self.assertTrue(type(reft_model), ReFTModel) - - def test_reft_model_forward(self): - model = AutoModelForCausalLM.from_pretrained("Paddleformers/tiny-random-llama") - - layers = [0] - representations = [ - { - "layer": l, - "component": "block_output", - "low_rank_dimension": 4, - "intervention": LoreftIntervention( - embed_dim=768, - low_rank_dimension=4, - dropout=0.00, - dtype="float32", - act_fn="linear", - device="gpu", - add_bias=False, - ), - } - for l in layers - ] - reft_config = ReFTConfig(representations=representations) - reft_model = ReFTModel(reft_config, model) - reft_model.print_trainable_parameters() - outputs = reft_model.model(**{"input_ids": paddle.randint(low=1, high=100, shape=(5, 10))}) - self.assertTrue(outputs[0].shape, [5, 10, 32000]) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/peft/test_vera.py b/tests/peft/test_vera.py deleted file mode 100644 index 4e76676d0e8..00000000000 --- a/tests/peft/test_vera.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -import os -import re -import unittest -from tempfile import NamedTemporaryFile, TemporaryDirectory - -import numpy as np -import paddle -from paddle import nn -from parameterized import parameterized - -from paddleformers.peft.vera import VeRAConfig, VeRALinear, VeRAModel -from paddleformers.transformers import AutoModel - - -class TestVeraLayer(unittest.TestCase): - def test_r_raise_exception(self): - with self.assertRaises(ValueError): - VeRALinear( - in_features=16, - out_features=16, - r=0, - vera_dropout=0.1, - vera_alpha=4, - base_linear_module=nn.Linear(in_features=16, out_features=16), - ) - - def test_forward(self): - vera_layer = VeRALinear( - in_features=16, - out_features=16, - r=4, - vera_dropout=0.1, - vera_alpha=4, - base_linear_module=nn.Linear(16, 16), - pissa_init=True, - ) - x = paddle.randn([2, 4, 16], "float32") - output = vera_layer(x) - self.assertFalse(vera_layer.vera_b.stop_gradient) - self.assertFalse(vera_layer.vera_d.stop_gradient) - self.assertTrue(vera_layer.weight.stop_gradient) - self.assertFalse(vera_layer.bias.stop_gradient) - self.assertEqual(output.shape, [2, 4, 16]) - - def test_train_eval(self): - x = paddle.randn([2, 4, 16], "float32") - vera_layer = VeRALinear( - in_features=16, out_features=16, r=4, base_linear_module=nn.Linear(in_features=16, out_features=16) - ) - vera_layer.train() - train_result = vera_layer(x) - train_weight = copy.deepcopy(vera_layer.weight) # deep copy since this is a pointer - vera_layer.eval() - eval_result = vera_layer(x) - eval_weight = vera_layer.weight - self.assertTrue(paddle.allclose(train_result, eval_result)) - self.assertTrue(paddle.allclose(train_weight, eval_weight)) - - def test_save_load(self): - with TemporaryDirectory() as tempdir: - vera_layer = VeRALinear( - in_features=16, out_features=16, r=4, base_linear_module=nn.Linear(in_features=16, out_features=16) - ) - weights_path = os.path.join(tempdir, "model.pdparams") - paddle.save(vera_layer.state_dict(), weights_path) - new_vera_layer = VeRALinear( - in_features=16, out_features=16, r=4, base_linear_module=nn.Linear(in_features=16, out_features=16) - ) - state_dict = paddle.load(weights_path) - new_vera_layer.set_dict(state_dict) - x = paddle.randn([2, 4, 16], "float32") - self.assertTrue(paddle.allclose(new_vera_layer(x), vera_layer(x))) - - def test_load_regular_linear(self): - with TemporaryDirectory() as tempdir: - regular_linear = paddle.nn.Linear(in_features=16, out_features=16) - weights_path = os.path.join(tempdir, "model.pdparams") - paddle.save(regular_linear.state_dict(), weights_path) - state_dict = paddle.load(weights_path) - # should be identical to regular linear - vera_layer_r8 = VeRALinear( - in_features=16, out_features=16, r=8, base_linear_module=nn.Linear(in_features=16, out_features=16) - ) - vera_layer_r4 = VeRALinear( - in_features=16, out_features=16, r=4, base_linear_module=nn.Linear(in_features=16, out_features=16) - ) - vera_layer_r8.set_dict(state_dict) - vera_layer_r4.set_dict(state_dict) - x = paddle.randn([2, 4, 16], "float32") - self.assertTrue(paddle.allclose(vera_layer_r8(x), regular_linear(x))) - self.assertTrue(paddle.allclose(vera_layer_r4(x), regular_linear(x))) - - -class TestVeraModel(unittest.TestCase): - @parameterized.expand([(None,), ("all",), ("vera",)]) - def test_vera_model_constructor(self, bias): - vera_config = VeRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], r=4, vera_alpha=4, head_dim=2, pissa_init=True - ) - # turn off plm dropout for to test train vs test - model = 
AutoModel.from_pretrained( - "Paddleformers/tiny-random-bert", - hidden_dropout_prob=0, - attention_probs_dropout_prob=0, - ) - vera_model = VeRAModel(model, vera_config) - vera_model.mark_only_vera_as_trainable() - - for name, weight in vera_model.state_dict().items(): - if any([re.fullmatch(target_module, name) for target_module in vera_config.target_modules]): - if "vera_b" in name or "vera_d" in name: - self.assertFalse(weight.stop_gradient) - else: - self.assertTrue(weight.stop_gradient) - - input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) - vera_model.train() - train_forward_results = vera_model(input_ids) - self.assertIsNotNone(train_forward_results) - vera_model.eval() - eval_forward_results = vera_model(input_ids) - self.assertIsNotNone(eval_forward_results) - self.assertTrue(paddle.allclose(train_forward_results[0], eval_forward_results[0])) - - def test_vera_model_save_load(self): - with TemporaryDirectory() as tempdir: - input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) - vera_config = VeRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], - r=4, - vera_alpha=4, - ) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") - vera_model = VeRAModel(model, vera_config) - vera_model.eval() - original_results = vera_model(input_ids) - vera_model.save_pretrained(tempdir) - - loaded_vera_model = VeRAModel.from_pretrained(model, tempdir) - loaded_vera_model.eval() - loaded_results = loaded_vera_model(input_ids) - self.assertTrue(paddle.allclose(original_results[0], loaded_results[0])) - - config_loaded_vera_model = VeRAModel.from_pretrained(model, tempdir, vera_config=vera_config) - config_loaded_vera_model.eval() - config_loaded_results = config_loaded_vera_model(input_ids) - self.assertTrue(paddle.allclose(original_results[0], config_loaded_results[0])) - - def test_restore_original_model(self): - vera_config = VeRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], - r=4, - vera_alpha=4, - ) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") - vera_model = VeRAModel(model, vera_config) - with self.assertRaises(NotImplementedError): - vera_model.restore_original_model() - - def test_vera_module_raise_exception(self): - vera_config = VeRAConfig(target_modules=[".*norm1.*"], r=4, vera_alpha=4) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") - with self.assertRaises(ValueError): - VeRAModel(model, vera_config) - - def test_pissa_raise_exception(self): - vera_config = VeRAConfig(target_modules=[".*q_proj.*"], r=4, vera_alpha=8, pissa_init=True) - model = AutoModel.from_pretrained("Paddleformers/tiny-random-bert") - with self.assertRaises(AssertionError): - VeRAModel(model, vera_config) - - -class TestVeRAConfig(unittest.TestCase): - def test_save_load(self): - with TemporaryDirectory() as tempdir: - vera_config = VeRAConfig() - vera_config.save_pretrained(tempdir) - loaded_vera_config = VeRAConfig.from_pretrained(tempdir) - self.assertEqual(vera_config, loaded_vera_config) - - def test_save_load_err(self): - with NamedTemporaryFile("w+t") as f: - with self.assertRaises(ValueError): - VeRAConfig.from_pretrained(f.name) - - def test_save_pretrained_file_error(self): - with NamedTemporaryFile("w+t") as f: - vera_config = VeRAConfig() - with self.assertRaises(AssertionError): - vera_config.save_pretrained(f.name) diff --git a/tests/transformers/auto/test_configuration.py b/tests/transformers/auto/test_configuration.py index 89251bdf917..c6d21398e9c 100644 --- 
a/tests/transformers/auto/test_configuration.py +++ b/tests/transformers/auto/test_configuration.py @@ -22,7 +22,7 @@ from paddleformers.transformers import AutoConfig from paddleformers.transformers.auto.configuration import CONFIG_MAPPING -from paddleformers.transformers.bert.configuration import BertConfig +from paddleformers.transformers.qwen3.configuration import Qwen3Config from paddleformers.utils.env import CONFIG_NAME from tests.testing_utils import slow @@ -31,7 +31,7 @@ class AutoConfigTest(unittest.TestCase): def test_built_in_model_class_config(self): - config = AutoConfig.from_pretrained("PaddleFormers/tiny-random-bert", download_hub="aistudio") + config = AutoConfig.from_pretrained("PaddleFormers/tiny-random-qwen3", download_hub="aistudio") number = random.randint(0, 10000) self.assertEqual(config.hidden_size, 32) config.hidden_size = number @@ -77,7 +77,7 @@ def test_from_modelscope(self): def test_load_from_legacy_config(self): number = random.randint(0, 10000) - legacy_config = {"init_class": "BertModel", "hidden_size": number} + legacy_config = {"init_class": "Qwen3Model", "hidden_size": number} with tempfile.TemporaryDirectory() as tempdir: with open(os.path.join(tempdir, AutoConfig.legacy_config_file), "w", encoding="utf-8") as f: json.dump(legacy_config, f, ensure_ascii=False) @@ -94,7 +94,7 @@ def test_new_config_registration(self): AutoConfig.register("model", CustomConfig) # Trying to register something existing in the PaddleFormers library will raise an error with self.assertRaises(ValueError): - AutoConfig.register("bert", BertConfig) + AutoConfig.register("qwen3", Qwen3Config) # Now that the config is registered, it can be used as any other config with the auto-API config = CustomConfig() @@ -109,7 +109,7 @@ def test_new_config_registration(self): @slow def test_from_pretrained_cache_dir(self): - model_id = "Paddleformers/tiny-random-bert" + model_id = "Paddleformers/tiny-random-qwen3" with tempfile.TemporaryDirectory() as tempdir: AutoConfig.from_pretrained(model_id, download_hub="aistudio", cache_dir=tempdir) self.assertTrue(os.path.exists(os.path.join(tempdir, model_id, CONFIG_NAME))) diff --git a/tests/transformers/test_configuration_common.py b/tests/transformers/test_configuration_common.py index a6995c74920..693bc13d634 100644 --- a/tests/transformers/test_configuration_common.py +++ b/tests/transformers/test_configuration_common.py @@ -22,7 +22,7 @@ from requests.exceptions import HTTPError -from paddleformers.transformers import BertConfig +from paddleformers.transformers import Qwen3Config from paddleformers.transformers.configuration_utils import PretrainedConfig @@ -245,10 +245,10 @@ def test_cached_files_are_used_when_internet_is_down(self): response_mock.raise_for_status.side_effect = HTTPError # Download this model to make sure it's in the cache. - _ = BertConfig.from_pretrained("Paddleformers/tiny-random-bert") + _ = Qwen3Config.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) # Under the mock environment we get a 500 error when trying to reach the model. 
with mock.patch("transformers.utils.hub.requests.head", return_value=response_mock) as mock_head: - _ = BertConfig.from_pretrained("Paddleformers/tiny-random-bert") + _ = Qwen3Config.from_pretrained("Paddleformers/tiny-random-qwen3", convert_from_hf=True) # This check we did call the fake head request mock_head.assert_called() diff --git a/tests/transformers/test_configuration_utils.py b/tests/transformers/test_configuration_utils.py index ced72aaa1c4..b68f2eb30b6 100644 --- a/tests/transformers/test_configuration_utils.py +++ b/tests/transformers/test_configuration_utils.py @@ -18,7 +18,7 @@ import unittest from typing import Dict, Optional -from paddleformers.transformers import BertConfig +from paddleformers.transformers import Qwen3Config from paddleformers.transformers.configuration_utils import ( PretrainedConfig, attribute_map, @@ -131,32 +131,32 @@ def test_get_value_with_default_from_config(self): class StandardConfigMappingTest(unittest.TestCase): - def test_bert_config_mapping(self): - # create new fake-bert class to prevent static-attributed modified by this test - class FakeBertConfig(BertConfig): + def test_qwen3_config_mapping(self): + # create new fake-qwen3 class to prevent static-attributed modified by this test + class FakeQwen3Config(Qwen3Config): pass - config = FakeBertConfig.from_pretrained("Paddleformers/tiny-random-bert") + config = FakeQwen3Config.from_pretrained("Paddleformers/tiny-random-qwen3") hidden_size = config.hidden_size - FakeBertConfig.attribute_map = {"fake_field": "hidden_size"} + FakeQwen3Config.attribute_map = {"fake_field": "hidden_size"} - loaded_config = FakeBertConfig.from_pretrained("Paddleformers/tiny-random-bert") + loaded_config = FakeQwen3Config.from_pretrained("Paddleformers/tiny-random-qwen3") fake_field = loaded_config.fake_field self.assertEqual(fake_field, hidden_size) @slow def test_from_pretrained_cache_dir(self): - model_id = "Paddleformers/tiny-random-bert" + model_id = "Paddleformers/tiny-random-qwen3" with tempfile.TemporaryDirectory() as tempdir: - BertConfig.from_pretrained(model_id, cache_dir=tempdir) + Qwen3Config.from_pretrained(model_id, cache_dir=tempdir) self.assertTrue(os.path.exists(os.path.join(tempdir, model_id, CONFIG_NAME))) # check against double appending model_name in cache_dir self.assertFalse(os.path.exists(os.path.join(tempdir, model_id, model_id))) def test_load_from_hf(self): """test load config from hf""" - config = BertConfig.from_pretrained("Baicai003/tiny-bert", download_hub="huggingface") + config = Qwen3Config.from_pretrained("Paddleformers/tiny-random-qwen3", download_hub="huggingface") self.assertEqual(config.hidden_size, 16) with tempfile.TemporaryDirectory() as tempdir: @@ -164,24 +164,24 @@ def test_load_from_hf(self): self.assertTrue(os.path.exists(os.path.join(tempdir, CONFIG_NAME))) - loaded_config = BertConfig.from_pretrained(tempdir) + loaded_config = Qwen3Config.from_pretrained(tempdir) self.assertEqual(loaded_config.hidden_size, 16) def test_config_mapping(self): - # create new fake-bert class to prevent static-attributed modified by this test - class FakeBertConfig(BertConfig): + # create new fake-qwen3 class to prevent static-attributed modified by this test + class FakeQwen3Config(Qwen3Config): pass with tempfile.TemporaryDirectory() as tempdir: - config = FakeBertConfig.from_pretrained("PaddleFormers/tiny-random-bert") + config = FakeQwen3Config.from_pretrained("PaddleFormers/tiny-random-qwen3") config.save_pretrained(tempdir) # rename `config.json` -> `model_config.json` 
shutil.move(os.path.join(tempdir, CONFIG_NAME), os.path.join(tempdir, LEGACY_CONFIG_NAME)) - FakeBertConfig.attribute_map = {"fake_field": "hidden_size"} + FakeQwen3Config.attribute_map = {"fake_field": "hidden_size"} - loaded_config = FakeBertConfig.from_pretrained(tempdir) + loaded_config = FakeQwen3Config.from_pretrained(tempdir) self.assertEqual(loaded_config.fake_field, config.hidden_size) diff --git a/tests/transformers/test_modeling_utils.py b/tests/transformers/test_modeling_utils.py index 57f19326b1a..92b14d7bbf5 100644 --- a/tests/transformers/test_modeling_utils.py +++ b/tests/transformers/test_modeling_utils.py @@ -16,19 +16,19 @@ import unittest from tempfile import TemporaryDirectory -from paddleformers.transformers import BertModel +from paddleformers.transformers import Qwen3Model from paddleformers.utils.env import CONFIG_NAME, PADDLE_WEIGHTS_NAME from tests.testing_utils import slow -def download_bert_model(model_name: str): +def download_qwen3_model(model_name: str): """set the global method: multiprocessing can not pickle local method Args: model_name (str): the model name """ - model = BertModel.from_pretrained(model_name) + model = Qwen3Model.from_pretrained(model_name) # free the model resource del model @@ -38,9 +38,9 @@ class TestModeling(unittest.TestCase): @slow def test_from_pretrained_cache_dir_community_model(self): - model_name = "Paddleformers/tiny-random-bert" + model_name = "Paddleformers/tiny-random-qwen3" with TemporaryDirectory() as tempdir: - BertModel.from_pretrained(model_name, cache_dir=tempdir) + Qwen3Model.from_pretrained(model_name, cache_dir=tempdir) self.assertTrue(os.path.exists(os.path.join(tempdir, model_name, CONFIG_NAME))) self.assertTrue(os.path.exists(os.path.join(tempdir, model_name, PADDLE_WEIGHTS_NAME))) # check against double appending model_name in cache_dir @@ -48,9 +48,9 @@ def test_from_pretrained_cache_dir_community_model(self): @slow def test_from_pretrained_cache_dir_pretrained_init(self): - model_name = "PaddleFormers/tiny-random-bert" + model_name = "PaddleFormers/tiny-random-qwen3" with TemporaryDirectory() as tempdir: - BertModel.from_pretrained(model_name, cache_dir=tempdir) + Qwen3Model.from_pretrained(model_name, cache_dir=tempdir) self.assertTrue(os.path.exists(os.path.join(tempdir, model_name, CONFIG_NAME))) self.assertTrue(os.path.exists(os.path.join(tempdir, model_name, PADDLE_WEIGHTS_NAME))) # check against double appending model_name in cache_dir diff --git a/tests/transformers/test_shard_checkpoint.py b/tests/transformers/test_shard_checkpoint.py index 3f3a9967f37..953f7993a33 100644 --- a/tests/transformers/test_shard_checkpoint.py +++ b/tests/transformers/test_shard_checkpoint.py @@ -21,10 +21,10 @@ from paddleformers.transformers import ( AutoConfig, - BertModel, LlamaModel, PretrainedConfig, PretrainedModel, + Qwen3Model, register_base_model, ) from paddleformers.transformers.model_utils import ( @@ -127,12 +127,12 @@ def inner_convert_test(src_dtype, dst_dtype): str_src_dtype = str(src_dtype)[dtype_prefix_len:] str_dst_dtype = str(dst_dtype)[dtype_prefix_len:] - config = AutoConfig.from_pretrained("Paddleformers/tiny-random-bert") - model = BertModel.from_config(config, dtype=str_src_dtype) + config = AutoConfig.from_pretrained("Paddleformers/tiny-random-qwen3") + model = Qwen3Model.from_config(config, dtype=str_src_dtype) with tempfile.TemporaryDirectory() as tmp_dir: model.save_pretrained(tmp_dir) - new_model = BertModel.from_pretrained(tmp_dir, dtype=str_dst_dtype) + new_model = 
Qwen3Model.from_pretrained(tmp_dir, dtype=str_dst_dtype) for k, v in model.state_dict().items(): if v.is_floating_point(): @@ -275,7 +275,7 @@ def test_checkpoint_sharding_from_hub(self): self.assertTrue(paddle.allclose(p1, p2)) def test_checkpoint_variant_local(self): - model = BertModel.from_pretrained("Paddleformers/tiny-random-bert") + model = Qwen3Model.from_pretrained("Paddleformers/tiny-random-qwen3") with tempfile.TemporaryDirectory() as tmp_dir: model.save_pretrained(tmp_dir, variant="v2") @@ -287,15 +287,15 @@ def test_checkpoint_variant_local(self): self.assertFalse(os.path.isfile(os.path.join(tmp_dir, PADDLE_WEIGHTS_NAME))) with self.assertRaises(EnvironmentError): - _ = BertModel.from_pretrained(tmp_dir) + _ = Qwen3Model.from_pretrained(tmp_dir) - new_model = BertModel.from_pretrained(tmp_dir, variant="v2") + new_model = Qwen3Model.from_pretrained(tmp_dir, variant="v2") for p1, p2 in zip(model.parameters(), new_model.parameters()): self.assertTrue(paddle.allclose(p1, p2)) def test_checkpoint_variant_local_sharded(self): - model = BertModel.from_pretrained("Paddleformers/tiny-random-bert") + model = Qwen3Model.from_pretrained("Paddleformers/tiny-random-qwen3") with tempfile.TemporaryDirectory() as tmp_dir: model.save_pretrained(tmp_dir, variant="v2", max_shard_size="50kB") @@ -311,16 +311,16 @@ def test_checkpoint_variant_local_sharded(self): self.assertTrue(os.path.isfile(weights_name_file)) with self.assertRaises(EnvironmentError): - _ = BertModel.from_pretrained(tmp_dir) + _ = Qwen3Model.from_pretrained(tmp_dir) - new_model = BertModel.from_pretrained(tmp_dir, variant="v2") + new_model = Qwen3Model.from_pretrained(tmp_dir, variant="v2") for p1, p2 in zip(model.parameters(), new_model.parameters()): self.assertTrue(paddle.allclose(p1, p2)) @require_package("safetensors") def test_checkpoint_variant_local_safe(self): - model = BertModel.from_pretrained("Paddleformers/tiny-random-bert") + model = Qwen3Model.from_pretrained("Paddleformers/tiny-random-qwen3") with tempfile.TemporaryDirectory() as tmp_dir: model.save_pretrained(tmp_dir, variant="v2", safe_serialization=True) @@ -333,16 +333,16 @@ def test_checkpoint_variant_local_safe(self): self.assertFalse(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME))) with self.assertRaises(EnvironmentError): - _ = BertModel.from_pretrained(tmp_dir) + _ = Qwen3Model.from_pretrained(tmp_dir) - new_model = BertModel.from_pretrained(tmp_dir, variant="v2") + new_model = Qwen3Model.from_pretrained(tmp_dir, variant="v2") for p1, p2 in zip(model.parameters(), new_model.parameters()): self.assertTrue(paddle.allclose(p1, p2)) @require_package("safetensors") def test_checkpoint_variant_local_sharded_safe(self): - model = BertModel.from_pretrained("Paddleformers/tiny-random-bert") + model = Qwen3Model.from_pretrained("Paddleformers/tiny-random-qwen3") with tempfile.TemporaryDirectory() as tmp_dir: model.save_pretrained(tmp_dir, variant="v2", max_shard_size="50kB", safe_serialization=True) @@ -358,9 +358,9 @@ def test_checkpoint_variant_local_sharded_safe(self): self.assertTrue(os.path.isfile(weights_name_file)) with self.assertRaises(EnvironmentError): - _ = BertModel.from_pretrained(tmp_dir) + _ = Qwen3Model.from_pretrained(tmp_dir) - new_model = BertModel.from_pretrained(tmp_dir, variant="v2") + new_model = Qwen3Model.from_pretrained(tmp_dir, variant="v2") for p1, p2 in zip(model.parameters(), new_model.parameters()): self.assertTrue(paddle.allclose(p1, p2)) diff --git a/tests/transformers/test_utils.py b/tests/transformers/test_utils.py 
index 8eb23833b4a..22555f1f9a8 100644 --- a/tests/transformers/test_utils.py +++ b/tests/transformers/test_utils.py @@ -16,8 +16,8 @@ import os import unittest -from paddleformers.transformers import BertModel, utils -from paddleformers.transformers.bert.modeling import BertForTokenClassification +from paddleformers.transformers import Qwen3Model, utils +from paddleformers.transformers.qwen3.modeling import Qwen3ForTokenClassification class TestUtils(unittest.TestCase): @@ -25,8 +25,8 @@ class TestUtils(unittest.TestCase): def test_find_transformer_model_type(self): """test for `find_transformer_model_type`""" - self.assertEqual(utils.find_transformer_model_type(BertModel), "bert") - self.assertEqual(utils.find_transformer_model_type(BertForTokenClassification), "bert") + self.assertEqual(utils.find_transformer_model_type(Qwen3Model), "qwen3") + self.assertEqual(utils.find_transformer_model_type(Qwen3ForTokenClassification), "qwen3") def check_json_file_has_correct_format(file_path):
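
For reference, a minimal sketch (not part of the patch itself) of the loading pattern the migrated tests rely on after the BERT removal; it only uses identifiers and arguments that already appear in the hunks above ("Paddleformers/tiny-random-qwen3", convert_from_hf, dtype):

    # Illustrative only: mirrors the calls introduced in the updated tests.
    from paddleformers.transformers import AutoModelForCausalLM, Qwen3Model

    # Causal-LM style tests (LoRA variants, mergekit) load the tiny Qwen3
    # checkpoint and convert the weights from the Hugging Face layout.
    model = AutoModelForCausalLM.from_pretrained(
        "Paddleformers/tiny-random-qwen3", convert_from_hf=True, dtype="bfloat16"
    )

    # Base-model tests (shard-checkpoint, modeling-utils) load the backbone directly.
    base_model = Qwen3Model.from_pretrained("Paddleformers/tiny-random-qwen3")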