From 6d1c427808b927d3931ef1e3a6202d42c80005cc Mon Sep 17 00:00:00 2001 From: Denis Kocetkov Date: Mon, 24 Feb 2025 15:13:30 +0200 Subject: [PATCH 01/10] added add_linear_biases as dict of sublayers keys --- fast_llm/layers/transformer/attention.py | 13 ++- fast_llm/layers/transformer/config.py | 76 +++++++++++++- .../layers/transformer/mixture_of_experts.py | 4 +- fast_llm/layers/transformer/mlp.py | 13 +-- fast_llm/layers/transformer/transformer.py | 2 +- tests/test_config.py | 99 ++++++++++++++++++- 6 files changed, 192 insertions(+), 15 deletions(-) diff --git a/fast_llm/layers/transformer/attention.py b/fast_llm/layers/transformer/attention.py index 8071a086..86572dd7 100644 --- a/fast_llm/layers/transformer/attention.py +++ b/fast_llm/layers/transformer/attention.py @@ -10,7 +10,12 @@ from fast_llm.functional.rotary import apply_rotary_embeddings from fast_llm.functional.triton.rotary import triton_rotary_autograd_ from fast_llm.layers.common.linear import InputParallelLinear, OutputParallelLinear -from fast_llm.layers.transformer.config import TransformerConfig, TransformerDimNames, TransformerKwargs +from fast_llm.layers.transformer.config import ( + TransformerConfig, + TransformerDimNames, + TransformerKwargs, + TransformerSubLayerKeys, +) from fast_llm.logging import log_distributed_grad, log_distributed_tensor from fast_llm.tensor import TensorMeta, init_normal_, init_zeros_ from fast_llm.utils import Assert @@ -102,7 +107,7 @@ def __init__( self.query = OutputParallelLinear( hidden_dim, self._tensor_space.get_tensor_dim(TransformerDimNames.composite_query), - bias=self._config.add_linear_biases, + bias=self._config.should_add_linear_bias(self._layer_index, TransformerSubLayerKeys.attn_query), weight_init_method=init_method_qkv, bias_init_method=init_method_qkv if self._config.random_bias_init else init_zeros_, sequence_parallel=self._sequence_parallel, @@ -111,7 +116,7 @@ def __init__( self.key_value = OutputParallelLinear( hidden_dim, self._tensor_space.get_tensor_dim(TransformerDimNames.composite_key_value), - bias=self._config.add_linear_biases, + bias=self._config.should_add_linear_bias(self._layer_index, TransformerSubLayerKeys.attn_key_value), weight_init_method=init_method_qkv, bias_init_method=init_method_qkv if self._config.random_bias_init else init_zeros_, sequence_parallel=self._sequence_parallel, @@ -123,7 +128,7 @@ def __init__( self.dense = InputParallelLinear( self._tensor_space.get_tensor_dim(TransformerDimNames.composite_dense), hidden_dim, - bias=self._config.add_linear_biases, + bias=self._config.should_add_linear_bias(self._layer_index, TransformerSubLayerKeys.attn_dense), weight_init_method=init_method_std_attn_proj, bias_init_method=init_method_std_attn_proj if self._config.random_bias_init else init_zeros_, sequence_parallel=self._sequence_parallel, diff --git a/fast_llm/layers/transformer/config.py b/fast_llm/layers/transformer/config.py index 1b4e7749..de4dcaee 100644 --- a/fast_llm/layers/transformer/config.py +++ b/fast_llm/layers/transformer/config.py @@ -1,6 +1,8 @@ import enum +import itertools import logging import math +import re import typing import warnings @@ -149,6 +151,14 @@ class RotaryConfig(RotaryArchitectureConfig, BaseModelConfig): pass +class TransformerSubLayerKeys(str, enum.Enum): + attn_query = "layers.self_attn.query" + attn_key_value = "layers.self_attn.key_value" + attn_dense = "layers.self_attn.dense" + mlp_layer1 = "layers.mlp.layer_1" + mlp_layer2 = "layers.mlp.layer_2" + + @config_class() class 
TransformerArchitectureConfig(BaseModelArchitectureConfig): _abstract = False @@ -174,7 +184,11 @@ class TransformerArchitectureConfig(BaseModelArchitectureConfig): hint=FieldHint.core, valid=check_field(Assert.gt, 0), ) - add_linear_biases: bool = Field(default=True, desc="Add biases to all dense layers.", hint=FieldHint.core) + add_linear_biases: bool | dict[TransformerSubLayerKeys, str] = Field( + default=True, + desc="Add biases to all or selected dense layers. Accepted values: True, False, or a dict with keys from TransformerSubLayerKeys and values as '*' or index ranges.", + hint=FieldHint.core, + ) ffn_hidden_size: int = Field( default=None, desc="Hidden dimension of the MLP intermediate state. Default: 4 * hidden_size.", @@ -234,6 +248,10 @@ class TransformerArchitectureConfig(BaseModelArchitectureConfig): hint=FieldHint.feature, ) + _parsed_add_linear_biases: bool | dict[TransformerSubLayerKeys, set[int] | str] = Field( + default=None, init=False, repr=False + ) + def _validate(self) -> None: if self.ffn_hidden_size is None: self.ffn_hidden_size = 4 * self.hidden_size @@ -243,7 +261,13 @@ def _validate(self) -> None: self.activation_type = ActivationType.silu if self.gated else ActivationType.gelu self.projection_size = self.num_attention_heads * self.kv_channels self.num_unshared_experts = self.num_experts - self.num_shared_experts + + # Validate before parent validate to have assertion error on invalid key for TransformerSubLayerKeys + self._validate_add_linear_biases() + self._parse_add_linear_biases() + super()._validate() + if not TritonConfig.TRITON_ENABLED: warnings.warn("Triton is disabled, but triton rotary kernel will be used anyway.") @@ -251,6 +275,56 @@ def _validate(self) -> None: Assert.leq(self.num_shared_experts + self.num_experts_per_token, self.num_experts) Assert.multiple(self.num_attention_heads, self.head_groups) + def _validate_add_linear_biases(self) -> None: + """Validate the `add_linear_biases` parameter.""" + if isinstance(self.add_linear_biases, dict): + Assert.gt(len(self.add_linear_biases), 0) + for key, value in self.add_linear_biases.items(): + Assert.incl(key, TransformerSubLayerKeys) # Assert valid sublayer key + Assert.custom( + lambda val: val == "*" or re.match(r"^\d+(:\d+(:\d+)?)?(,\s*\d+(:\d+(:\d+)?)?)*$", val), + value, + ) # Assert valid range format + + def _parse_add_linear_biases(self) -> bool | dict[TransformerSubLayerKeys, set[int] | str]: + """Parse `add_linear_biases` and store the result for quick lookup.""" + if isinstance(self.add_linear_biases, bool): + self._parsed_add_linear_biases = self.add_linear_biases + return + + parsed = {} + for key, value in self.add_linear_biases.items(): + parsed[key] = self._parse_indices(value) + self._parsed_add_linear_biases = parsed + + def _parse_indices(self, indices_str: str) -> set[int]: + """Parse layer indices from a string like '1:10:2, 20, 30' or '*'.""" + indices = [] + # Layers are numbered from 1 as 0 layer is embedding layer in Fast-LLM + if indices_str == "*": + indices.extend(range(1, self.num_layers + 1)) + else: + for part in indices_str.split(","): + part = part.strip() + if ":" in part: + parts = list(map(int, part.split(":"))) + start, stop = parts[0] + 1, parts[1] + 1 + step = parts[2] if len(parts) == 3 else 1 + indices.extend(range(start, stop, step)) + else: + indices.append(int(part) + 1) + return set(itertools.chain(indices)) + + def should_add_linear_bias(self, layer_index: int, sublayer_key: TransformerSubLayerKeys) -> bool: + """Check if linear bias should be added 
for a given layer and sublayer.""" + if isinstance(self._parsed_add_linear_biases, bool): + return self._parsed_add_linear_biases + + if sublayer_key in self._parsed_add_linear_biases: + return layer_index in self._parsed_add_linear_biases[sublayer_key] + + return False + @classmethod def _from_dict( cls, diff --git a/fast_llm/layers/transformer/mixture_of_experts.py b/fast_llm/layers/transformer/mixture_of_experts.py index 85c6686f..e374f31f 100644 --- a/fast_llm/layers/transformer/mixture_of_experts.py +++ b/fast_llm/layers/transformer/mixture_of_experts.py @@ -40,11 +40,11 @@ class MixtureOfExpertMLP(MLPBase): _group: ProcessGroup - def __init__(self, config: TransformerConfig, tensor_space: TensorSpace, name: str = "mlp"): + def __init__(self, config: TransformerConfig, tensor_space: TensorSpace, layer_index: int, name: str = "mlp"): Assert.gt(config.num_experts, 1) # TODO: Implement? assert not config.add_linear_biases, "Biases not supported for MoE." - super().__init__(config, tensor_space, name) + super().__init__(config, tensor_space, layer_index, name) self._config = config self._tensor_space = tensor_space self._debug_mode = self._config.debug_transformer or self._config.debug_transformer_memory diff --git a/fast_llm/layers/transformer/mlp.py b/fast_llm/layers/transformer/mlp.py index 76ebfcc0..02b0b4ca 100644 --- a/fast_llm/layers/transformer/mlp.py +++ b/fast_llm/layers/transformer/mlp.py @@ -8,15 +8,16 @@ from fast_llm.functional.config import TritonConfig from fast_llm.functional.triton.mlp import mlp_autograd, torch_mlp_activation, triton_mlp_activation_autograd from fast_llm.layers.common.linear import LinearBase -from fast_llm.layers.transformer.config import TransformerConfig, TransformerDimNames +from fast_llm.layers.transformer.config import TransformerConfig, TransformerDimNames, TransformerSubLayerKeys from fast_llm.tensor import init_normal_, init_zeros_ from fast_llm.utils import Assert class MLPBase(Layer, ABC): - def __init__(self, config: TransformerConfig, tensor_space: TensorSpace, name: str = "mlp"): + def __init__(self, config: TransformerConfig, tensor_space: TensorSpace, layer_index: int, name: str = "mlp"): super().__init__() self._name = name + self._layer_index = layer_index init_method_1 = init_normal_( std=config.init_method_std_mlp_1, @@ -42,7 +43,7 @@ def __init__(self, config: TransformerConfig, tensor_space: TensorSpace, name: s self.layer_1 = LinearBase( hidden_dim, tensor_space.get_tensor_dim(TransformerDimNames.composite_gated_expert_mlp), - bias=config.add_linear_biases, + bias=config.should_add_linear_bias(self._layer_index, TransformerSubLayerKeys.mlp_layer1), weight_init_method=init_method_1, bias_init_method=init_method_1 if config.random_bias_init else init_zeros_, lr_scale=tuple(config.mlp_lr_scale), @@ -50,7 +51,7 @@ def __init__(self, config: TransformerConfig, tensor_space: TensorSpace, name: s self.layer_2 = LinearBase( self._intermediate_dim, hidden_dim, - bias=config.add_linear_biases, + bias=config.should_add_linear_bias(self._layer_index, TransformerSubLayerKeys.mlp_layer2), weight_init_method=init_method_2, bias_init_method=init_method_2 if config.random_bias_init else init_zeros_, auto_bias_grad_accumulation=tensor_space.distributed_config.tensor_parallel > 1, @@ -60,9 +61,9 @@ def __init__(self, config: TransformerConfig, tensor_space: TensorSpace, name: s class MLP(MLPBase): - def __init__(self, config: TransformerConfig, tensor_space: TensorSpace, name: str = "mlp"): + def __init__(self, config: TransformerConfig, 
tensor_space: TensorSpace, layer_index: int, name: str = "mlp"): Assert.eq(config.num_experts, 1) - super().__init__(config, tensor_space, name) + super().__init__(config, tensor_space, layer_index, name) def forward( self, diff --git a/fast_llm/layers/transformer/transformer.py b/fast_llm/layers/transformer/transformer.py index 4780dd3a..df326c04 100644 --- a/fast_llm/layers/transformer/transformer.py +++ b/fast_llm/layers/transformer/transformer.py @@ -43,7 +43,7 @@ def __init__( self.self_attn = Attention(self._config, self._tensor_space, layer_index) self.mlp = (MixtureOfExpertMLP if self._config.num_experts > 1 else MLP)( - self._config, self._tensor_space, f"{self.name} mlp" + self._config, self._tensor_space, self._layer_index, f"{self.name} mlp" ) @torch.compile diff --git a/tests/test_config.py b/tests/test_config.py index 86c99a23..2f28f423 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -5,7 +5,11 @@ import yaml -from fast_llm.layers.transformer.config import TransformerConfig +from fast_llm.layers.transformer.config import ( + TransformerConfig, + TransformerArchitectureConfig, + TransformerSubLayerKeys, +) from fast_llm.utils import Assert from fast_llm.engine.distributed.config import DistributedConfig from fast_llm.engine.config_utils.data_type import DataType @@ -84,3 +88,96 @@ def test_do_use_flash_attention(): mock_distributed_config.training_dtype = DataType.float32 with pytest.raises(AssertionError): config.do_use_flash_attention(mock_distributed_config) + + +@pytest.fixture +def config_with_true_biases(): + """Fixture for TransformerArchitectureConfig with True add_linear_biases.""" + return TransformerArchitectureConfig(add_linear_biases=True) + + +@pytest.fixture +def config_with_false_biases(): + """Fixture for TransformerArchitectureConfig with False add_linear_biases.""" + return TransformerArchitectureConfig(add_linear_biases=False) + + +@pytest.fixture +def config_with_dict_biases(): + """Fixture for TransformerArchitectureConfig with dict add_linear_biases.""" + return TransformerArchitectureConfig( + num_layers = 10, + add_linear_biases={ + "layers.self_attn.query": "*", + "layers.mlp.layer_1": "1:10:3, 9", + "layers.mlp.layer_2": "5:7", + } + ) + + +def test_add_linear_biases_bool_true(config_with_true_biases): + """Test case for add_linear_biases set to True (default).""" + assert config_with_true_biases._parsed_add_linear_biases == True + + +def test_add_linear_biases_bool_false(config_with_false_biases): + """Test case for add_linear_biases set to False.""" + assert config_with_false_biases._parsed_add_linear_biases == False + + +def test_add_linear_biases_dict_valid(config_with_dict_biases): + """Test case for add_linear_biases with valid dictionary.""" + assert config_with_dict_biases._parsed_add_linear_biases == { + TransformerSubLayerKeys.attn_query: {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + TransformerSubLayerKeys.mlp_layer1: {2, 5, 8, 10}, + TransformerSubLayerKeys.mlp_layer2: {6, 7}, + } + + +def test_invalid_key_in_dict(): + """Test case where an invalid key is provided in add_linear_biases dictionary.""" + with pytest.raises(AssertionError): + # Using an invalid key in the dictionary. 
+ TransformerArchitectureConfig(add_linear_biases={"invalid_key": "*"}) + + +def test_invalid_range_format(): + """Test case where invalid range format is provided.""" + with pytest.raises(AssertionError): + TransformerArchitectureConfig(add_linear_biases={TransformerSubLayerKeys.mlp_layer1: "1:10:3, abc"}) + + +def test_empty_add_linear_biases(): + """Test case for empty add_linear_biases dictionary.""" + with pytest.raises(AssertionError): # Expecting AssertionError for invalid empty dictionary + TransformerArchitectureConfig(add_linear_biases={}) + + +def test_should_add_linear_bias_for_layer_and_sublayer(config_with_dict_biases): + """Test case for should_add_linear_bias based on layer index and sublayer key.""" + + # Layer 1 and sublayer mlp_layer1 + assert config_with_dict_biases.should_add_linear_bias(1, TransformerSubLayerKeys.mlp_layer1) == False + + # Layer 2 and sublayer mlp_layer1 + assert config_with_dict_biases.should_add_linear_bias(2, TransformerSubLayerKeys.mlp_layer1) == True + + # Layer 9 and sublayer mlp_layer1 + assert config_with_dict_biases.should_add_linear_bias(9, TransformerSubLayerKeys.mlp_layer1) == False + + # Layer 6 and sublayer mlp_layer2 + assert config_with_dict_biases.should_add_linear_bias(6, TransformerSubLayerKeys.mlp_layer2) == True + + # Layer 5 and sublayer attn_query + assert config_with_dict_biases.should_add_linear_bias(5, TransformerSubLayerKeys.attn_query) == True + + +def test_should_add_linear_bias_for_bool_true(config_with_true_biases): + """Test case for add_linear_biases set to True (should always return True).""" + assert config_with_true_biases.should_add_linear_bias(10, TransformerSubLayerKeys.mlp_layer1) == True + + +def test_should_add_linear_bias_for_bool_false(config_with_false_biases): + """Test case for add_linear_biases set to False (should always return False).""" + assert config_with_false_biases.should_add_linear_bias(10, TransformerSubLayerKeys.mlp_layer1) == False + From a4d513cec637f9647d8fc96fb04af15efeac423e Mon Sep 17 00:00:00 2001 From: Denis Kocetkov Date: Mon, 24 Feb 2025 16:33:54 +0200 Subject: [PATCH 02/10] added simple tests for attention and mlp constructors --- tests/test_attention.py | 16 ++++++++++++++++ tests/test_mlp.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 tests/test_mlp.py diff --git a/tests/test_attention.py b/tests/test_attention.py index db856787..c8b91d76 100644 --- a/tests/test_attention.py +++ b/tests/test_attention.py @@ -1,6 +1,8 @@ import unittest.mock from fast_llm.layers.transformer.attention import Attention from fast_llm.layers.transformer.config import TransformerConfig +from fast_llm.engine.distributed.config import DistributedConfig +from fast_llm.engine.config_utils.tensor_space import TensorSpace def test_decide_window_size(): @@ -20,3 +22,17 @@ def test_decide_window_size(): # Arrange - Case 3: max_window_layers is None (always return window_size) attention._config = TransformerConfig(window_size=512, max_window_layers=None) assert attention._decide_window_size() == 512 + + +def test_attention_constructor(): + transformer_conf = TransformerConfig( + num_layers=2, + num_attention_heads=2, + hidden_size=16, + ) + distributed_config = DistributedConfig() + tensor_space = TensorSpace(distributed_config=distributed_config) + transformer_conf.setup_tensor_space(tensor_space) + + Attention(transformer_conf, tensor_space, 1) + diff --git a/tests/test_mlp.py b/tests/test_mlp.py new file mode 100644 index 00000000..4d343ec0 --- /dev/null +++ 
b/tests/test_mlp.py @@ -0,0 +1,33 @@ +from fast_llm.layers.transformer.mlp import MLP +from fast_llm.layers.transformer.mixture_of_experts import MixtureOfExpertMLP +from fast_llm.layers.transformer.config import TransformerConfig +from fast_llm.engine.distributed.config import DistributedConfig +from fast_llm.engine.config_utils.tensor_space import TensorSpace + + +def test_mlp_constructor(): + transformer_conf = TransformerConfig( + num_layers=2, + num_attention_heads=2, + hidden_size=16, + ) + distributed_config = DistributedConfig() + tensor_space = TensorSpace(distributed_config=distributed_config) + transformer_conf.setup_tensor_space(tensor_space) + + MLP(transformer_conf, tensor_space, 1, "name") + + +def test_moe_mlp_constructor(): + transformer_conf = TransformerConfig( + num_layers=2, + num_attention_heads=2, + hidden_size=16, + num_experts=2, + add_linear_biases=False + ) + distributed_config = DistributedConfig() + tensor_space = TensorSpace(distributed_config=distributed_config) + transformer_conf.setup_tensor_space(tensor_space) + + MixtureOfExpertMLP(transformer_conf, tensor_space, 1, "name") From 252aee2bf27086567579f5b6aa641c3616424d6f Mon Sep 17 00:00:00 2001 From: Denis Kocetkov Date: Tue, 25 Feb 2025 15:54:44 +0200 Subject: [PATCH 03/10] partial converter, not working --- fast_llm/models/gpt/config.py | 4 +++ fast_llm/models/gpt/conversion.py | 43 +++++++++++++++++++++++++++++++ tests/common.py | 13 ++++++++++ 3 files changed, 60 insertions(+) diff --git a/fast_llm/models/gpt/config.py b/fast_llm/models/gpt/config.py index 265443bd..6d97b34c 100644 --- a/fast_llm/models/gpt/config.py +++ b/fast_llm/models/gpt/config.py @@ -35,6 +35,9 @@ class Starcoder2GPTHuggingfaceCheckpointFormat(GPTHuggingfaceCheckpointFormat): class LlamaGPTHuggingfaceCheckpointFormat(GPTHuggingfaceCheckpointFormat): name: typing.ClassVar[str] = "llama" +class Qwen2GPTHuggingfaceCheckpointFormat(GPTHuggingfaceCheckpointFormat): + name: typing.ClassVar[str] = "qwen2" + class MistralGPTHuggingfaceCheckpointFormat(GPTHuggingfaceCheckpointFormat): name: typing.ClassVar[str] = "mistral" @@ -98,6 +101,7 @@ class GPTModelConfig(FastLLMModelConfig): AutoGPTHuggingfaceCheckpointFormat, Starcoder2GPTHuggingfaceCheckpointFormat, LlamaGPTHuggingfaceCheckpointFormat, + Qwen2GPTHuggingfaceCheckpointFormat, MistralGPTHuggingfaceCheckpointFormat, MixtralGPTHuggingfaceCheckpointFormat, ) diff --git a/fast_llm/models/gpt/conversion.py b/fast_llm/models/gpt/conversion.py index 8c858f7e..9c4a7bb9 100644 --- a/fast_llm/models/gpt/conversion.py +++ b/fast_llm/models/gpt/conversion.py @@ -28,6 +28,7 @@ GPTArchitectureConfig, GPTModelConfig, LlamaGPTHuggingfaceCheckpointFormat, + Qwen2GPTHuggingfaceCheckpointFormat, MistralGPTHuggingfaceCheckpointFormat, MixtralGPTHuggingfaceCheckpointFormat, Starcoder2GPTHuggingfaceCheckpointFormat, @@ -367,6 +368,47 @@ def _get_mlp_converters(self, fast_llm_prefix: str, hf_prefix: str) -> list[Weig MLPLayer2Converter, ), ] + +class Qwen2HuggingfaceCheckpointHandler(CommonLlamaHuggingfaceCheckpointHandler): + format: typing.ClassVar[type[CheckpointFormat]] = Qwen2GPTHuggingfaceCheckpointFormat + + @classmethod + def _create_config_converters(cls) -> list[ParamConverter]: + return super()._create_config_converters() + [ + ConstantExportParamConverter(export_names=(("architectures",),), export_value=["Qwen2ForCausalLM"]), + # TODO: Llama supports biases + ConstantExportParamConverter(export_names=(("attention_bias",),), export_value=False), + 
ConstantExportParamConverter(export_names=(("mlp_bias",),), export_value=False), + RopeScalingParamConverter( + fast_llm_names=( + ("transformer", "rotary", "type"), + ("transformer", "rotary", "scale_factor"), + ("transformer", "rotary", "low_frequency_factor"), + ("transformer", "rotary", "high_frequency_factor"), + ("transformer", "rotary", "original_context_length"), + ("transformer", "rotary", "attention_factor"), + ("transformer", "rotary", "beta_fast"), + ("transformer", "rotary", "beta_slow"), + ), + export_names=(("rope_scaling",),), + ), + ] + + def _get_mlp_converters(self, fast_llm_prefix: str, hf_prefix: str) -> list[WeightConverter]: + return [ + *self._get_weight_and_bias_converters( + f"{fast_llm_prefix}.mlp.layer_1", + (f"{hf_prefix}.mlp.gate_proj", f"{hf_prefix}.mlp.up_proj"), + False, + SplitWeightConverter, + ), + *self._get_weight_and_bias_converters( + f"{fast_llm_prefix}.mlp.layer_2", + f"{hf_prefix}.mlp.down_proj", + False, + MLPLayer2Converter, + ), + ] class MistralHuggingfaceCheckpointHandler(CommonLlamaHuggingfaceCheckpointHandler): @@ -445,6 +487,7 @@ class AutoGPTHuggingfaceCheckpointHandler( handler_map = { Starcoder2GPTHuggingfaceCheckpointFormat.name: Starcoder2HuggingfaceCheckpointHandler, LlamaGPTHuggingfaceCheckpointFormat.name: LlamaHuggingfaceCheckpointHandler, + Qwen2GPTHuggingfaceCheckpointFormat.name: Qwen2HuggingfaceCheckpointHandler, MistralGPTHuggingfaceCheckpointFormat.name: MistralHuggingfaceCheckpointHandler, MixtralGPTHuggingfaceCheckpointFormat.name: MixtralHuggingfaceCheckpointHandler, } diff --git a/tests/common.py b/tests/common.py index 9e82ab54..a151b8b6 100644 --- a/tests/common.py +++ b/tests/common.py @@ -14,6 +14,7 @@ from fast_llm.data.dataset.gpt.sampled import GPTSample from fast_llm.models.gpt.config import ( LlamaGPTHuggingfaceCheckpointFormat, + Qwen2GPTHuggingfaceCheckpointFormat, MistralGPTHuggingfaceCheckpointFormat, MixtralGPTHuggingfaceCheckpointFormat, Starcoder2GPTHuggingfaceCheckpointFormat, @@ -155,6 +156,11 @@ ] CONFIG_LLAMA3_COMMON = CONFIG_LLAMA3_FAST_LLM + ["model.distributed.training_dtype=bf16"] +# Megatron does not support Llama3-style Rotary Embeddings +CONFIG_QWEN2_MEGATRON = None +CONFIG_QWEN2_FAST_LLM = CONFIG_LLAMA_FAST_LLM +CONFIG_QWEN2_COMMON = CONFIG_QWEN2_FAST_LLM + ["model.distributed.training_dtype=bf16"] + CONFIG_MIXTRAL_MEGATRON = CONFIG_LLAMA_MEGATRON + [ "--num-experts=4", "--moe-router-topk=4", @@ -189,6 +195,13 @@ CONFIG_LLAMA3_COMMON, LlamaGPTHuggingfaceCheckpointFormat, ), + "qwen2": ( + "gpt", + CONFIG_QWEN2_FAST_LLM, + CONFIG_QWEN2_MEGATRON, + CONFIG_QWEN2_COMMON, + Qwen2GPTHuggingfaceCheckpointFormat, + ), "mistral": ( "gpt", CONFIG_LLAMA_FAST_LLM, From 25ab9873fbeaabb5a6c09680f25098402ef46fdd Mon Sep 17 00:00:00 2001 From: Denis Kocetkov Date: Tue, 25 Feb 2025 17:07:11 +0200 Subject: [PATCH 04/10] Qwen2 only add_linear_biases changes --- fast_llm/layers/transformer/attention.py | 18 +-- fast_llm/layers/transformer/config.py | 88 ++++--------- .../layers/transformer/mixture_of_experts.py | 4 +- fast_llm/layers/transformer/mlp.py | 13 +- fast_llm/layers/transformer/transformer.py | 2 +- tests/test_config.py | 120 ++++++------------ tests/test_mlp.py | 4 +- 7 files changed, 82 insertions(+), 167 deletions(-) diff --git a/fast_llm/layers/transformer/attention.py b/fast_llm/layers/transformer/attention.py index 86572dd7..f64de9f1 100644 --- a/fast_llm/layers/transformer/attention.py +++ b/fast_llm/layers/transformer/attention.py @@ -14,7 +14,6 @@ TransformerConfig, TransformerDimNames, 
TransformerKwargs, - TransformerSubLayerKeys, ) from fast_llm.logging import log_distributed_grad, log_distributed_tensor from fast_llm.tensor import TensorMeta, init_normal_, init_zeros_ @@ -107,7 +106,7 @@ def __init__( self.query = OutputParallelLinear( hidden_dim, self._tensor_space.get_tensor_dim(TransformerDimNames.composite_query), - bias=self._config.should_add_linear_bias(self._layer_index, TransformerSubLayerKeys.attn_query), + bias=self._config.add_attn_qkv_bias, weight_init_method=init_method_qkv, bias_init_method=init_method_qkv if self._config.random_bias_init else init_zeros_, sequence_parallel=self._sequence_parallel, @@ -116,7 +115,7 @@ def __init__( self.key_value = OutputParallelLinear( hidden_dim, self._tensor_space.get_tensor_dim(TransformerDimNames.composite_key_value), - bias=self._config.should_add_linear_bias(self._layer_index, TransformerSubLayerKeys.attn_key_value), + bias=self._config.add_attn_qkv_bias, weight_init_method=init_method_qkv, bias_init_method=init_method_qkv if self._config.random_bias_init else init_zeros_, sequence_parallel=self._sequence_parallel, @@ -128,7 +127,7 @@ def __init__( self.dense = InputParallelLinear( self._tensor_space.get_tensor_dim(TransformerDimNames.composite_dense), hidden_dim, - bias=self._config.should_add_linear_bias(self._layer_index, TransformerSubLayerKeys.attn_dense), + bias=self._config.add_attn_dense_bias, weight_init_method=init_method_std_attn_proj, bias_init_method=init_method_std_attn_proj if self._config.random_bias_init else init_zeros_, sequence_parallel=self._sequence_parallel, @@ -279,18 +278,14 @@ def _query_key_value_backward( input_grad.add_(self.key_value.backward(key_grad, context.pop("key_value"))) return input_grad - def _decide_window_size(self) -> int | None: # NOTE: This is a temporal solution for qwen 2.X # https://github.com/huggingface/transformers/blob/5e2183f344911aa82aba0b83778a4f196cff378e/src/transformers/models/qwen2/modular_qwen2.py#L71 # TODO: make universal per layer config window_size = self._config.window_size - if ( - self._config.max_window_layers is not None - and self._layer_index < self._config.max_window_layers - ): + if self._config.max_window_layers is not None and self._layer_index < self._config.max_window_layers: window_size = None - + return window_size def forward(self, input_: torch.Tensor, kwargs: dict[str, typing.Any]) -> tuple[torch.Tensor, torch.Tensor | None]: @@ -342,9 +337,8 @@ def forward(self, input_: torch.Tensor, kwargs: dict[str, typing.Any]) -> tuple[ query = rotary_fn(query, kwargs[TransformerKwargs.rotary_freq_q]) key = rotary_fn(key, kwargs[TransformerKwargs.rotary_freq_k]) - window_size = self._decide_window_size() - + if self._use_flash_attention: input_ = flash_attn( query, diff --git a/fast_llm/layers/transformer/config.py b/fast_llm/layers/transformer/config.py index de4dcaee..6d9888a0 100644 --- a/fast_llm/layers/transformer/config.py +++ b/fast_llm/layers/transformer/config.py @@ -151,12 +151,10 @@ class RotaryConfig(RotaryArchitectureConfig, BaseModelConfig): pass -class TransformerSubLayerKeys(str, enum.Enum): - attn_query = "layers.self_attn.query" - attn_key_value = "layers.self_attn.key_value" - attn_dense = "layers.self_attn.dense" - mlp_layer1 = "layers.mlp.layer_1" - mlp_layer2 = "layers.mlp.layer_2" +class AddLinearBiasChoices(str, enum.Enum): + nowhere = "nowhere" + everywhere = "everywhere" + only_attn_qkv = "only_attn_qkv" @config_class() @@ -184,9 +182,9 @@ class TransformerArchitectureConfig(BaseModelArchitectureConfig): 
hint=FieldHint.core, valid=check_field(Assert.gt, 0), ) - add_linear_biases: bool | dict[TransformerSubLayerKeys, str] = Field( + add_linear_biases: bool | AddLinearBiasChoices = Field( default=True, - desc="Add biases to all or selected dense layers. Accepted values: True, False, or a dict with keys from TransformerSubLayerKeys and values as '*' or index ranges.", + desc="Add biases to all, none or Q, K, V layers. Accepted values: True, False, or AddLinearBiasChoices.", hint=FieldHint.core, ) ffn_hidden_size: int = Field( @@ -248,10 +246,6 @@ class TransformerArchitectureConfig(BaseModelArchitectureConfig): hint=FieldHint.feature, ) - _parsed_add_linear_biases: bool | dict[TransformerSubLayerKeys, set[int] | str] = Field( - default=None, init=False, repr=False - ) - def _validate(self) -> None: if self.ffn_hidden_size is None: self.ffn_hidden_size = 4 * self.hidden_size @@ -262,10 +256,6 @@ def _validate(self) -> None: self.projection_size = self.num_attention_heads * self.kv_channels self.num_unshared_experts = self.num_experts - self.num_shared_experts - # Validate before parent validate to have assertion error on invalid key for TransformerSubLayerKeys - self._validate_add_linear_biases() - self._parse_add_linear_biases() - super()._validate() if not TritonConfig.TRITON_ENABLED: @@ -275,54 +265,28 @@ def _validate(self) -> None: Assert.leq(self.num_shared_experts + self.num_experts_per_token, self.num_experts) Assert.multiple(self.num_attention_heads, self.head_groups) - def _validate_add_linear_biases(self) -> None: - """Validate the `add_linear_biases` parameter.""" - if isinstance(self.add_linear_biases, dict): - Assert.gt(len(self.add_linear_biases), 0) - for key, value in self.add_linear_biases.items(): - Assert.incl(key, TransformerSubLayerKeys) # Assert valid sublayer key - Assert.custom( - lambda val: val == "*" or re.match(r"^\d+(:\d+(:\d+)?)?(,\s*\d+(:\d+(:\d+)?)?)*$", val), - value, - ) # Assert valid range format - - def _parse_add_linear_biases(self) -> bool | dict[TransformerSubLayerKeys, set[int] | str]: - """Parse `add_linear_biases` and store the result for quick lookup.""" + @property + def add_mlp_bias(self) -> bool: if isinstance(self.add_linear_biases, bool): - self._parsed_add_linear_biases = self.add_linear_biases - return - - parsed = {} - for key, value in self.add_linear_biases.items(): - parsed[key] = self._parse_indices(value) - self._parsed_add_linear_biases = parsed - - def _parse_indices(self, indices_str: str) -> set[int]: - """Parse layer indices from a string like '1:10:2, 20, 30' or '*'.""" - indices = [] - # Layers are numbered from 1 as 0 layer is embedding layer in Fast-LLM - if indices_str == "*": - indices.extend(range(1, self.num_layers + 1)) - else: - for part in indices_str.split(","): - part = part.strip() - if ":" in part: - parts = list(map(int, part.split(":"))) - start, stop = parts[0] + 1, parts[1] + 1 - step = parts[2] if len(parts) == 3 else 1 - indices.extend(range(start, stop, step)) - else: - indices.append(int(part) + 1) - return set(itertools.chain(indices)) - - def should_add_linear_bias(self, layer_index: int, sublayer_key: TransformerSubLayerKeys) -> bool: - """Check if linear bias should be added for a given layer and sublayer.""" - if isinstance(self._parsed_add_linear_biases, bool): - return self._parsed_add_linear_biases - - if sublayer_key in self._parsed_add_linear_biases: - return layer_index in self._parsed_add_linear_biases[sublayer_key] + return self.add_linear_biases + if self.add_linear_biases == 
AddLinearBiasChoices.everywhere: + return True + return False + @property + def add_attn_qkv_bias(self) -> bool: + if isinstance(self.add_linear_biases, bool): + return self.add_linear_biases + if self.add_linear_biases == AddLinearBiasChoices.nowhere: + return False + return True + + @property + def add_attn_dense_bias(self) -> bool: + if isinstance(self.add_linear_biases, bool): + return self.add_linear_biases + if self.add_linear_biases == AddLinearBiasChoices.everywhere: + return True return False @classmethod diff --git a/fast_llm/layers/transformer/mixture_of_experts.py b/fast_llm/layers/transformer/mixture_of_experts.py index e374f31f..85c6686f 100644 --- a/fast_llm/layers/transformer/mixture_of_experts.py +++ b/fast_llm/layers/transformer/mixture_of_experts.py @@ -40,11 +40,11 @@ class MixtureOfExpertMLP(MLPBase): _group: ProcessGroup - def __init__(self, config: TransformerConfig, tensor_space: TensorSpace, layer_index: int, name: str = "mlp"): + def __init__(self, config: TransformerConfig, tensor_space: TensorSpace, name: str = "mlp"): Assert.gt(config.num_experts, 1) # TODO: Implement? assert not config.add_linear_biases, "Biases not supported for MoE." - super().__init__(config, tensor_space, layer_index, name) + super().__init__(config, tensor_space, name) self._config = config self._tensor_space = tensor_space self._debug_mode = self._config.debug_transformer or self._config.debug_transformer_memory diff --git a/fast_llm/layers/transformer/mlp.py b/fast_llm/layers/transformer/mlp.py index 02b0b4ca..adc6242d 100644 --- a/fast_llm/layers/transformer/mlp.py +++ b/fast_llm/layers/transformer/mlp.py @@ -8,16 +8,15 @@ from fast_llm.functional.config import TritonConfig from fast_llm.functional.triton.mlp import mlp_autograd, torch_mlp_activation, triton_mlp_activation_autograd from fast_llm.layers.common.linear import LinearBase -from fast_llm.layers.transformer.config import TransformerConfig, TransformerDimNames, TransformerSubLayerKeys +from fast_llm.layers.transformer.config import TransformerConfig, TransformerDimNames from fast_llm.tensor import init_normal_, init_zeros_ from fast_llm.utils import Assert class MLPBase(Layer, ABC): - def __init__(self, config: TransformerConfig, tensor_space: TensorSpace, layer_index: int, name: str = "mlp"): + def __init__(self, config: TransformerConfig, tensor_space: TensorSpace, name: str = "mlp"): super().__init__() self._name = name - self._layer_index = layer_index init_method_1 = init_normal_( std=config.init_method_std_mlp_1, @@ -43,7 +42,7 @@ def __init__(self, config: TransformerConfig, tensor_space: TensorSpace, layer_i self.layer_1 = LinearBase( hidden_dim, tensor_space.get_tensor_dim(TransformerDimNames.composite_gated_expert_mlp), - bias=config.should_add_linear_bias(self._layer_index, TransformerSubLayerKeys.mlp_layer1), + bias=config.add_mlp_bias, weight_init_method=init_method_1, bias_init_method=init_method_1 if config.random_bias_init else init_zeros_, lr_scale=tuple(config.mlp_lr_scale), @@ -51,7 +50,7 @@ def __init__(self, config: TransformerConfig, tensor_space: TensorSpace, layer_i self.layer_2 = LinearBase( self._intermediate_dim, hidden_dim, - bias=config.should_add_linear_bias(self._layer_index, TransformerSubLayerKeys.mlp_layer2), + bias=config.add_mlp_bias, weight_init_method=init_method_2, bias_init_method=init_method_2 if config.random_bias_init else init_zeros_, auto_bias_grad_accumulation=tensor_space.distributed_config.tensor_parallel > 1, @@ -61,9 +60,9 @@ def __init__(self, config: TransformerConfig, 
tensor_space: TensorSpace, layer_i class MLP(MLPBase): - def __init__(self, config: TransformerConfig, tensor_space: TensorSpace, layer_index: int, name: str = "mlp"): + def __init__(self, config: TransformerConfig, tensor_space: TensorSpace, name: str = "mlp"): Assert.eq(config.num_experts, 1) - super().__init__(config, tensor_space, layer_index, name) + super().__init__(config, tensor_space, name) def forward( self, diff --git a/fast_llm/layers/transformer/transformer.py b/fast_llm/layers/transformer/transformer.py index df326c04..4780dd3a 100644 --- a/fast_llm/layers/transformer/transformer.py +++ b/fast_llm/layers/transformer/transformer.py @@ -43,7 +43,7 @@ def __init__( self.self_attn = Attention(self._config, self._tensor_space, layer_index) self.mlp = (MixtureOfExpertMLP if self._config.num_experts > 1 else MLP)( - self._config, self._tensor_space, self._layer_index, f"{self.name} mlp" + self._config, self._tensor_space, f"{self.name} mlp" ) @torch.compile diff --git a/tests/test_config.py b/tests/test_config.py index 2f28f423..7141812a 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -8,11 +8,11 @@ from fast_llm.layers.transformer.config import ( TransformerConfig, TransformerArchitectureConfig, - TransformerSubLayerKeys, + AddLinearBiasChoices, ) -from fast_llm.utils import Assert from fast_llm.engine.distributed.config import DistributedConfig from fast_llm.engine.config_utils.data_type import DataType +from fast_llm.config import ValidationError from fast_llm.models.auto import trainer_registry @@ -90,94 +90,52 @@ def test_do_use_flash_attention(): config.do_use_flash_attention(mock_distributed_config) -@pytest.fixture -def config_with_true_biases(): - """Fixture for TransformerArchitectureConfig with True add_linear_biases.""" - return TransformerArchitectureConfig(add_linear_biases=True) +def test_add_linear_biases_valid_values(): + # Valid boolean values + assert TransformerArchitectureConfig(add_linear_biases=True).add_linear_biases is True + assert TransformerArchitectureConfig(add_linear_biases=False).add_linear_biases is False - -@pytest.fixture -def config_with_false_biases(): - """Fixture for TransformerArchitectureConfig with False add_linear_biases.""" - return TransformerArchitectureConfig(add_linear_biases=False) - - -@pytest.fixture -def config_with_dict_biases(): - """Fixture for TransformerArchitectureConfig with dict add_linear_biases.""" - return TransformerArchitectureConfig( - num_layers = 10, - add_linear_biases={ - "layers.self_attn.query": "*", - "layers.mlp.layer_1": "1:10:3, 9", - "layers.mlp.layer_2": "5:7", - } + # Valid enum values + assert TransformerArchitectureConfig(add_linear_biases="nowhere").add_linear_biases == AddLinearBiasChoices.nowhere + assert ( + TransformerArchitectureConfig(add_linear_biases="everywhere").add_linear_biases + == AddLinearBiasChoices.everywhere + ) + assert ( + TransformerArchitectureConfig(add_linear_biases="only_attn_qkv").add_linear_biases == AddLinearBiasChoices.only_attn_qkv ) -def test_add_linear_biases_bool_true(config_with_true_biases): - """Test case for add_linear_biases set to True (default).""" - assert config_with_true_biases._parsed_add_linear_biases == True - - -def test_add_linear_biases_bool_false(config_with_false_biases): - """Test case for add_linear_biases set to False.""" - assert config_with_false_biases._parsed_add_linear_biases == False - - -def test_add_linear_biases_dict_valid(config_with_dict_biases): - """Test case for add_linear_biases with valid dictionary.""" - assert 
config_with_dict_biases._parsed_add_linear_biases == { - TransformerSubLayerKeys.attn_query: {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, - TransformerSubLayerKeys.mlp_layer1: {2, 5, 8, 10}, - TransformerSubLayerKeys.mlp_layer2: {6, 7}, - } - - -def test_invalid_key_in_dict(): - """Test case where an invalid key is provided in add_linear_biases dictionary.""" - with pytest.raises(AssertionError): - # Using an invalid key in the dictionary. - TransformerArchitectureConfig(add_linear_biases={"invalid_key": "*"}) - - -def test_invalid_range_format(): - """Test case where invalid range format is provided.""" - with pytest.raises(AssertionError): - TransformerArchitectureConfig(add_linear_biases={TransformerSubLayerKeys.mlp_layer1: "1:10:3, abc"}) - - -def test_empty_add_linear_biases(): - """Test case for empty add_linear_biases dictionary.""" - with pytest.raises(AssertionError): # Expecting AssertionError for invalid empty dictionary - TransformerArchitectureConfig(add_linear_biases={}) - - -def test_should_add_linear_bias_for_layer_and_sublayer(config_with_dict_biases): - """Test case for should_add_linear_bias based on layer index and sublayer key.""" - - # Layer 1 and sublayer mlp_layer1 - assert config_with_dict_biases.should_add_linear_bias(1, TransformerSubLayerKeys.mlp_layer1) == False - - # Layer 2 and sublayer mlp_layer1 - assert config_with_dict_biases.should_add_linear_bias(2, TransformerSubLayerKeys.mlp_layer1) == True +def test_add_linear_biases_invalid_values(): + with pytest.raises(ValidationError): + TransformerArchitectureConfig(add_linear_biases="invalid_value") - # Layer 9 and sublayer mlp_layer1 - assert config_with_dict_biases.should_add_linear_bias(9, TransformerSubLayerKeys.mlp_layer1) == False + with pytest.raises(ValidationError): + TransformerArchitectureConfig(add_linear_biases=123) - # Layer 6 and sublayer mlp_layer2 - assert config_with_dict_biases.should_add_linear_bias(6, TransformerSubLayerKeys.mlp_layer2) == True + with pytest.raises(ValidationError): + TransformerArchitectureConfig(add_linear_biases=None) - # Layer 5 and sublayer attn_query - assert config_with_dict_biases.should_add_linear_bias(5, TransformerSubLayerKeys.attn_query) == True +def test_add_mlp_bias(): + assert TransformerArchitectureConfig(add_linear_biases=True).add_mlp_bias is True + assert TransformerArchitectureConfig(add_linear_biases=False).add_mlp_bias is False + assert TransformerArchitectureConfig(add_linear_biases=AddLinearBiasChoices.everywhere).add_mlp_bias is True + assert TransformerArchitectureConfig(add_linear_biases=AddLinearBiasChoices.nowhere).add_mlp_bias is False + assert TransformerArchitectureConfig(add_linear_biases=AddLinearBiasChoices.only_attn_qkv).add_mlp_bias is False -def test_should_add_linear_bias_for_bool_true(config_with_true_biases): - """Test case for add_linear_biases set to True (should always return True).""" - assert config_with_true_biases.should_add_linear_bias(10, TransformerSubLayerKeys.mlp_layer1) == True +def test_add_attn_qkv_bias(): + assert TransformerArchitectureConfig(add_linear_biases=True).add_attn_qkv_bias is True + assert TransformerArchitectureConfig(add_linear_biases=False).add_attn_qkv_bias is False + assert TransformerArchitectureConfig(add_linear_biases=AddLinearBiasChoices.everywhere).add_attn_qkv_bias is True + assert TransformerArchitectureConfig(add_linear_biases=AddLinearBiasChoices.nowhere).add_attn_qkv_bias is False + assert TransformerArchitectureConfig(add_linear_biases=AddLinearBiasChoices.only_attn_qkv).add_attn_qkv_bias is True -def 
test_should_add_linear_bias_for_bool_false(config_with_false_biases): - """Test case for add_linear_biases set to False (should always return False).""" - assert config_with_false_biases.should_add_linear_bias(10, TransformerSubLayerKeys.mlp_layer1) == False +def test_add_attn_dense_bias(): + assert TransformerArchitectureConfig(add_linear_biases=True).add_attn_dense_bias is True + assert TransformerArchitectureConfig(add_linear_biases=False).add_attn_dense_bias is False + assert TransformerArchitectureConfig(add_linear_biases=AddLinearBiasChoices.everywhere).add_attn_dense_bias is True + assert TransformerArchitectureConfig(add_linear_biases=AddLinearBiasChoices.nowhere).add_attn_dense_bias is False + assert TransformerArchitectureConfig(add_linear_biases=AddLinearBiasChoices.only_attn_qkv).add_attn_dense_bias is False diff --git a/tests/test_mlp.py b/tests/test_mlp.py index 4d343ec0..bcfbaf69 100644 --- a/tests/test_mlp.py +++ b/tests/test_mlp.py @@ -15,7 +15,7 @@ def test_mlp_constructor(): tensor_space = TensorSpace(distributed_config=distributed_config) transformer_conf.setup_tensor_space(tensor_space) - MLP(transformer_conf, tensor_space, 1, "name") + MLP(transformer_conf, tensor_space, "name") def test_moe_mlp_constructor(): @@ -30,4 +30,4 @@ def test_moe_mlp_constructor(): tensor_space = TensorSpace(distributed_config=distributed_config) transformer_conf.setup_tensor_space(tensor_space) - MixtureOfExpertMLP(transformer_conf, tensor_space, 1, "name") + MixtureOfExpertMLP(transformer_conf, tensor_space, "name") From b8cf6ae4b143ec2012184eb810e959c51a2d8e0a Mon Sep 17 00:00:00 2001 From: Denis Kocetkov Date: Tue, 25 Feb 2025 17:18:30 +0200 Subject: [PATCH 05/10] clean up and formatting --- fast_llm/layers/transformer/config.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fast_llm/layers/transformer/config.py b/fast_llm/layers/transformer/config.py index 6d9888a0..cf985392 100644 --- a/fast_llm/layers/transformer/config.py +++ b/fast_llm/layers/transformer/config.py @@ -1,8 +1,6 @@ import enum -import itertools import logging import math -import re import typing import warnings @@ -121,12 +119,12 @@ class RotaryArchitectureConfig(BaseModelArchitectureConfig): hint=FieldHint.feature, ) beta_fast: float = Field( - default=32., + default=32.0, desc="Beta-fast for yarn-type scaling.", hint=FieldHint.feature, ) beta_slow: float = Field( - default=1., + default=1.0, desc="Beta-slow for yarn-type scaling.", hint=FieldHint.feature, ) @@ -280,7 +278,7 @@ def add_attn_qkv_bias(self) -> bool: if self.add_linear_biases == AddLinearBiasChoices.nowhere: return False return True - + @property def add_attn_dense_bias(self) -> bool: if isinstance(self.add_linear_biases, bool): @@ -615,8 +613,11 @@ def _validate(self) -> None: Assert.geq(scale, 0) def do_use_flash_attention(self, distributed_config: DistributedConfig) -> bool: - use_flash_attention = self.use_flash_attention and distributed_config.training_dtype in (DataType.float16, DataType.bfloat16) - + use_flash_attention = self.use_flash_attention and distributed_config.training_dtype in ( + DataType.float16, + DataType.bfloat16, + ) + # Config parameter `window_size` only can be used with flash attention if not use_flash_attention: Assert.is_(self.window_size, None) From 4d4e56bc14bddcfd466b72a50d3efbfb68d60ed4 Mon Sep 17 00:00:00 2001 From: Denis Kocetkov Date: Wed, 26 Feb 2025 15:23:55 +0200 Subject: [PATCH 06/10] partial qwen2 converter, non working --- fast_llm/models/gpt/conversion.py | 128 
+++++++++++++++++++++++++----- tests/common.py | 11 ++- 2 files changed, 118 insertions(+), 21 deletions(-) diff --git a/fast_llm/models/gpt/conversion.py b/fast_llm/models/gpt/conversion.py index 9c4a7bb9..b0b32c81 100644 --- a/fast_llm/models/gpt/conversion.py +++ b/fast_llm/models/gpt/conversion.py @@ -23,7 +23,7 @@ from fast_llm.functional.config import ActivationType from fast_llm.functional.rotary import convert_rotary_complex_to_real, convert_rotary_real_to_complex from fast_llm.layers.common.config import NormalizationType -from fast_llm.layers.transformer.config import RotaryEmbeddingType, RoutingType +from fast_llm.layers.transformer.config import RotaryEmbeddingType, RoutingType, AddLinearBiasChoices from fast_llm.models.gpt.config import ( GPTArchitectureConfig, GPTModelConfig, @@ -157,11 +157,37 @@ def _create_config_converters(cls) -> list[ParamConverter]: def _get_mlp_converters(self, fast_llm_prefix: str, hf_prefix: str) -> list[WeightConverter]: pass - def _create_weight_converters(self) -> list[WeightConverter]: + @staticmethod + def _add_mlp_bias(add_linear_biases: bool | AddLinearBiasChoices) -> bool: + if isinstance(add_linear_biases, bool): + return add_linear_biases + if add_linear_biases == AddLinearBiasChoices.everywhere: + return True + return False + + @staticmethod + def _add_attn_qkv_bias(add_linear_biases: bool | AddLinearBiasChoices) -> bool: + if isinstance(add_linear_biases, bool): + return add_linear_biases + if add_linear_biases == AddLinearBiasChoices.nowhere: + return False + return True + + @staticmethod + def _add_attn_dense_bias(add_linear_biases: bool | AddLinearBiasChoices) -> bool: + if isinstance(add_linear_biases, bool): + return add_linear_biases + if add_linear_biases == AddLinearBiasChoices.everywhere: + return True + return False + + def _create_weight_converters( + self, + ) -> list[WeightConverter]: converters = [] num_layers = self._model.config.base_model.transformer.num_layers norm_bias: bool = self._model.config.base_model.transformer.normalization.type == NormalizationType.layer_norm - linear_bias: bool = self._model.config.base_model.transformer.add_linear_biases + linear_bias: bool | AddLinearBiasChoices = self._model.config.base_model.transformer.add_linear_biases # Embedding and output if self._model.config.base_model.tie_word_embeddings: @@ -181,17 +207,19 @@ def _create_weight_converters(self) -> list[WeightConverter]: converters += self._get_weight_and_bias_converters( f"layers.{i+1}.self_attn.query", f"model.layers.{i}.self_attn.q_proj", - linear_bias, + self._add_attn_qkv_bias(linear_bias), QueryWeightConverter, ) converters += self._get_weight_and_bias_converters( f"layers.{i+1}.self_attn.key_value", (f"model.layers.{i}.self_attn.k_proj", f"model.layers.{i}.self_attn.v_proj"), - linear_bias, + self._add_attn_qkv_bias(linear_bias), KeyValueWeightConverter, ) converters += self._get_weight_and_bias_converters( - f"layers.{i+1}.self_attn.dense", f"model.layers.{i}.self_attn.o_proj", linear_bias + f"layers.{i+1}.self_attn.dense", + f"model.layers.{i}.self_attn.o_proj", + self._add_attn_dense_bias(linear_bias), ) # Norm @@ -257,13 +285,16 @@ def _create_config_converters(cls) -> list[ParamConverter]: ] def _get_mlp_converters(self, fast_llm_prefix: str, hf_prefix: str) -> list[WeightConverter]: - linear_bias: bool = self._model.config.base_model.transformer.add_linear_biases + linear_bias: bool | AddLinearBiasChoices = self._model.config.base_model.transformer.add_linear_biases return [ *self._get_weight_and_bias_converters( - 
f"{fast_llm_prefix}.mlp.layer_1", f"{hf_prefix}.mlp.c_fc", linear_bias + f"{fast_llm_prefix}.mlp.layer_1", f"{hf_prefix}.mlp.c_fc", self._add_mlp_bias(linear_bias) ), *self._get_weight_and_bias_converters( - f"{fast_llm_prefix}.mlp.layer_2", f"{hf_prefix}.mlp.c_proj", linear_bias, MLPLayer2Converter + f"{fast_llm_prefix}.mlp.layer_2", + f"{hf_prefix}.mlp.c_proj", + self._add_mlp_bias(linear_bias), + MLPLayer2Converter, ), ] @@ -353,32 +384,59 @@ def _create_config_converters(cls) -> list[ParamConverter]: ] def _get_mlp_converters(self, fast_llm_prefix: str, hf_prefix: str) -> list[WeightConverter]: - linear_bias: bool = self._model.config.base_model.transformer.add_linear_biases + linear_bias: bool | AddLinearBiasChoices = self._model.config.base_model.transformer.add_linear_biases return [ *self._get_weight_and_bias_converters( f"{fast_llm_prefix}.mlp.layer_1", (f"{hf_prefix}.mlp.gate_proj", f"{hf_prefix}.mlp.up_proj"), - linear_bias, + self._add_mlp_bias(linear_bias), SplitWeightConverter, ), *self._get_weight_and_bias_converters( f"{fast_llm_prefix}.mlp.layer_2", f"{hf_prefix}.mlp.down_proj", - linear_bias, + self._add_mlp_bias(linear_bias), MLPLayer2Converter, ), ] - -class Qwen2HuggingfaceCheckpointHandler(CommonLlamaHuggingfaceCheckpointHandler): + + +@dataclasses.dataclass +class Qwen2SlidingWindowParamConverter(ParamConverter): + def __post_init__(self): + Assert.eq(len(self.fast_llm_names), 1) + Assert.eq(len(self.export_names), 2) + + def export_params(self, fast_llm_values: tuple[typing.Any, ...]) -> tuple[typing.Any, ...]: + window_size= fast_llm_values + if window_size is None: + return (False, 4096) # default value in HF Qwen2 config + return (True, window_size) + + def import_params(self, export_values: tuple[typing.Any, ...]) -> tuple[typing.Any, ...]: + use_sliding_window, sliding_window = export_values + if use_sliding_window: + return sliding_window + return None + + +class Qwen2HuggingfaceCheckpointHandler(CommonHuggingfaceCheckpointHandler): format: typing.ClassVar[type[CheckpointFormat]] = Qwen2GPTHuggingfaceCheckpointFormat @classmethod def _create_config_converters(cls) -> list[ParamConverter]: return super()._create_config_converters() + [ ConstantExportParamConverter(export_names=(("architectures",),), export_value=["Qwen2ForCausalLM"]), - # TODO: Llama supports biases - ConstantExportParamConverter(export_names=(("attention_bias",),), export_value=False), - ConstantExportParamConverter(export_names=(("mlp_bias",),), export_value=False), + ConstantImportParamConverter( + fast_llm_names=(("transformer", "normalization", "type"),), fast_llm_value=NormalizationType.rms_norm + ), + RenameParamConverter( + fast_llm_names=(("transformer", "normalization", "epsilon"),), export_names=(("rms_norm_eps",),) + ), + ConstantImportParamConverter(fast_llm_names=(("transformer", "gated"),), fast_llm_value=True), + ConstantImportParamConverter( + fast_llm_names=(("transformer", "add_linear_biases"),), fast_llm_value="only_attn_qkv" + ), RopeScalingParamConverter( fast_llm_names=( ("transformer", "rotary", "type"), @@ -392,24 +450,56 @@ def _create_config_converters(cls) -> list[ParamConverter]: ), export_names=(("rope_scaling",),), ), + Qwen2SlidingWindowParamConverter(fast_llm_names=(("transformer", "window_size"),), export_names=(("use_sliding_window"), ("sliding_window"))), + RenameParamConverter( + fast_llm_names=( + ( + "transformer", + "max_window_layers", + ), + ), + export_names=(("max_window_layers",),), + ), ] def _get_mlp_converters(self, fast_llm_prefix: str, 
hf_prefix: str) -> list[WeightConverter]: + linear_bias: bool | AddLinearBiasChoices = self._model.config.base_model.transformer.add_linear_biases return [ *self._get_weight_and_bias_converters( f"{fast_llm_prefix}.mlp.layer_1", (f"{hf_prefix}.mlp.gate_proj", f"{hf_prefix}.mlp.up_proj"), - False, + self._add_mlp_bias(linear_bias), SplitWeightConverter, ), *self._get_weight_and_bias_converters( f"{fast_llm_prefix}.mlp.layer_2", f"{hf_prefix}.mlp.down_proj", - False, + self._add_mlp_bias(linear_bias), MLPLayer2Converter, ), ] +""" +x vocab_size=151936, +x hidden_size=4096, +x intermediate_size=22016, +x num_hidden_layers=32, +x num_attention_heads=32, +x num_key_value_heads=32, +x hidden_act="silu", +max_position_embeddings=32768, +initializer_range=0.02, +x rms_norm_eps=1e-6, +use_cache=True, +x tie_word_embeddings=False, +x rope_theta=10000.0, +x rope_scaling=None, +x use_sliding_window=False, +x sliding_window=4096, +x max_window_layers=28, +attention_dropout=0.0, +""" + class MistralHuggingfaceCheckpointHandler(CommonLlamaHuggingfaceCheckpointHandler): format: typing.ClassVar[type[CheckpointFormat]] = MistralGPTHuggingfaceCheckpointFormat diff --git a/tests/common.py b/tests/common.py index a151b8b6..d1438dd1 100644 --- a/tests/common.py +++ b/tests/common.py @@ -156,9 +156,16 @@ ] CONFIG_LLAMA3_COMMON = CONFIG_LLAMA3_FAST_LLM + ["model.distributed.training_dtype=bf16"] -# Megatron does not support Llama3-style Rotary Embeddings +# Megatron does not support per sub layer biases CONFIG_QWEN2_MEGATRON = None -CONFIG_QWEN2_FAST_LLM = CONFIG_LLAMA_FAST_LLM +CONFIG_QWEN2_FAST_LLM = CONFIG_SC2_FAST_LLM + [ + "model.base_model.transformer.gated=True", + "model.base_model.transformer.activation_type=silu", + "model.base_model.transformer.add_linear_biases=only_attn_qkv", + "model.base_model.transformer.normalization.type=rms_norm", + "model.base_model.transformer.ffn_hidden_size=1024", + "model.base_model.tie_word_embeddings=False", +] CONFIG_QWEN2_COMMON = CONFIG_QWEN2_FAST_LLM + ["model.distributed.training_dtype=bf16"] CONFIG_MIXTRAL_MEGATRON = CONFIG_LLAMA_MEGATRON + [ From 36c62b4c077cf7bad53fc0d40a91c311ab77200d Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Wed, 26 Feb 2025 16:26:11 +0000 Subject: [PATCH 07/10] merge fix --- tests/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/common.py b/tests/common.py index d5ebfeda..017a3ce0 100644 --- a/tests/common.py +++ b/tests/common.py @@ -221,6 +221,7 @@ CONFIG_QWEN2_MEGATRON, CONFIG_QWEN2_COMMON, Qwen2GPTHuggingfaceCheckpointFormat, + ), "llama-yarn": ( "gpt", CONFIG_LLAMA_YARN_FAST_LLM, From 52a462e9263ff2573d6138e3ad6edf8185e29234 Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Wed, 26 Feb 2025 19:37:43 +0000 Subject: [PATCH 08/10] fix tuple access and return --- fast_llm/models/gpt/conversion.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fast_llm/models/gpt/conversion.py b/fast_llm/models/gpt/conversion.py index 9ad89314..74c713b6 100644 --- a/fast_llm/models/gpt/conversion.py +++ b/fast_llm/models/gpt/conversion.py @@ -408,7 +408,7 @@ def __post_init__(self): Assert.eq(len(self.export_names), 2) def export_params(self, fast_llm_values: tuple[typing.Any, ...]) -> tuple[typing.Any, ...]: - window_size= fast_llm_values + window_size, = fast_llm_values if window_size is None: return (False, 4096) # default value in HF Qwen2 config return (True, window_size) @@ -416,8 +416,8 @@ def export_params(self, fast_llm_values: tuple[typing.Any, ...]) -> tuple[typing def import_params(self, 
export_values: tuple[typing.Any, ...]) -> tuple[typing.Any, ...]: use_sliding_window, sliding_window = export_values if use_sliding_window: - return sliding_window - return None + return sliding_window, + return None, class Qwen2HuggingfaceCheckpointHandler(CommonHuggingfaceCheckpointHandler): From be7a4968be86e8f83d1c1c44effe66fb62c2072e Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Thu, 27 Feb 2025 10:01:56 +0000 Subject: [PATCH 09/10] ignoring sliding window params on import --- fast_llm/models/gpt/conversion.py | 42 +++++++++++++++---------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/fast_llm/models/gpt/conversion.py b/fast_llm/models/gpt/conversion.py index 74c713b6..0a942aee 100644 --- a/fast_llm/models/gpt/conversion.py +++ b/fast_llm/models/gpt/conversion.py @@ -1,10 +1,11 @@ import abc import dataclasses +import logging import typing import torch -from fast_llm.config import DEFAULT +from fast_llm.config import DEFAULT, MISSING from fast_llm.engine.checkpoint.config import CheckpointFormat from fast_llm.engine.checkpoint.external import ( AutoStateDictCheckpointHandler, @@ -40,6 +41,8 @@ if typing.TYPE_CHECKING: pass +logger = logging.getLogger(__name__) + class QueryWeightConverter(WeightConverter): # Hf uses the real format for rotary embeddings. @@ -402,22 +405,25 @@ def _get_mlp_converters(self, fast_llm_prefix: str, hf_prefix: str) -> list[Weig @dataclasses.dataclass -class Qwen2SlidingWindowParamConverter(ParamConverter): +class IgnoreImportQwen2SlidingWindowParamsConverter(ParamConverter): def __post_init__(self): - Assert.eq(len(self.fast_llm_names), 1) - Assert.eq(len(self.export_names), 2) + Assert.eq(len(self.fast_llm_names), 0) + Assert.eq(len(self.export_names), 0) + self.export_names = (("use_sliding_window",), ("sliding_window",), ("max_window_layers",)) def export_params(self, fast_llm_values: tuple[typing.Any, ...]) -> tuple[typing.Any, ...]: - window_size, = fast_llm_values - if window_size is None: - return (False, 4096) # default value in HF Qwen2 config - return (True, window_size) + return (MISSING, MISSING, MISSING) def import_params(self, export_values: tuple[typing.Any, ...]) -> tuple[typing.Any, ...]: - use_sliding_window, sliding_window = export_values - if use_sliding_window: - return sliding_window, - return None, + # Default value for use_sliding_window in Qwen2 HF config is False + if export_values[0] != MISSING and export_values[0] == True: + logger.warning( + f"The configuration parameters `{self.export_names[0]}={export_values[0]}`," + f" `{self.export_names[1]}={export_values[1]}`, `{self.export_names[2]}={export_values[2]}`" + f" are ignored during conversion." + f" If you intend to use them in Fast-LLM, make sure to set them explicitly in the model configuration." 
+ ) + return () class Qwen2HuggingfaceCheckpointHandler(CommonHuggingfaceCheckpointHandler): @@ -450,16 +456,7 @@ def _create_config_converters(cls) -> list[ParamConverter]: ), export_names=(("rope_scaling",),), ), - Qwen2SlidingWindowParamConverter(fast_llm_names=(("transformer", "window_size"),), export_names=(("use_sliding_window"), ("sliding_window"))), - RenameParamConverter( - fast_llm_names=( - ( - "transformer", - "max_window_layers", - ), - ), - export_names=(("max_window_layers",),), - ), + IgnoreImportQwen2SlidingWindowParamsConverter(), ] def _get_mlp_converters(self, fast_llm_prefix: str, hf_prefix: str) -> list[WeightConverter]: @@ -479,6 +476,7 @@ def _get_mlp_converters(self, fast_llm_prefix: str, hf_prefix: str) -> list[Weig ), ] + """ x vocab_size=151936, x hidden_size=4096, From 814b4e6c3c67026a19d50453bda2748e19701608 Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Thu, 27 Feb 2025 10:19:23 +0000 Subject: [PATCH 10/10] use add_biases functionality from TransformerConfig, cleanup --- fast_llm/models/gpt/conversion.py | 73 ++++++------------------------- 1 file changed, 14 insertions(+), 59 deletions(-) diff --git a/fast_llm/models/gpt/conversion.py b/fast_llm/models/gpt/conversion.py index 0a942aee..51c8a3b7 100644 --- a/fast_llm/models/gpt/conversion.py +++ b/fast_llm/models/gpt/conversion.py @@ -24,7 +24,7 @@ from fast_llm.functional.config import ActivationType from fast_llm.functional.rotary import convert_rotary_complex_to_real, convert_rotary_real_to_complex from fast_llm.layers.common.config import NormalizationType -from fast_llm.layers.transformer.config import RotaryEmbeddingType, RoutingType, AddLinearBiasChoices +from fast_llm.layers.transformer.config import RotaryEmbeddingType, RoutingType, TransformerConfig from fast_llm.models.gpt.config import ( GPTArchitectureConfig, GPTModelConfig, @@ -160,29 +160,6 @@ def _create_config_converters(cls) -> list[ParamConverter]: def _get_mlp_converters(self, fast_llm_prefix: str, hf_prefix: str) -> list[WeightConverter]: pass - @staticmethod - def _add_mlp_bias(add_linear_biases: bool | AddLinearBiasChoices) -> bool: - if isinstance(add_linear_biases, bool): - return add_linear_biases - if add_linear_biases == AddLinearBiasChoices.everywhere: - return True - return False - - @staticmethod - def _add_attn_qkv_bias(add_linear_biases: bool | AddLinearBiasChoices) -> bool: - if isinstance(add_linear_biases, bool): - return add_linear_biases - if add_linear_biases == AddLinearBiasChoices.nowhere: - return False - return True - - @staticmethod - def _add_attn_dense_bias(add_linear_biases: bool | AddLinearBiasChoices) -> bool: - if isinstance(add_linear_biases, bool): - return add_linear_biases - if add_linear_biases == AddLinearBiasChoices.everywhere: - return True - return False def _create_weight_converters( self, @@ -190,7 +167,7 @@ def _create_weight_converters( converters = [] num_layers = self._model.config.base_model.transformer.num_layers norm_bias: bool = self._model.config.base_model.transformer.normalization.type == NormalizationType.layer_norm - linear_bias: bool | AddLinearBiasChoices = self._model.config.base_model.transformer.add_linear_biases + transformer_config: TransformerConfig = self._model.config.base_model.transformer # Embedding and output if self._model.config.base_model.tie_word_embeddings: @@ -210,19 +187,19 @@ def _create_weight_converters( converters += self._get_weight_and_bias_converters( f"layers.{i+1}.self_attn.query", f"model.layers.{i}.self_attn.q_proj", - 
self._add_attn_qkv_bias(linear_bias), + transformer_config.add_attn_qkv_bias, QueryWeightConverter, ) converters += self._get_weight_and_bias_converters( f"layers.{i+1}.self_attn.key_value", (f"model.layers.{i}.self_attn.k_proj", f"model.layers.{i}.self_attn.v_proj"), - self._add_attn_qkv_bias(linear_bias), + transformer_config.add_attn_qkv_bias, KeyValueWeightConverter, ) converters += self._get_weight_and_bias_converters( f"layers.{i+1}.self_attn.dense", f"model.layers.{i}.self_attn.o_proj", - self._add_attn_dense_bias(linear_bias), + transformer_config.add_attn_dense_bias, ) # Norm @@ -288,15 +265,15 @@ def _create_config_converters(cls) -> list[ParamConverter]: ] def _get_mlp_converters(self, fast_llm_prefix: str, hf_prefix: str) -> list[WeightConverter]: - linear_bias: bool | AddLinearBiasChoices = self._model.config.base_model.transformer.add_linear_biases + transformer_config: TransformerConfig = self._model.config.base_model.transformer return [ *self._get_weight_and_bias_converters( - f"{fast_llm_prefix}.mlp.layer_1", f"{hf_prefix}.mlp.c_fc", self._add_mlp_bias(linear_bias) + f"{fast_llm_prefix}.mlp.layer_1", f"{hf_prefix}.mlp.c_fc", transformer_config.add_mlp_bias ), *self._get_weight_and_bias_converters( f"{fast_llm_prefix}.mlp.layer_2", f"{hf_prefix}.mlp.c_proj", - self._add_mlp_bias(linear_bias), + transformer_config.add_mlp_bias, MLPLayer2Converter, ), ] @@ -387,18 +364,18 @@ def _create_config_converters(cls) -> list[ParamConverter]: ] def _get_mlp_converters(self, fast_llm_prefix: str, hf_prefix: str) -> list[WeightConverter]: - linear_bias: bool | AddLinearBiasChoices = self._model.config.base_model.transformer.add_linear_biases + transformer_config: TransformerConfig = self._model.config.base_model.transformer return [ *self._get_weight_and_bias_converters( f"{fast_llm_prefix}.mlp.layer_1", (f"{hf_prefix}.mlp.gate_proj", f"{hf_prefix}.mlp.up_proj"), - self._add_mlp_bias(linear_bias), + transformer_config.add_mlp_bias, SplitWeightConverter, ), *self._get_weight_and_bias_converters( f"{fast_llm_prefix}.mlp.layer_2", f"{hf_prefix}.mlp.down_proj", - self._add_mlp_bias(linear_bias), + transformer_config.add_mlp_bias, MLPLayer2Converter, ), ] @@ -460,45 +437,23 @@ def _create_config_converters(cls) -> list[ParamConverter]: ] def _get_mlp_converters(self, fast_llm_prefix: str, hf_prefix: str) -> list[WeightConverter]: - linear_bias: bool | AddLinearBiasChoices = self._model.config.base_model.transformer.add_linear_biases + transformer_config: TransformerConfig = self._model.config.base_model.transformer return [ *self._get_weight_and_bias_converters( f"{fast_llm_prefix}.mlp.layer_1", (f"{hf_prefix}.mlp.gate_proj", f"{hf_prefix}.mlp.up_proj"), - self._add_mlp_bias(linear_bias), + transformer_config.add_mlp_bias, SplitWeightConverter, ), *self._get_weight_and_bias_converters( f"{fast_llm_prefix}.mlp.layer_2", f"{hf_prefix}.mlp.down_proj", - self._add_mlp_bias(linear_bias), + transformer_config.add_mlp_bias, MLPLayer2Converter, ), ] -""" -x vocab_size=151936, -x hidden_size=4096, -x intermediate_size=22016, -x num_hidden_layers=32, -x num_attention_heads=32, -x num_key_value_heads=32, -x hidden_act="silu", -max_position_embeddings=32768, -initializer_range=0.02, -x rms_norm_eps=1e-6, -use_cache=True, -x tie_word_embeddings=False, -x rope_theta=10000.0, -x rope_scaling=None, -x use_sliding_window=False, -x sliding_window=4096, -x max_window_layers=28, -attention_dropout=0.0, -""" - - class MistralHuggingfaceCheckpointHandler(CommonLlamaHuggingfaceCheckpointHandler): format: 
typing.ClassVar[type[CheckpointFormat]] = MistralGPTHuggingfaceCheckpointFormat
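
The converters in patch 10 rely on `add_attn_qkv_bias`, `add_attn_dense_bias` and `add_mlp_bias` being exposed on `TransformerConfig`; their implementation is not part of this series. The sketch below is only an assumption about that behaviour, mirroring the static helpers removed above: the `AddLinearBiasChoices` member names come from the earlier diff and from the `only_attn_qkv` value used in tests/common.py, while the string values and the property form are guesses.

import enum


class AddLinearBiasChoices(str, enum.Enum):
    # Member names taken from the diff and tests; string values are assumed.
    nowhere = "nowhere"
    everywhere = "everywhere"
    only_attn_qkv = "only_attn_qkv"


class TransformerConfigBiasSketch:
    """Hypothetical stand-in for the bias-related part of TransformerConfig."""

    add_linear_biases: bool | AddLinearBiasChoices = True

    @property
    def add_attn_qkv_bias(self) -> bool:
        # Query/key_value projections get biases unless biases are disabled everywhere.
        if isinstance(self.add_linear_biases, bool):
            return self.add_linear_biases
        return self.add_linear_biases != AddLinearBiasChoices.nowhere

    @property
    def add_attn_dense_bias(self) -> bool:
        # The attention output projection only gets a bias when biases are enabled everywhere.
        if isinstance(self.add_linear_biases, bool):
            return self.add_linear_biases
        return self.add_linear_biases == AddLinearBiasChoices.everywhere

    @property
    def add_mlp_bias(self) -> bool:
        # MLP layers only get biases when biases are enabled everywhere.
        if isinstance(self.add_linear_biases, bool):
            return self.add_linear_biases
        return self.add_linear_biases == AddLinearBiasChoices.everywhere

Under this assumption, setting `model.base_model.transformer.add_linear_biases=only_attn_qkv`, as CONFIG_QWEN2_FAST_LLM does, enables biases on the query and key_value projections only, which matches the Qwen2 checkpoint layout the converters above expect.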