ServiceNow · oleksost · Apr 24, 2025 · Feb 18, 2025 · Mar 13, 2025 · Mar 14, 2025
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -29,7 +29,7 @@ jobs:
         run: |
           pip install "torch>=2.2.2"
           pip install pybind11
-          FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE pip install --no-build-isolation -e ".[CORE,OPTIONAL,DEV,DOCS]"
+          FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE pip install --no-build-isolation -e ".[CORE,OPTIONAL,DEV,DOCS]"
 
       - name: Run tests
         run: pytest .

diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
@@ -31,7 +31,7 @@ jobs:
       - run: |
           pip install "torch>=2.2.2"
           pip install pybind11
-          FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE pip install --no-build-isolation -e ".[CORE,OPTIONAL,DEV,DOCS]"
+          FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE pip install --no-build-isolation -e ".[CORE,OPTIONAL,DEV,DOCS]"
       - name: Build the documentation
         run: mkdocs build
 

diff --git a/fast_llm/functional/config.py b/fast_llm/functional/config.py
@@ -43,6 +43,7 @@ class ActivationType(str, enum.Enum):
     silu = "silu"
     relu = "relu"
     squared_relu = "squared_relu"
+    identity = "identity"
 
     @property
     def activation_fn(self) -> typing.Callable[["torch.Tensor"], "torch.Tensor"]:
@@ -70,6 +71,7 @@ def _set_activation_fn_map() -> None:
         ActivationType.silu: torch.nn.functional.silu,
         ActivationType.relu: torch.nn.functional.relu,
         ActivationType.squared_relu: lambda x: torch.pow(torch.nn.functional.relu(x), 2),
+        ActivationType.identity: lambda x: x,
     }
 
 
@@ -80,6 +82,7 @@ def _set_activation_fn_map() -> None:
     ActivationType.silu: "silu",
     ActivationType.relu: "relu",
     ActivationType.squared_relu: "relu2",
+    ActivationType.identity: "identity",
 }
 _ACTIVATION_HF_NAMES_INV = {value: key for key, value in _ACTIVATION_HF_NAMES.items()}
 

diff --git a/fast_llm/functional/triton/mlp.py b/fast_llm/functional/triton/mlp.py
@@ -119,6 +119,10 @@ def triton_mlp_activation_backward_kernel(
         grad = 2 * relu_out
         if gated or recompute:
             out = relu_out * relu_out
+    elif activation_type == _TritonActivationType.identity:
+        grad = 1
+        if gated or recompute:
+            out = input_
     else:
         raise NotImplementedError()
 

diff --git a/fast_llm/layers/language_model/config.py b/fast_llm/layers/language_model/config.py
@@ -5,6 +5,7 @@
 from fast_llm.engine.config_utils.tensor_space import TensorDim, TensorSpace
 from fast_llm.engine.distributed.config import DistributedDimNames
 from fast_llm.functional.config import CrossEntropyImpl
+from fast_llm.layers.ssm.config import SSMArchitectureConfig, SSMConfig
 from fast_llm.layers.transformer.config import TransformerArchitectureConfig, TransformerConfig
 from fast_llm.utils import Assert
 
@@ -43,6 +44,13 @@ class LanguageModelArchitectureConfig(BaseModelArchitectureConfig):
         desc="Configuration for the transformer architecture.",
         hint=FieldHint.core,
     )
+
+    ssm: SSMArchitectureConfig = Field(
+        default_factory=SSMArchitectureConfig,
+        desc="Configuration for the SSM architecture.",
+        hint=FieldHint.core,
+    )
+
     max_position_embeddings: int = Field(
         default=2048,
         desc="Number of absolute position embeddings, if applicable.",
@@ -125,6 +133,8 @@ class LanguageModelBaseConfig(LanguageModelArchitectureConfig, BaseModelConfig):
     architecture_class = LanguageModelArchitectureConfig
 
     transformer: TransformerConfig = FieldUpdate(default_factory=TransformerConfig)
+    ssm: SSMConfig = FieldUpdate(default_factory=SSMConfig)
+
     init_method_std_embed: float = Field(
         default=None,
         desc="Initialization scale for the vocabulary embedding and output weights (logits).",

diff --git a/fast_llm/layers/ssm/config.py b/fast_llm/layers/ssm/config.py
@@ -0,0 +1,135 @@
+from fast_llm.config import Field, FieldHint, FieldUpdate, check_field, config_class
+from fast_llm.engine.base_model.config import BaseModelArchitectureConfig, BaseModelConfig
+from fast_llm.functional.config import ActivationType
+from fast_llm.layers.common.config import NormalizationArchitectureConfig, NormalizationConfig
+from fast_llm.utils import Assert
+
+
+class SSMDimNames:  # Namespace of string constants naming the SSM-related dimensions
+    model_dim = "model_dim"  # Model dimension (D)
+    state_dim = "state_dim"  # State dimension (N)
+    conv_dim = "conv_dim"  # Dimension of the conv1d input in mamba layers
+    inner_dim = "inner_dim"  # Inner dimension after expansion
+    dt_rank = "dt_rank"  # Rank of the Δ projection matrix
+    inner_proj_mamba = "inner_proj_mamba"  # Inner projection dimension for mamba
+    inner_proj_mamba2 = "inner_proj_mamba2"  # Inner projection dimension for mamba2
+    x_proj_dim = "x_proj_dim"  # X projection dimension
+    head_dim = "head_dim"  # Dimension of the mamba2 head (P)
+    conv_kernel_size = "conv_kernel_size"  # Kernel size of the conv1d in mamba layers
+    qk_heads = "qk_heads"  # Number of QK heads
+    v_heads = "v_heads"  # Number of V heads
+
+
+@config_class()
+class SSMArchitectureConfig(BaseModelArchitectureConfig):
+    _abstract = False
+
+    # Normalization
+    normalization: NormalizationArchitectureConfig = Field(
+        default_factory=NormalizationArchitectureConfig,
+        desc="Configuration for the normalization layers architecture.",
+        hint=FieldHint.core,
+    )
+
+    expansion_factor: int = Field(
+        default=2, desc="Expansion factor for Mamba blocks.", hint=FieldHint.core, valid=check_field(Assert.gt, 0)
+    )
+
+    state_size: int = Field(
+        default=16,
+        desc="State size for Mamba blocks.",
+        hint=FieldHint.core,
+        valid=check_field(Assert.gt, 0),
+    )
+    conv_kernel_dimension: int = Field(
+        default=4,
+        desc="Conv kernel dimension for Mamba blocks.",
+        hint=FieldHint.core,
+        valid=check_field(Assert.gt, 0),
+    )
+
+    # Layer parameters
+    add_bias_linear: bool = Field(
+        default=False,
+        desc="Whether to use bias in SSM layers",
+        hint=FieldHint.core,
+    )
+
+    dt_rank: int = Field(
+        default=None,
+        desc="Rank of the Δ projection matrix. If 'None', will be set to ceil(hidden_size/16)",
+        hint=FieldHint.core,
+    )
+
+    chunk_size: int = Field(
+        default=256,
+        desc="Chunk size for Mamba2 blocks.",
+        hint=FieldHint.core,
+    )
+
+    n_qk_heads: int = Field(
+        default=32,
+        desc="Number of QK heads for Mamba2 blocks.",
+        hint=FieldHint.core,
+    )
+
+    n_v_heads: int = Field(
+        default=32,
+        desc="Number of V heads for Mamba2 blocks.",
+        hint=FieldHint.core,
+    )
+
+    activation_type: ActivationType = Field(
+        default=None,
+        desc="The activation type for SSM layers. Default: SiLU.",
+        hint=FieldHint.core,
+    )
+
+    def _validate(self) -> None:
+        with self._set_implicit_default():
+            if self.activation_type is None:
+                self.activation_type = ActivationType.silu
+            if self.dt_rank is None:
+                self.dt_rank = -1  # placeholder: set to -1 here, it will be overwritten in ssm validation
+
+        super()._validate()
+
+
+@config_class()
+class SSMConfig(SSMArchitectureConfig, BaseModelConfig):
+    """Configuration for a Structured State Space Model (SSM) layer."""
+
+    normalization: NormalizationConfig = FieldUpdate(default_factory=NormalizationConfig)
+
+    debug_ssm: bool = Field(
+        default=False,
+        desc="debug_ssm",
+        hint=FieldHint.optional,
+    )
+
+    dt_min: float = Field(
+        default=0.001,
+        desc="Minimum step size for discretization",
+        hint=FieldHint.core,
+        valid=check_field(Assert.gt, 0),
+    )
+
+    dt_max: float = Field(
+        default=0.1,
+        desc="Maximum step size for discretization",
+        hint=FieldHint.core,
+        valid=check_field(Assert.gt, 0),
+    )
+
+    dt_init_floor: float = Field(
+        default=1e-4,
+        desc="Minimum value for initializing dt",
+        hint=FieldHint.core,
+        valid=check_field(Assert.gt, 0),
+    )
+
+    def _validate(self) -> None:
+        """Run the inherited validation, then check `dt_max` against `dt_min` via `Assert.geq`."""
+
+        super()._validate()
+        Assert.geq(self.dt_max, self.dt_min)