
Make float32_qk_product and float32_logits apply during inference #1225

Merged: 3 commits, Feb 11, 2025
Changes from 2 commits
4 changes: 4 additions & 0 deletions MaxText/configs/base.yml
@@ -192,6 +192,10 @@ final_logits_soft_cap: 0.0
use_post_attn_norm: False
use_post_ffw_norm: False

+# In dot_product attention, whether to upcast the qk product and attention logits to fp32
Collaborator: nit: Can we move these next to similar options in line 123?

Contributor Author: Will do

+float32_qk_product: False
+float32_logits: False


# Combine matmuls for QKV and MLP
fused_qkv: False
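For reference, a minimal sketch of what the two new flags control inside dot-product attention (illustrative shapes and helper names, not the MaxText implementation — the real logic lives in MaxText/layers/attentions.py):

```python
import jax
import jax.numpy as jnp

def toy_dot_product_attention(query, key, value,
                              float32_qk_product=False, float32_logits=False):
  """Toy attention showing where the two upcasts happen (mask/dropout omitted)."""
  if float32_qk_product:
    # Upcast q/k so the qk product is produced in fp32.
    query = query.astype(jnp.float32)
    key = key.astype(jnp.float32)
  attn_weights = jnp.einsum("bthd,bshd->bhts", query, key)

  if float32_logits:
    # Upcast the attention logits so the softmax runs in fp32.
    attn_weights = attn_weights.astype(jnp.float32)
  probs = jax.nn.softmax(attn_weights, axis=-1)
  return jnp.einsum("bhts,bshd->bthd", probs, value)

b, t, h, d = 1, 8, 2, 4
q = k = v = jnp.ones((b, t, h, d), jnp.bfloat16)
out = toy_dot_product_attention(q, k, v, float32_qk_product=True, float32_logits=True)
```

Before this PR, apply_attention_dot only honored these flags when model_mode was MODEL_MODE_TRAIN; the attentions.py change later in this diff makes them apply during inference as well.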
4 changes: 3 additions & 1 deletion MaxText/configs/models/gemma-2b.yml
Collaborator: I'd prefer that the quantization settings are not part of the /models configs - the precision isn't a property of the model; the user can still run the Gemma model with different precision settings.

Contributor Author: Makes sense, I'll remove these from the model configs.

@@ -24,4 +24,6 @@ mlp_activations: ["gelu","linear"]
vocab_size: 256128
decoder_block: "gemma"
normalization_layer_epsilon: 1.e-06
-logits_via_embedding: True
+logits_via_embedding: True
+float32_qk_product: True
+float32_qk_logits: True
4 changes: 3 additions & 1 deletion MaxText/configs/models/gemma-7b.yml
@@ -24,4 +24,6 @@ mlp_activations: ["gelu","linear"]
vocab_size: 256128
decoder_block: "gemma"
normalization_layer_epsilon: 1.e-06
-logits_via_embedding: True
+logits_via_embedding: True
+float32_qk_product: True
+float32_qk_logits: True
2 changes: 2 additions & 0 deletions MaxText/configs/models/gemma2-27b.yml
@@ -30,3 +30,5 @@ attn_logits_soft_cap: 50.0
sliding_window_size: 4096
use_post_attn_norm: True
use_post_ffw_norm: True
+float32_qk_product: True
+float32_qk_logits: True
2 changes: 2 additions & 0 deletions MaxText/configs/models/gemma2-2b.yml
@@ -30,3 +30,5 @@ attn_logits_soft_cap: 50.0
sliding_window_size: 4096
use_post_attn_norm: True
use_post_ffw_norm: True
+float32_qk_product: True
+float32_qk_logits: True
2 changes: 2 additions & 0 deletions MaxText/configs/models/gemma2-9b.yml
@@ -30,3 +30,5 @@ attn_logits_soft_cap: 50.0
sliding_window_size: 4096
use_post_attn_norm: True
use_post_ffw_norm: True
+float32_qk_product: True
+float32_qk_logits: True
4 changes: 2 additions & 2 deletions MaxText/layers/attentions.py
@@ -477,7 +477,7 @@ def apply_attention_dot(
"""Apply Attention."""
validate_compute_axis_order(self.compute_axis_order)
# Casting qk_product and softmaxt computation for float32 for model stability.
-if model_mode == common_types.MODEL_MODE_TRAIN and self.float32_qk_product:
+if self.float32_qk_product:
Collaborator: I think you have to set precision as well for float32 to actually take effect (https://jax.readthedocs.io/en/latest/jax.lax.html#jax.lax.Precision) - we have this option in MaxText:

matmul_precision: "default"
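(Illustrative aside, not part of the PR: the snippet below shows the knob the comment above is referring to — the precision argument of a matmul/einsum, which raises the flop precision itself regardless of the input dtypes. Shapes here are made up for the example.)

```python
import jax
import jax.numpy as jnp

q = jnp.ones((1, 8, 2, 4), jnp.bfloat16)
k = jnp.ones((1, 8, 2, 4), jnp.bfloat16)

# Default precision: on TPU the multiply may run with bf16 flops even if
# one or both inputs are fp32.
qk_default = jnp.einsum("bthd,bshd->bhts", q, k,
                        precision=jax.lax.Precision.DEFAULT)

# Highest precision: ask XLA for full-precision flops.
qk_highest = jnp.einsum("bthd,bshd->bhts", q, k,
                        precision=jax.lax.Precision.HIGHEST)
```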

Contributor Author:

That's a good point which I hadn't considered. I suppose there are two moving parts here: are the flops performed in bf16 or fp32, and are the results accumulated in bf16 or fp32. I think these are generally controlled with the precision and preferred_element_type arguments, respectively.

It appears that on default precision the flops happen in bf16 even when one or more of the inputs are in fp32. However, the accumulation can still happen in fp32, and that seems to have been enough to solve our particular problem. In particular, the compiler seems to recognize that even though the Python says to upcast to fp32, it can elide that cast because it is going to do the computation in bf16 anyway; however, it still outputs fp32.

This is the qk product with float32_qk_product=False:

[screenshot: compiled graph of the qk product]

And this is with float32_qk_product=True (note the output type is now f32):

[screenshot: compiled graph of the qk product, f32 output]

I'm not 100% confident in my interpretation of those graphs, but this would explain why it takes longer even without changing the precision parameter.

Separately, it looks like matmul_precision consistently gets routed into DenseGeneral usages, but not into the raw einsums used in qk_product and wv_product. When I change matmul_precision in the config it does not affect the runtime of those operations, but if I add it explicitly to the einsums then the wv_product does take longer, which makes sense. Is that something we should fix just by adding those arguments to the einsums?
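(Editorial sketch of the "two moving parts" described above — illustrative shapes only, not a MaxText patch: preferred_element_type requests fp32 accumulation/output while the inputs stay bf16, whereas upcasting the inputs or raising precision changes what the flops themselves operate on.)

```python
import jax
import jax.numpy as jnp

q = jnp.ones((1, 8, 2, 4), jnp.bfloat16)
k = jnp.ones((1, 8, 2, 4), jnp.bfloat16)

# bf16 inputs, with fp32 accumulation and fp32 output requested from XLA:
qk_accum_fp32 = jnp.einsum("bthd,bshd->bhts", q, k,
                           preferred_element_type=jnp.float32)

# Upcasting the inputs: at default precision the compiler may still use bf16
# flops, but the result comes out in fp32 (the behaviour described above).
qk_upcast = jnp.einsum("bthd,bshd->bhts",
                       q.astype(jnp.float32), k.astype(jnp.float32))

# Threading a precision argument into the raw einsum, as the question suggests:
qk_full = jnp.einsum("bthd,bshd->bhts", q, k,
                     precision=jax.lax.Precision.HIGHEST,
                     preferred_element_type=jnp.float32)
```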

if isinstance(key, KVTensor):
key = key.dequant()
query = query.astype(jnp.float32)
@@ -491,7 +491,7 @@ def apply_attention_dot(
attn_weights = attn_weights * self.attn_logits_soft_cap

# Casting softmaxt computation for float32 for model stability.
-if model_mode == common_types.MODEL_MODE_TRAIN and self.float32_logits:
+if self.float32_logits:
attn_weights = attn_weights.astype(jnp.float32)
attn_mask = self.generate_attention_mask(query, key, decoder_segment_ids, model_mode)
if attn_mask is not None:
4 changes: 2 additions & 2 deletions MaxText/layers/gemma.py
@@ -91,8 +91,8 @@ def __call__(
weight_dtype=cfg.weight_dtype,
dropout_rate=cfg.dropout_rate,
name="self_attention",
-float32_qk_product=True,
-float32_logits=True,
+float32_qk_product=cfg.float32_qk_product,
+float32_logits=cfg.float32_logits,
quant=self.quant,
kv_quant=quantizations.configure_kv_quant(cfg),
use_ragged_attention=cfg.use_ragged_attention,
4 changes: 2 additions & 2 deletions MaxText/layers/gemma2.py
@@ -91,8 +91,8 @@ def __call__(
weight_dtype=cfg.weight_dtype,
dropout_rate=cfg.dropout_rate,
name="self_attention_local",
-float32_qk_product=True,
-float32_logits=True,
+float32_qk_product=cfg.float32_qk_product,
+float32_logits=cfg.float32_logits,
quant=self.quant,
kv_quant=quantizations.configure_kv_quant(cfg),
attention_type=attentions.AttentionType.LOCAL_SLIDING,
2 changes: 2 additions & 0 deletions MaxText/layers/gpt3.py
@@ -312,6 +312,8 @@ def __call__(
mesh=mesh,
dropout_rate=cfg.dropout_rate,
name="self_attention",
+float32_qk_product=cfg.float32_qk_product,
+float32_logits=cfg.float32_logits,
fused_qkv=cfg.fused_qkv,
use_bias=True,
quant=self.quant,
2 changes: 2 additions & 0 deletions MaxText/layers/llama2.py
@@ -105,6 +105,8 @@ def __call__(
weight_dtype=cfg.weight_dtype,
dropout_rate=cfg.dropout_rate,
name="self_attention",
+float32_qk_product=cfg.float32_qk_product,
+float32_logits=cfg.float32_logits,
quant=self.quant,
kv_quant=quantizations.configure_kv_quant(cfg),
prefill_cache_axis_order=tuple([int(i) for i in cfg.prefill_cache_axis_order.split(",")]),
2 changes: 2 additions & 0 deletions MaxText/layers/mistral.py
@@ -97,6 +97,8 @@ def __call__(
weight_dtype=cfg.weight_dtype,
dropout_rate=cfg.dropout_rate,
name="self_attention",
+float32_qk_product=cfg.float32_qk_product,
+float32_logits=cfg.float32_logits,
quant=self.quant,
kv_quant=quantizations.configure_kv_quant(cfg),
)
2 changes: 2 additions & 0 deletions MaxText/layers/models.py
@@ -92,6 +92,8 @@ def __call__(
weight_dtype=cfg.weight_dtype,
dropout_rate=cfg.dropout_rate,
name="self_attention",
+float32_qk_product=cfg.float32_qk_product,
+float32_logits=cfg.float32_logits,
quant=self.quant,
kv_quant=quantizations.configure_kv_quant(cfg),
prefill_cache_axis_order=tuple([int(i) for i in cfg.prefill_cache_axis_order.split(",")]),