Commit f82b896 (1 parent: a7d4fd1)

x

Signed-off-by: Jonathan Mitchell <jomitchell@nvidia.com>

6 files changed: 2 additions & 101 deletions

bionemo-recipes/models/llama3/modeling_llama_te.py

Lines changed: 0 additions & 13 deletions
@@ -18,11 +18,7 @@
 import warnings
 from collections import OrderedDict
 from contextlib import nullcontext
-<<<<<<< HEAD
 from typing import ClassVar, ContextManager, Unpack
-=======
-from typing import ClassVar, Unpack
->>>>>>> 4067915d (adds llama3 MXFP8 NVFP4)
 
 import torch
 import torch.nn as nn
@@ -346,14 +342,6 @@ def forward(
         if te_rope_emb.dtype != torch.float32:
             warnings.warn("Rotary embeddings should be in float32 for optimal performance.", UserWarning)
 
-<<<<<<< HEAD
-        with self.get_autocast_context(None, outer=True):
-            for layer_idx, decoder_layer in enumerate(self.layers[: self.config.num_hidden_layers]):
-                if output_hidden_states:
-                    all_hidden_states = (*all_hidden_states, hidden_states)
-
-                with self.get_autocast_context(layer_idx):
-=======
         # Outer FP8 autocast enables FP8 compute for the decoder stack. Per-layer overrides (FP4, BF16) are handled
         # by get_layer_autocast(), which nests inside this context.
         with transformer_engine.pytorch.autocast(enabled=self._fp8_recipe is not None, recipe=self._fp8_recipe):
@@ -362,7 +350,6 @@ def forward(
                 all_hidden_states = (*all_hidden_states, hidden_states)
 
                 with self.get_layer_autocast(layer_number):
->>>>>>> 4067915d (adds llama3 MXFP8 NVFP4)
                     hidden_states = decoder_layer(
                         hidden_states,
                         attention_mask=None if self.config.attn_input_format == "thd" else attention_mask,

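The kept comment describes the quantization nesting: an outer FP8 autocast wraps the decoder stack, and a per-layer context can override it. Below is a minimal sketch of that pattern, not the recipe's actual code; `te.autocast(enabled=..., recipe=...)` follows the call in the hunk above, while `get_layer_autocast` and `layer_overrides` are simplified stand-ins.

```python
# Sketch of the nested-autocast pattern described in the diff comment.
# Assumes a TransformerEngine version whose `autocast` takes `enabled` and
# `recipe` (matching the hunk above), and a CUDA GPU with FP8 support.
from contextlib import nullcontext

import torch
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling, Format

fp8_recipe = DelayedScaling(fp8_format=Format.HYBRID)
layer_overrides = {}  # illustrative: layer_number -> alternate recipe (e.g. NVFP4)

def get_layer_autocast(recipe):
    # Per-layer override nests inside the outer context; None falls through.
    return te.autocast(enabled=True, recipe=recipe) if recipe is not None else nullcontext()

layers = [te.Linear(256, 256, params_dtype=torch.bfloat16, device="cuda") for _ in range(4)]
hidden_states = torch.randn(32, 256, device="cuda", dtype=torch.bfloat16)

# Outer context enables FP8 compute for the whole stack.
with te.autocast(enabled=fp8_recipe is not None, recipe=fp8_recipe):
    for layer_number, decoder_layer in enumerate(layers):
        with get_layer_autocast(layer_overrides.get(layer_number)):
            hidden_states = decoder_layer(hidden_states)
```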
bionemo-recipes/recipes/llama3_native_te/hydra_config/defaults.yaml

Lines changed: 2 additions & 6 deletions
@@ -41,12 +41,8 @@ fp8_config:
   fp8_recipe: transformer_engine.common.recipe.DelayedScaling
   fp8_format: "HYBRID"
   fp8_recipe_kwargs: {}
-
-fp4_config:
-  enabled: false
-  fp4_recipe: transformer_engine.common.recipe.NVFP4BlockScaling
-  fp4_format: "E2M1"
-  fp4_recipe_kwargs: {}
+  quantized_model_init_kwargs:
+    enabled: false # If this is set to true, fp8_config.enabled must also be set to true.
 
 fp4_config:
   enabled: false

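At runtime, the `fp8_recipe` dotted path and `fp8_format` string in this config resolve to a TransformerEngine recipe object, following the construction pattern visible in the deleted `train_*.py` lines below. A self-contained sketch, with the config values copied from this file:

```python
# How fp8_config resolves to a recipe object; the construction pattern is
# taken from the deleted train_*.py lines in this commit.
import hydra
from omegaconf import OmegaConf
from transformer_engine.common.recipe import Format

args = OmegaConf.create({
    "fp8_config": {
        "enabled": True,
        "fp8_recipe": "transformer_engine.common.recipe.DelayedScaling",
        "fp8_format": "HYBRID",
        "fp8_recipe_kwargs": {},
    }
})

fp8_recipe = None
if args.fp8_config.enabled:
    # get_class resolves the dotted path; Format["HYBRID"] is an enum lookup by name.
    fp8_recipe = hydra.utils.get_class(args.fp8_config.fp8_recipe)(
        fp8_format=Format[args.fp8_config.fp8_format], **args.fp8_config.fp8_recipe_kwargs
    )
print(fp8_recipe)
```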
bionemo-recipes/recipes/llama3_native_te/modeling_llama_te.py

Lines changed: 0 additions & 13 deletions
@@ -18,11 +18,7 @@
 import warnings
 from collections import OrderedDict
 from contextlib import nullcontext
-<<<<<<< HEAD
 from typing import ClassVar, ContextManager, Unpack
-=======
-from typing import ClassVar, Unpack
->>>>>>> 4067915d (adds llama3 MXFP8 NVFP4)
 
 import torch
 import torch.nn as nn
@@ -346,14 +342,6 @@ def forward(
         if te_rope_emb.dtype != torch.float32:
             warnings.warn("Rotary embeddings should be in float32 for optimal performance.", UserWarning)
 
-<<<<<<< HEAD
-        with self.get_autocast_context(None, outer=True):
-            for layer_idx, decoder_layer in enumerate(self.layers[: self.config.num_hidden_layers]):
-                if output_hidden_states:
-                    all_hidden_states = (*all_hidden_states, hidden_states)
-
-                with self.get_autocast_context(layer_idx):
-=======
         # Outer FP8 autocast enables FP8 compute for the decoder stack. Per-layer overrides (FP4, BF16) are handled
         # by get_layer_autocast(), which nests inside this context.
         with transformer_engine.pytorch.autocast(enabled=self._fp8_recipe is not None, recipe=self._fp8_recipe):
@@ -362,7 +350,6 @@ def forward(
                 all_hidden_states = (*all_hidden_states, hidden_states)
 
                 with self.get_layer_autocast(layer_number):
->>>>>>> 4067915d (adds llama3 MXFP8 NVFP4)
                     hidden_states = decoder_layer(
                         hidden_states,
                         attention_mask=None if self.config.attn_input_format == "thd" else attention_mask,

bionemo-recipes/recipes/llama3_native_te/train_ddp.py

Lines changed: 0 additions & 22 deletions
@@ -72,27 +72,6 @@ def main(args: DictConfig) -> float | None:
     # Create a device mesh for DDP. While this isn't strictly necessary, it mirrors the device mesh we create for FSDP2.
     device_mesh = init_device_mesh("cuda", mesh_shape=(dist_config.world_size,), mesh_dim_names=("dp",))
 
-<<<<<<< HEAD
-    # --- Model Configuration ---
-    # Create quantization recipes -- only used if FP8/FP4 is enabled in the config.
-    fp8_recipe = None
-    if args.fp8_config.enabled:
-        fp8_recipe = hydra.utils.get_class(args.fp8_config.fp8_recipe)(
-            fp8_format=Format[args.fp8_config.fp8_format], **args.fp8_config.fp8_recipe_kwargs
-        )
-
-    fp4_recipe = None
-    if args.fp4_config.enabled:
-        fp4_recipe = hydra.utils.get_class(args.fp4_config.fp4_recipe)(**args.fp4_config.fp4_recipe_kwargs)
-
-    # --- Model Initialization ---
-    if args.use_te:
-        config = NVLlamaConfig.from_pretrained(args.config_name_or_path, dtype=torch.bfloat16, **args.config_kwargs)
-        model = NVLlamaForCausalLM(config, fp8_recipe=fp8_recipe, fp4_recipe=fp4_recipe)
-    else:
-        config = LlamaConfig.from_pretrained(args.config_name_or_path, dtype=torch.bfloat16, **args.config_kwargs)
-        model = LlamaForCausalLM(config)
-=======
     if args.use_te:
         config_class = NVLlamaConfig
         model_class = NVLlamaForCausalLM
@@ -141,7 +120,6 @@ def main(args: DictConfig) -> float | None:
         recipe=fp8_recipe, **args.fp8_config.quantized_model_init_kwargs
     ):
         model = model_class(config)
->>>>>>> 4067915d (adds llama3 MXFP8 NVFP4)
 
     logger.info("Initialized Model:\n%s", model)

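The surviving branch builds the recipe once and constructs the model inside a quantized-init context manager whose name sits above the hunk and is not shown; only its `recipe=...` and `**quantized_model_init_kwargs` arguments are visible. A sketch assuming TE's `fp8_model_init`, which accepts those same arguments:

```python
# Sketch of the unified init path. Assumption: the context manager hidden
# above the hunk is te.fp8_model_init; the diff shows only its arguments.
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling, Format

fp8_recipe = DelayedScaling(fp8_format=Format.HYBRID)
quantized_model_init_kwargs = {"enabled": False}  # mirrors defaults.yaml above

# With enabled=True, weights are allocated directly in the quantized format
# instead of being quantized from high precision on the first forward pass.
with te.fp8_model_init(recipe=fp8_recipe, **quantized_model_init_kwargs):
    model = te.Linear(256, 256)  # stand-in for model_class(config)
```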
bionemo-recipes/recipes/llama3_native_te/train_fsdp2.py

Lines changed: 0 additions & 24 deletions
@@ -74,29 +74,6 @@ def main(args: DictConfig) -> float | None:
 
     device_mesh = init_device_mesh("cuda", mesh_shape=(dist_config.world_size,), mesh_dim_names=("dp",))
 
-<<<<<<< HEAD
-    # --- Model Configuration ---
-    # Create quantization recipes -- only used if FP8/FP4 is enabled in the config.
-    fp8_recipe = None
-    if args.fp8_config.enabled:
-        fp8_recipe = hydra.utils.get_class(args.fp8_config.fp8_recipe)(
-            fp8_format=Format[args.fp8_config.fp8_format], **args.fp8_config.fp8_recipe_kwargs
-        )
-
-    fp4_recipe = None
-    if args.fp4_config.enabled:
-        fp4_recipe = hydra.utils.get_class(args.fp4_config.fp4_recipe)(**args.fp4_config.fp4_recipe_kwargs)
-
-    # --- Model Initialization ---
-    if args.use_te:
-        config = NVLlamaConfig.from_pretrained(args.config_name_or_path, dtype=torch.bfloat16, **args.config_kwargs)
-        with torch.device("meta") if args.use_meta_device else nullcontext():
-            model = NVLlamaForCausalLM(config, fp8_recipe=fp8_recipe, fp4_recipe=fp4_recipe)
-    else:
-        config = LlamaConfig.from_pretrained(args.config_name_or_path, dtype=torch.bfloat16, **args.config_kwargs)
-        with torch.device("meta") if args.use_meta_device else nullcontext():
-            model = LlamaForCausalLM(config)
-=======
     if args.use_te:
         config_class = NVLlamaConfig
         model_class = NVLlamaForCausalLM
@@ -152,7 +129,6 @@ def main(args: DictConfig) -> float | None:
         ),
     ):
         model = model_class(config)
->>>>>>> 4067915d (adds llama3 MXFP8 NVFP4)
 
     logger.info("Initialized Model:\n%s", model)

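This trainer keeps the meta-device guard, `torch.device("meta") if args.use_meta_device else nullcontext()`, so parameters are allocated shape-only and FSDP2 can decide sharding before real GPU memory is committed. A self-contained illustration, with a plain `nn.Linear` standing in for `model_class(config)`:

```python
# Meta-device init: parameters get shape and dtype but no storage.
from contextlib import nullcontext

import torch
import torch.nn as nn

use_meta_device = True
with torch.device("meta") if use_meta_device else nullcontext():
    model = nn.Linear(4096, 4096)  # stand-in for model_class(config)

print(model.weight.device)  # prints "meta"; materialize later with to_empty()
```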
bionemo-recipes/recipes/llama3_native_te/train_fsdp2_cp.py

Lines changed: 0 additions & 23 deletions
@@ -81,33 +81,11 @@ def main(args: DictConfig) -> float | None:
     logger.info("Created device mesh: %s", device_mesh)
 
     # --- Model Configuration ---
-<<<<<<< HEAD
-<<<<<<< HEAD
-    # Create quantization recipes -- only used if FP8/FP4 is enabled in the config.
-    fp8_recipe = None
-    if args.fp8_config.enabled:
-        fp8_recipe = hydra.utils.get_class(args.fp8_config.fp8_recipe)(
-            fp8_format=Format[args.fp8_config.fp8_format], **args.fp8_config.fp8_recipe_kwargs
-        )
-
-    fp4_recipe = None
-    if args.fp4_config.enabled:
-        fp4_recipe = hydra.utils.get_class(args.fp4_config.fp4_recipe)(**args.fp4_config.fp4_recipe_kwargs)
-
-    # --- Model Initialization ---
-    config = NVLlamaConfig.from_pretrained(args.config_name_or_path, dtype=torch.bfloat16, **args.config_kwargs)
-
-    with torch.device("meta") if args.use_meta_device else nullcontext():
-        model = NVLlamaForCausalLM(config, fp8_recipe=fp8_recipe, fp4_recipe=fp4_recipe)
-=======
-    config = NVLlamaConfig.from_pretrained(args.config_name_or_path, dtype=torch.bfloat16, **args.config_kwargs)
-=======
     config = NVLlamaConfig.from_pretrained(
         args.config_name_or_path,
         dtype=torch.float32 if args.use_fp32_master_weights else torch.bfloat16,
         **args.config_kwargs,
     )
->>>>>>> 80e4897e (fixed quant stats init and adds fp32 master weights)
 
     # Resolve layer-wise quantization assignments and store on config.
     layer_precision = resolve_layer_precision(
@@ -150,7 +128,6 @@ def main(args: DictConfig) -> float | None:
         ),
     ):
         model = NVLlamaForCausalLM(config)
->>>>>>> 4067915d (adds llama3 MXFP8 NVFP4)
 
     logger.info("Initialized Model:\n%s", model)

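`resolve_layer_precision` is new in this file and the hunk cuts off before its arguments, so its real signature is not shown. A purely hypothetical sketch of what "resolve layer-wise quantization assignments" could mean; every name below is illustrative:

```python
# Hypothetical reading of resolve_layer_precision: map each decoder layer
# index to a precision tag that get_layer_autocast() can later translate
# into a recipe. All names here are illustrative, not the recipe's API.
def resolve_layer_precision(num_hidden_layers: int, overrides: dict[int, str]) -> list[str]:
    precisions = ["fp8"] * num_hidden_layers  # default to the outer recipe
    for layer_number, precision in overrides.items():
        precisions[layer_number] = precision  # e.g. keep first/last layers in bf16
    return precisions

print(resolve_layer_precision(4, {0: "bf16", 3: "bf16"}))
# ['bf16', 'fp8', 'fp8', 'bf16']
```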