
Commit 86a2505
for testing, remove later
Signed-off-by: Kirthi Shankar Sivamani <[email protected]>
ksivaman committed Mar 8, 2024
1 parent: a54c32b

Showing 2 changed files with 2 additions and 2 deletions.

transformer_engine/pytorch/attention.py (1 addition, 1 deletion)
@@ -2400,7 +2400,7 @@ def __init__(
         assert (num_attention_heads % self.num_gqa_groups == 0
                 ), "The number of attention heads must be divisible by the number of GQA groups!"
 
-        if sequence_parallel or get_rng_state_tracker is None:
+        if True: #sequence_parallel or get_rng_state_tracker is None:
             attention_dropout_ctx = nullcontext
         else:
             attention_dropout_ctx = get_rng_state_tracker().fork
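
The hunk above hard-codes the dropout context to a no-op: attention dropout
no longer runs under a forked RNG state from the tracker. A minimal sketch of
the selection logic that the change short-circuits (the helper function is
hypothetical; the names come from the diff):

    from contextlib import nullcontext

    def pick_attention_dropout_ctx(sequence_parallel, get_rng_state_tracker):
        # Original behavior: fork the RNG state tracker for dropout unless
        # sequence parallelism is enabled or no tracker factory was given.
        if sequence_parallel or get_rng_state_tracker is None:
            return nullcontext
        return get_rng_state_tracker().fork

    # With "if True:", the no-op branch is taken unconditionally:
    attention_dropout_ctx = nullcontext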

transformer_engine/pytorch/fp8.py (1 addition, 1 deletion)
@@ -429,7 +429,7 @@ def fp8_autocast(
     enabled: bool = True,
     calibrating: bool = False,
     fp8_recipe: Optional[DelayedScaling] = None,
-    fp8_group: Optional[dist_group_type] = None,
+    fp8_group: Optional[dist_group_type] = -100, #None,
 ) -> None:
     """
     Context manager for FP8 usage.
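
For context, fp8_autocast is TransformerEngine's context manager for running
supported modules in FP8. A minimal usage sketch against the signature shown
above (the module, shapes, and recipe values are illustrative assumptions,
not part of this commit):

    import torch
    import transformer_engine.pytorch as te
    from transformer_engine.common.recipe import DelayedScaling

    model = te.Linear(768, 768).cuda()         # any TE module works here
    inp = torch.randn(32, 768, device="cuda")

    fp8_recipe = DelayedScaling()              # default delayed-scaling recipe

    # fp8_group normally defaults to None (use the default process group);
    # the -100 placeholder in this commit is a testing value, not a valid
    # torch.distributed process group.
    with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
        out = model(inp)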
