This repository was archived by the owner on Aug 7, 2024. It is now read-only.
early return by check tensor already casted or not #233
Closed
Changes from 3 commits
Commits (8)
a104f02  rename use_activation_hooks to cast_activation (wanchaol)
be02472  format (wanchaol)
0136d1f  switch to have tensor_casted_to_fp8 util function instead (wanchaol)
eea4595  remove conftest (wanchaol)
020642a  recursive check already casted (wanchaol)
e5ec3bf  Merge branch 'main' into cast_activation (wanchaol)
a1cb6ef  lint (wanchaol)
f036d59  Merge branch 'main' into cast_activation (wanchaol)
The first hunk drops `use_activation_hooks` from `from_float`:

```diff
@@ -304,16 +304,15 @@ def forward(self, x):
         return y

     @classmethod
-    def from_float(cls, mod, emulate: bool = False, use_activation_hooks: bool = False):
+    def from_float(cls, mod, emulate: bool = False):
         """
         Create an nn.Linear with fp8 compute from a regular nn.Linear

         Args:
             mod (torch.nn.Linear): nn.Linear to convert
             emulate (bool): whether to emulate fp8 matmul logic in float32
-            use_activation_hooks (bool): whether to use activation hooks instead of inlining the casting logic
+            cast_activation (bool): whether to use activation hooks instead of inlining the casting logic
         """
-        assert not use_activation_hooks, "use_activation_hooks is not supported yet!"
         # TODO Follow up! This is a great idea but we need the mixin base to create real
         # Tensors and the Linear base to create empty params
         # with torch.device("meta"):
```

Review thread on the `cast_activation (bool)` docstring line:
- nit: This looks like it should be removed.
- I think this was removed in the next PR.
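For orientation, a minimal sketch of what a call site looks like after this change. `Fp8LinearSketch` is a hypothetical stand-in, since the hunk above does not show the class name or the real implementation; it only illustrates that `from_float` now takes just the module and the `emulate` flag, so callers that previously passed `use_activation_hooks=False` simply drop that argument.

```python
import torch.nn as nn


class Fp8LinearSketch(nn.Linear):
    """Hypothetical stand-in for the fp8 linear subclass whose from_float is
    shown in the hunk above; only the signature change is illustrated."""

    @classmethod
    def from_float(cls, mod: nn.Linear, emulate: bool = False) -> "Fp8LinearSketch":
        # Reuse the original shape and parameters; the real implementation also
        # sets up fp8 scales/buffers and the (optionally emulated) fp8 matmul.
        new_mod = cls(mod.in_features, mod.out_features, bias=mod.bias is not None)
        new_mod.weight = mod.weight
        if mod.bias is not None:
            new_mod.bias = mod.bias
        new_mod.emulate = emulate
        return new_mod


# After this PR: only the module and the emulate flag are passed.
fp8_mod = Fp8LinearSketch.from_float(nn.Linear(32, 16), emulate=True)
```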
The second hunk adds the `tensor_already_casted_to_fp8` helper:

```diff
@@ -14,6 +14,19 @@
 aten = torch.ops.aten


+def tensor_already_casted_to_fp8(tensor: torch.Tensor) -> bool:
+    """
+    Check if the tensor is already casted to fp8
+    """
+    if isinstance(tensor, Float8Tensor):
+        return True
+    elif isinstance(tensor, DTensor) and isinstance(tensor._local_tensor, Float8Tensor):
+        # TODO: shall we stick to public API and directly use tensor.to_local() here?
+        return True
+
+    return False
+
+
 def to_fp8_no_autograd(
     x: torch.Tensor, x_scale: torch.Tensor, float8_dtype: torch.dtype, emulate: bool
 ) -> "Float8Tensor":
```

Review comment on the `DTensor` branch: I wonder if, in general, subclasses composing with other subclasses should have a generic way to determine the nested types.
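To connect the helper to the PR title, here is a rough sketch of the early-return pattern it enables in a cast path. Every name ending in `_sketch` is hypothetical: the real helper is `tensor_already_casted_to_fp8` above, which checks for the `Float8Tensor` subclass (including one wrapped inside a `DTensor`) rather than a raw fp8 dtype, and the real cast goes through `to_fp8_no_autograd`. Assumes a PyTorch build that has the float8 dtypes.

```python
import torch


class Float8TensorSketch(torch.Tensor):
    """Hypothetical stand-in for the repo's Float8Tensor subclass."""
    pass


def already_casted_sketch(tensor: torch.Tensor) -> bool:
    # Simplified mirror of tensor_already_casted_to_fp8: peek through one
    # wrapper level (DTensor stores its shard in _local_tensor), then check
    # for the fp8 tensor subclass.
    inner = getattr(tensor, "_local_tensor", tensor)
    return isinstance(inner, Float8TensorSketch)


def cast_to_fp8_sketch(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # The early return this PR adds: if the activation was already cast
    # (e.g. by an activation hook or a preceding fp8 layer), skip re-casting.
    if already_casted_sketch(x):
        return x
    # Otherwise scale and convert; the real path wraps the result in a
    # Float8Tensor via to_fp8_no_autograd instead of returning a raw tensor.
    return (x * scale).to(torch.float8_e4m3fn)


y = cast_to_fp8_sketch(torch.randn(4, 4), torch.tensor(1.0))
print(y.dtype)  # torch.float8_e4m3fn
```

The `DTensor` branch presumably matters because, under tensor parallelism, the outer object is a `DTensor` whose local shard may already be a `Float8Tensor`, so checking only the outer type would miss the already-casted case.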
For `cast_to_float8_e5m2_bw`, unfortunately I can't do a forward-only check and have to check the backward gradients to see if they are already casted, since the forward only has `y` but not `grad_y`.
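A rough sketch of what that backward-side check could look like. Names ending in `_sketch` are hypothetical; in the actual code the check would live in the backward of the autograd function behind `cast_to_float8_e5m2_bw` (per the comment above, the forward only sees `y`), and it would use `tensor_already_casted_to_fp8` on the `Float8Tensor` subclass rather than a plain dtype test. Assumes a PyTorch build that has the float8 dtypes.

```python
import torch


def grad_already_fp8_sketch(g: torch.Tensor) -> bool:
    # Simplified stand-in for tensor_already_casted_to_fp8 applied to a
    # gradient; here we only inspect the dtype.
    return g.dtype in (torch.float8_e4m3fn, torch.float8_e5m2)


def cast_grad_to_e5m2_sketch(grad_y: torch.Tensor) -> torch.Tensor:
    """Backward-side cast: grad_y only exists here, never in forward."""
    if grad_already_fp8_sketch(grad_y):
        # Early return: the incoming gradient was already cast upstream.
        return grad_y
    return grad_y.to(torch.float8_e5m2)


g = torch.randn(8)
g_fp8 = cast_grad_to_e5m2_sketch(g)
assert g_fp8.dtype == torch.float8_e5m2
assert cast_grad_to_e5m2_sketch(g_fp8) is g_fp8  # second call returns early
```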