This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 27a3277

vkuzo authored and facebook-github-bot committed
move most previously-global configs to Float8LinearConfig (#324)
Summary:
Pull Request resolved: #324

Adds a `Float8LinearConfig` to unify the user facing per-linear configuration, and moves most of the previously global config options there. In future PRs (to keep PRs small), we will move emulation, scaling and gemm configurations to also live here.

Reviewed By: weifengpy

Differential Revision: D60176981

fbshipit-source-id: 84ed7a2d0d72aee425f870786b56b8bd641595b1
1 parent 603efc2 commit 27a3277

File tree

9 files changed: +183 additions, −109 deletions
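
For orientation before the per-file diffs, here is a minimal sketch of the migration this commit makes, based only on the API shown in the diffs below; the toy `nn.Sequential` model is a placeholder and the sketch assumes the `float8_experimental` package at this commit is installed.

```python
import torch.nn as nn

from float8_experimental import Float8LinearConfig, swap_linear_with_float8_linear

# Previously, these flags lived on a global `float8_experimental.config` module
# and were mutated in place:
#
#   from float8_experimental import config
#   config.enable_amax_init = False
#   config.enable_pre_and_post_forward = False
#
# After this commit, the same options are passed explicitly per swap via a
# frozen dataclass:
m = nn.Sequential(nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 8))
config = Float8LinearConfig(
    enable_amax_init=False,
    enable_pre_and_post_forward=False,
)
swap_linear_with_float8_linear(m, config=config)
```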

README.md

Lines changed: 12 additions & 5 deletions

@@ -91,20 +91,27 @@ from float8_experimental.float8_linear import TensorScalingType
 # create model
 m = Model(...)
 
+# optional: configure for compatibility with FSDP. Note that workarounds
+# gated with config.enable_amax_init and
+# config.enable_pre_and_post_forward are needed for
+# autocast + compile + FSDP + float8 to work
+from float8_experimental import Float8LinearConfig
+config = Float8LinearConfig(
+    enable_amax_init=False,  # only needed for autocast + compile + FSDP + float8 delayed
+    enable_pre_and_post_forward=False,  # only needed for autocast + compile + FSDP + float8 delayed
+)
+
 # convert all `torch.nn.Linear` modules to `Float8Linear`, specifying scaling
 # type
 swap_linear_with_float8_linear(
     m,
     scaling_type_input=TensorScalingType.DELAYED,
     scaling_type_weight=TensorScalingType.DELAYED,
     scaling_type_grad_output=TensorScalingType.DELAYED,
+    config=config,
 )
 
-# optional: use FSDP. Note that workarounds gated with config.enable_amax_init and
-# config.enable_pre_and_post_forward are needed for autocast + compile + FSDP + float8 to work
-from float8_experimental import config
-config.enable_amax_init = False # only needed for autocast + compile + FSDP + float8 delayed
-config.enable_pre_and_post_forward = False # only needed for autocast + compile + FSDP + float8 delayed
+# optional: use FSDP
 model = FSDP(model, use_orig_params=True)
 
 # optional: enable torch.compile for improved performance

float8_experimental/__init__.py

Lines changed: 11 additions & 1 deletion

@@ -4,7 +4,9 @@
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
 # Lets define a few top level things here
+from float8_experimental.config import Float8LinearConfig
 from float8_experimental.float8_linear import Float8Linear
+from float8_experimental.float8_linear_utils import swap_linear_with_float8_linear
 from float8_experimental.float8_tensor import (
     Float8Tensor,
     GemmInputRole,
@@ -17,4 +19,12 @@
 
 add_safe_globals([Float8Tensor, ScaledMMConfig, GemmInputRole, LinearMMConfig])
 
-__all__ = ["Float8Tensor", "Float8Linear"]
+__all__ = [
+    # configuration
+    "Float8LinearConfig",
+    # top level UX
+    "swap_linear_with_float8_linear",
+    # TODO(future): remove Float8Tensor and Float8Linear from public API
+    "Float8Tensor",
+    "Float8Linear",
+]
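
A small sketch of what the expanded `__all__` enables, assuming the package at this commit is importable in your environment; the `print` is only there to surface the dataclass defaults.

```python
# Both the new config and the top-level swap helper are now part of the
# package's public surface, alongside the existing tensor/linear types.
from float8_experimental import (
    Float8Linear,
    Float8LinearConfig,
    Float8Tensor,
    swap_linear_with_float8_linear,
)

# The dataclass repr shows the default flag values in one place.
print(Float8LinearConfig())
```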

float8_experimental/config.py

Lines changed: 33 additions & 21 deletions

@@ -4,28 +4,40 @@
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
 
-# If True, on the first iteration of Float8Linear the amaxes will be
-# initialized with the incoming data. As of 2023-12-30, this doesn't work
-# with autocast + torch.compile + FSDP. Enabling this option is nice for
-# testing, but this is not necessary for real training jobs.
-enable_amax_init = True
-
-# If True, pre-forward and post-forward functions are run. As of 2023-12-30,
-# this doesn't work with autocast + torch.compile + FSDP. Enabling this
-# option is useful for safety, but not strictly necessary.
-enable_pre_and_post_forward = True
-
-# If True, then uses a tensor subclass for the fp8 linear module's weight that
-# implements pre/post-all-gather methods to do fp8 all-gather with FSDP2.
-# Only dynamic scaling is supported for now.
-enable_fsdp_fp8_all_gather = False
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class Float8LinearConfig:
+    """
+    Configuration for converting a `torch.nn.Linear` module to float8
+    for training.
+    """
+
+    # If True, on the first iteration of Float8Linear the amaxes will be
+    # initialized with the incoming data. As of 2023-12-30, this doesn't work
+    # with autocast + torch.compile + FSDP. Enabling this option is nice for
+    # testing, but this is not necessary for real training jobs.
+    enable_amax_init: bool = True
+
+    # If True, pre-forward and post-forward functions are run. As of 2023-12-30,
+    # this doesn't work with autocast + torch.compile + FSDP. Enabling this
+    # option is useful for safety, but not strictly necessary.
+    enable_pre_and_post_forward: bool = True
+
+    # If True, then uses a tensor subclass for the fp8 linear module's weight that
+    # implements pre/post-all-gather methods to do fp8 all-gather with FSDP2.
+    # Only dynamic scaling is supported for now.
+    enable_fsdp_fp8_all_gather: bool = False
+
+    # If True, then prior to performing the fp8 scaled matmul we will pad the
+    # inner dimension of a (dim 1) and b (dim 2) with 0s. This is needed for matmuls
+    # with _scaled_mm, since it has the strong constraint that for M,N,K, both N and K must be multiples of 16.
+    # This can cause a memory spike, however, so we keep this off by default.
+    pad_inner_dim: bool = False
+
 
 # If True, use 'fnuz' float8 types for calculations.
 # Currently, ROCm only supports fnuz variants.
+# TODO(future PR): move this to Float8LinearConfig
 use_fnuz_dtype = False
-
-# If True, then prior to performing the fp8 scaled mamtmul we will pad the
-# inner dimension of a (dim 1) and b (dim 2) with 0s. This is needed for matmuls
-# _scaled_mm since it has the strong constraint that for M,N,K N, K must be a multiple of 16.
-# This can cause a memory spike however so we keep this off by default.
-pad_inner_dim = False
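
Since `Float8LinearConfig` is declared with `frozen=True`, instances are immutable. A short sketch of the resulting usage pattern, with hypothetical flag values and assuming the package is installed:

```python
import dataclasses

from float8_experimental import Float8LinearConfig

config = Float8LinearConfig(pad_inner_dim=True)
print(config.enable_amax_init)  # True, the dataclass default

# Mutating a frozen dataclass raises, so the old "set a module-level flag
# later" pattern no longer applies; build a new instance instead.
try:
    config.enable_amax_init = False
except dataclasses.FrozenInstanceError:
    config = dataclasses.replace(config, enable_amax_init=False)
```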

float8_experimental/float8_linear.py

Lines changed: 25 additions & 10 deletions

@@ -12,10 +12,10 @@
 
 from typing import Optional
 
-import float8_experimental.config as config
-
 import torch
 
+from float8_experimental.config import Float8LinearConfig
+
 from float8_experimental.float8_dynamic_utils import (
     cast_to_float8_e4m3_dynamic,
     cast_to_float8_e5m2_dynamic_bw,
@@ -173,6 +173,7 @@ def __init__(self, *args, **kwargs):
         * `scaling_type_input`: delayed vs dynamic scaling for `input`
         * `scaling_type_weight`: delayed vs dynamic scaling for `weight`
         * `scaling_type_grad_output`: delayed vs dynamic scaling for `grad_output`
+        * `config`: Float8LinearConfig
         """
 
         delayed_scaling_recipe = kwargs.pop(
@@ -188,6 +189,7 @@ def __init__(self, *args, **kwargs):
         scaling_type_grad_output = kwargs.pop(
             "scaling_type_grad_output", TensorScalingType.DYNAMIC
         )
+        config = kwargs.pop("config")
         super().__init__(*args, **kwargs)
 
         # Defines the scaling behavior of input, weight, grad_output
@@ -201,6 +203,8 @@ def __init__(self, *args, **kwargs):
             or self.scaling_type_grad_output is TensorScalingType.DELAYED
         )
 
+        self.config = config
+
         # TODO(future): have a unique recipe per buffer instead of one per
         # module, saving implementing that until we need it.
         # TODO(future): serialization for recipes
@@ -212,36 +216,42 @@ def __init__(self, *args, **kwargs):
         self.linear_mm_config = LinearMMConfig(
             # input
             ScaledMMConfig(
-                emulate, True if not emulate else False, False, config.pad_inner_dim
+                emulate,
+                True if not emulate else False,
+                False,
+                self.config.pad_inner_dim,
             ),
             # weight
             ScaledMMConfig(
-                emulate, True if not emulate else False, False, config.pad_inner_dim
+                emulate,
+                True if not emulate else False,
+                False,
+                self.config.pad_inner_dim,
             ),
             # grad_output
-            ScaledMMConfig(emulate, False, False, config.pad_inner_dim),
+            ScaledMMConfig(emulate, False, False, self.config.pad_inner_dim),
         )
 
         # Note: is_amax_initialized is not a buffer to avoid data dependent
         # control flow visible to dynamo
         # TODO(future PR): add serialization for this flag
-        self.is_amax_initialized = not config.enable_amax_init
+        self.is_amax_initialized = not self.config.enable_amax_init
 
         # Syncing of amaxes and scales happens outside of this function. This
         # flag is here to enforce that the user does not forget to do this.
-        self.amax_and_scale_synced = not config.enable_amax_init
+        self.amax_and_scale_synced = not self.config.enable_amax_init
 
         # This is needed to properly handle autocast in the amax/scale
         # update function for torch.float16
         self.last_seen_input_dtype = None
 
         # pre_forward and post_forward are currently broken with FSDP
         # and torch.compile, this option can disable them
-        # Note that when using `config.enable_pre_and_post_forward = False`,
-        # it's recommended to also set `config.enable_amax_init = False`.
+        # Note that when using `self.config.enable_pre_and_post_forward = False`,
+        # it's recommended to also set `self.config.enable_amax_init = False`.
         # Otherwise, the amax buffer would never be marked as initialized and
         # would be initialized in every iteration.
-        self.enable_pre_and_post_forward = config.enable_pre_and_post_forward
+        self.enable_pre_and_post_forward = self.config.enable_pre_and_post_forward
 
     def create_buffers(self):
         # Default values for history buffers, see above TODO
@@ -450,14 +460,18 @@ def from_float(
         scaling_type_input=TensorScalingType.DYNAMIC,
         scaling_type_weight=TensorScalingType.DYNAMIC,
         scaling_type_grad_output=TensorScalingType.DYNAMIC,
+        config: Optional[Float8LinearConfig] = None,
     ):
         """
         Create an nn.Linear with fp8 compute from a regular nn.Linear
 
         Args:
            mod (torch.nn.Linear): nn.Linear to convert
            emulate (bool): whether to emulate fp8 matmul logic in float32
+           config (Optional[Float8LinearConfig]): configuration for conversion to float8
        """
+        if config is None:
+            config = Float8LinearConfig()
        with torch.device("meta"):
            new_mod = cls(
                mod.in_features,
@@ -467,6 +481,7 @@ def from_float(
                scaling_type_weight=scaling_type_weight,
                scaling_type_grad_output=scaling_type_grad_output,
                emulate=emulate,
+                config=config,
            )
            new_mod.weight = mod.weight
            new_mod.bias = mod.bias
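
A minimal sketch of the updated `Float8Linear.from_float` call, under the assumption that the conversion itself does not require fp8-capable hardware (only running the fp8 matmuls does); the layer sizes are arbitrary.

```python
import torch.nn as nn

from float8_experimental import Float8Linear, Float8LinearConfig

lin = nn.Linear(128, 256)

# `config` is optional; when omitted, from_float falls back to Float8LinearConfig().
fp8_lin = Float8Linear.from_float(lin, config=Float8LinearConfig(pad_inner_dim=True))

# The config is stored on the module and plumbed into its ScaledMMConfigs.
print(fp8_lin.config.pad_inner_dim)  # True
```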

float8_experimental/float8_linear_utils.py

Lines changed: 6 additions & 0 deletions

@@ -9,6 +9,7 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+from float8_experimental.config import Float8LinearConfig
 from float8_experimental.float8_linear import Float8Linear, TensorScalingType
 
 from float8_experimental.float8_utils import (
@@ -135,6 +136,7 @@ def swap_linear_with_float8_linear(
     scaling_type_input: TensorScalingType = TensorScalingType.DYNAMIC,
     scaling_type_weight: TensorScalingType = TensorScalingType.DYNAMIC,
     scaling_type_grad_output: TensorScalingType = TensorScalingType.DYNAMIC,
+    config: Float8LinearConfig = None,
 ) -> Optional[nn.Module]:
     """
     Swaps `torch.nn.Linear` in `module` with `Float8Linear`.
@@ -148,16 +150,20 @@
         scaling_type_input (TensorScalingType): scaling type for `input`
         scaling_type_weight (TensorScalingType): scaling type for `weight`
         scaling_type_grad_output (TensorScalingType): scaling type for `grad_output`
+        config (Float8LinearConfig): configuration for conversion to float8
 
     Returns:
         nn.Module: The modified module with swapped linear layers.
     """
+    if config is None:
+        config = Float8LinearConfig()
     from_float = lambda m: Float8Linear.from_float(
         m,
         emulate=emulate,
         scaling_type_input=scaling_type_input,
         scaling_type_weight=scaling_type_weight,
         scaling_type_grad_output=scaling_type_grad_output,
+        config=config,
     )
     return swap_linear_layers(
         module,
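
Given the `if config is None` default added above, the following two calls are intended to be equivalent; a sketch assuming the package is installed, with a toy model standing in for real layers.

```python
import torch.nn as nn

from float8_experimental import Float8LinearConfig, swap_linear_with_float8_linear

model_a = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 8))
model_b = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 8))

# Omitting `config` now means "use Float8LinearConfig() with all defaults".
swap_linear_with_float8_linear(model_a)
swap_linear_with_float8_linear(model_b, config=Float8LinearConfig())
```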

float8_experimental/inference.py

Lines changed: 7 additions & 3 deletions

@@ -12,8 +12,6 @@
 from enum import auto, Enum
 from typing import Callable, List, Optional
 
-import float8_experimental.config as config
-
 import torch
 import torch.nn as nn
 from float8_experimental.float8_linear_utils import swap_linear_layers
@@ -55,6 +53,12 @@ class QuantConfig:
     activation_casting: ActivationCasting
     static_quantization_scale: Optional[torch.Tensor] = None
 
+    # If True, then prior to performing the fp8 scaled matmul we will pad the
+    # inner dimension of a (dim 1) and b (dim 2) with 0s. This is needed for matmuls
+    # with _scaled_mm, since it has the strong constraint that for M,N,K, both N and K must be multiples of 16.
+    # This can cause a memory spike, however, so we keep this off by default.
+    pad_inner_dim = False
+
     def __post_init__(self):
         if self.activation_casting == ActivationCasting.STATIC:
             assert isinstance(
@@ -151,7 +155,7 @@ def from_float(
         quant_config (QuantConfig): Configuration for the weight and activation casting
         """
         forward_config = ScaledMMConfig(
-            False, use_fast_accum, pad_inner_dim=config.pad_inner_dim
+            False, use_fast_accum, pad_inner_dim=quant_config.pad_inner_dim
         )
         linear_mm_config = LinearMMConfig(
             forward_config, forward_config, forward_config
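
For the inference path, `pad_inner_dim` now lives on `QuantConfig` (as a class-level default) rather than on the removed global config module. A sketch, assuming `QuantConfig` is a standard dataclass as its field annotations suggest and that the static-casting assert in `__post_init__` checks the scale is a tensor:

```python
import torch

from float8_experimental.inference import ActivationCasting, QuantConfig

# STATIC activation casting requires a pre-computed scale, enforced in __post_init__.
quant_config = QuantConfig(
    activation_casting=ActivationCasting.STATIC,
    static_quantization_scale=torch.tensor(1.0),
)

# pad_inner_dim defaults to False at the class level; from_float reads it off
# the quant_config it is given when building the forward ScaledMMConfig.
print(quant_config.pad_inner_dim)  # False
```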
