From c4c9ae82a1d5c536a25d505ec67edcc4ccb6b8a4 Mon Sep 17 00:00:00 2001 From: vasiliy Date: Fri, 26 Jul 2024 16:30:19 -0700 Subject: [PATCH] Update on "[wip] add axiswise granularity to Float8Tensor" Summary: This PR adds the axiswise scaling granularity to `Float8Tensor` and ensures that basic ops like transpose and `torch._scaled_mm` work as expected. A future PR will add integration with `Float8Linear`. Test Plan: TODO Reviewers: Subscribers: Tasks: Tags: [ghstack-poisoned] --- float8_experimental/float8_ops.py | 15 +++++- float8_experimental/float8_python_api.py | 7 --- float8_experimental/float8_utils.py | 4 +- test/test_base.py | 61 ++++++++++++++++++------ 4 files changed, 62 insertions(+), 25 deletions(-) diff --git a/float8_experimental/float8_ops.py b/float8_experimental/float8_ops.py index 588d48a..da5aec4 100644 --- a/float8_experimental/float8_ops.py +++ b/float8_experimental/float8_ops.py @@ -41,7 +41,7 @@ def decorator(func): @implements( [ - aten.view.default, + # aten.view.default, aten._unsafe_view.default, aten.as_strided.default, aten.clone.default, @@ -79,6 +79,19 @@ def float8_desugar_data_and_scale(aten_op, args, kwargs=None): args[0]._gemm_input_role, ) +@implements([aten.view.default]) +def float8_view(aten_op, args, kwargs=None): + if len(args[0]._scale.shape) < 2: + # tensorwise scaling + return float8_desugar_op(aten_op, *args, **kwargs) + print('args', args) + print('kwargs', kwargs) + tensor, new_shape = args[0], args[1] + + # for now, only support reshaping to [-1, *dims] or [*dims, -1] + if len(new_shape) >= 2 and (new_shape[0] == -1 or new_shape[-1] == -1): + return float8_desugar_data_and_scale(aten_op, *args, **kwargs) + raise AssertionError(f"{aten_op} with axiswise scaling and shape {new_shape} is not supported yet.") @implements([aten.split.Tensor]) def float8_split(aten_op, args, kwargs=None): diff --git a/float8_experimental/float8_python_api.py b/float8_experimental/float8_python_api.py index 001eff4..59edd8d 100644 --- a/float8_experimental/float8_python_api.py +++ b/float8_experimental/float8_python_api.py @@ -39,13 +39,6 @@ def addmm_float8_unwrapped( a_inverse_scale = a_scale.reciprocal() b_inverse_scale = b_scale.reciprocal() - # TODO: should we change torch._scaled_mm? - # torch._scaled_mm expects rowwise scaled scales to be of rank 1, not rank - # 2. Translate to this format. - # TODO: audit if we need to make this more generic for various shapes. - a_inverse_scale = a_inverse_scale.squeeze() - b_inverse_scale = b_inverse_scale.squeeze() - if output_dtype == torch.float32 and bias is not None: # Bias is not supported by _scaled_mm when output is fp32 output = torch._scaled_mm( diff --git a/float8_experimental/float8_utils.py b/float8_experimental/float8_utils.py index 26fde8a..500d05e 100644 --- a/float8_experimental/float8_utils.py +++ b/float8_experimental/float8_utils.py @@ -115,9 +115,9 @@ def tensor_to_amax( # convert from axiswise_dim (dim to keep) to # dim as the input to the `torch.amax` function (tuple of dims to reduce) - dim_to_reduce = tuple(d for d in range(len(x.shape)) if d != axiswise_dim) + # dim_to_reduce = tuple(d for d in range(len(x.shape)) if d != axiswise_dim) - amax = torch.amax(torch.abs(x), dim=dim_to_reduce, keepdim=True) + amax = torch.amax(torch.abs(x), dim=axiswise_dim, keepdim=True) # If the user asked for distributed reduction, do it. # If the user did not ask for it, assume that it will diff --git a/test/test_base.py b/test/test_base.py index 38fed52..2825fc5 100644 --- a/test/test_base.py +++ b/test/test_base.py @@ -63,7 +63,7 @@ def bitwise_identical(a: Float8Tensor, b: Float8Tensor) -> bool: return True -class TestFloat8Tensor(unittest.TestCase): +class TestFloat8Tensor: def test_preserves_dtype(self) -> None: # hp means high precision, lp means low precision hp_dtypes = (torch.float32, torch.float16, torch.bfloat16) @@ -73,7 +73,7 @@ def test_preserves_dtype(self) -> None: x1_s = tensor_to_scale(x1_hp, lp_dtype) x2_lp = hp_tensor_and_scale_to_float8(x1_hp, x1_s, lp_dtype) x3_hp = x2_lp.to_original_precision() - self.assertTrue(x3_hp.dtype == hp_dtype) + assert x3_hp.dtype == hp_dtype def test_differentiable_casts(self) -> None: lp_dtypes = (e4m3_dtype, e5m2_dtype) @@ -108,7 +108,7 @@ def test_index_put(self): fp8_b = hp_tensor_and_scale_to_float8(b, scale_a, torch.float8_e4m3fn) fp8_b_bad = hp_tensor_and_scale_to_float8(b, scale_b, torch.float8_e4m3fn) - with self.assertRaises(AssertionError): + with pytest.raises(AssertionError): b[index] = fp8_a fp8_b[index] = a fp8_b_bad[index] = fp8_a @@ -122,7 +122,7 @@ def test_copy_(self): b = torch.empty(16, dtype=torch.bfloat16) b.copy_(fp8_a) # Should work torch.testing.assert_close(b, fp8_a.to_original_precision()) - with self.assertRaises(RuntimeError): + with pytest.raises(RuntimeError): fp8_a.copy_(b) # Should fail fp8_b = Float8Tensor( @@ -149,9 +149,33 @@ def test_weights_only_load(self): buffer.seek(0) _ = torch.load(buffer, weights_only=True) - def test_axiswise_dynamic_cast(self): - a = torch.randn(16, 32, dtype=torch.bfloat16) + @pytest.mark.parametrize("shape", [(8, 16), (4, 8, 16), (2, 4, 8, 16)]) + @pytest.mark.parametrize("dim_name", ["first", "last"]) + def test_axiswise_dynamic_cast(self, shape, dim_name): + a = torch.randn(*shape, dtype=torch.bfloat16) + + if dim_name == "first": + dim = 0 + elif dim_name == "last": + dim = len(a.shape) - 1 + linear_mm_config = LinearMMConfig() + a_fp8 = hp_tensor_to_float8_dynamic( + a, + e4m3_dtype, + linear_mm_config, + scaling_granularity=ScalingGranularity.AXISWISE, + axiswise_dim=dim, + ) + a_dq = a_fp8.to_original_precision() + sqnr = compute_error(a, a_dq) + assert sqnr >= 25.0 + + # TODO(next) make this work + def test_axiswise_reshape(self): + a = torch.randn(3, 5, 7, dtype=torch.bfloat16, device="cuda") + linear_mm_config = LinearMMConfig() + a_fp8 = hp_tensor_to_float8_dynamic( a, e4m3_dtype, @@ -159,11 +183,15 @@ def test_axiswise_dynamic_cast(self): scaling_granularity=ScalingGranularity.AXISWISE, axiswise_dim=0, ) - # print(a_fp8) - # print(a_fp8.to_original_precision()) - # print(a_fp8.t()) - b = a_fp8.t() - # TODO check numerical accuracy + # a_fp8._data.shape is (3, 5, 7) + # a_fp8._scale.shape is (1, 5, 7) + print(a_fp8._scale.shape) + + # reshape to (3, 5 * 7) + # a_fp8._scale.shape should be (1, 5 * 7) + a_fp8_r = a_fp8.reshape(3, -1) + print(a_fp8_r._scale.shape) + def test_axiswise_gemm(self): a = torch.randn(16, 32, dtype=torch.bfloat16, device="cuda") @@ -177,7 +205,7 @@ def test_axiswise_gemm(self): linear_mm_config, gemm_input_role=GemmInputRole.INPUT, scaling_granularity=ScalingGranularity.AXISWISE, - axiswise_dim=0, + axiswise_dim=1, ) b_fp8 = hp_tensor_to_float8_dynamic( b, @@ -185,10 +213,13 @@ def test_axiswise_gemm(self): linear_mm_config, gemm_input_role=GemmInputRole.WEIGHT, scaling_granularity=ScalingGranularity.AXISWISE, - axiswise_dim=0, + axiswise_dim=1, ) - c = torch.mm(a_fp8, b_fp8.t()) - print(c) + c_fp8_compute = torch.mm(a_fp8, b_fp8.t()) + print(c_fp8_compute) + c_ref = torch.mm(a, b.t()) + sqnr = compute_error(c_ref, c_fp8_compute) + print('sqnr', sqnr) # TODO check numerical accuracy