facebookexperimental · timmoon10 · Jul 28, 2023
diff --git a/protoquant/float8/float8_tensor.py b/protoquant/float8/float8_tensor.py
@@ -60,6 +60,7 @@ class Float8Tensor(torch.Tensor):
     def __new__(cls, data, scale, flavor):
         # This is a non-differentiable constructor!
         assert not data.requires_grad
+        assert not scale.requires_grad
         # TODO(future): make bits8 easier to work with and switch to using it
         # assert data.dtype == torch.bits8
         assert scale.dtype == torch.float32

diff --git a/protoquant/float8/float8_utils.py b/protoquant/float8/float8_utils.py
@@ -162,12 +162,14 @@ def float8_to_float32(x, flavor):
     else:  # e5m2
         return _hfp8_to_float(x, E5M2_EBITS, E5M2_EXP_BIAS)
 
+@torch.no_grad()  # the scale computation below is not recorded by autograd
 def amax_to_scale(amax, flavor):
     if flavor == E4M3:
         return E4M3_MAX_POS / torch.clamp(amax, min=EPS)
     else:  # e5m2
         return E5M2_MAX_POS / torch.clamp(amax, min=EPS)
 
+@torch.no_grad()  # the abs/max reduction below is not recorded by autograd
 def tensor_to_scale(x, flavor):
     amax = torch.max(torch.abs(x))
     return amax_to_scale(amax, flavor)