Adds float8 support back to nanoGPT #446

Draft: wants to merge 1 commit into main
6 changes: 6 additions & 0 deletions tripy/examples/nanogpt/README.md
@@ -80,3 +80,9 @@ To run with a quantization mode, pass `--quant-mode` to `example.py`. The suppor
```
Tripy: TEST: EXPECTED_STDOUT End
-->

3. float8 quantization:

```bash
python3 example.py --input-text "What is the answer to life, the universe, and everything?" --seed=0 --quant-mode float8
```
2 changes: 1 addition & 1 deletion tripy/examples/nanogpt/example.py
@@ -72,7 +72,7 @@ def main():
"--quant-mode",
type=str,
help="Quantization mode.",
choices=["int8-weight-only", "int4-weight-only"],
choices=["int8-weight-only", "int4-weight-only", "float8"],
)

args = parser.parse_args()
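The `example.py` change only registers the new mode with argparse; everything else is driven by the model and quantization modules below. A self-contained sketch of the resulting CLI behavior (only the argument definition is taken from this diff):

```python
import argparse

# Mirror of the --quant-mode argument after this change.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--quant-mode",
    type=str,
    help="Quantization mode.",
    choices=["int8-weight-only", "int4-weight-only", "float8"],
)

args = parser.parse_args(["--quant-mode", "float8"])
print(args.quant_mode)  # -> "float8"
# Passing an unregistered value such as "fp8" makes argparse exit with "invalid choice".
```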
13 changes: 10 additions & 3 deletions tripy/examples/nanogpt/model.py
@@ -43,6 +43,9 @@ def linear_layer(config: GPTConfig, in_feat, out_feat, bias):
elif config.quant_mode == "int4-weight-only":
quant_kwargs["quant_dtype"] = tp.int4
quant_kwargs["weight_quant_dim"] = None
elif config.quant_mode == "float8":
quant_kwargs["quant_dtype"] = tp.float8
quant_kwargs["weight_quant_dim"] = None

return tp.Linear(
in_feat,
@@ -73,7 +76,7 @@ def __call__(self, x: tp.Tensor):
qkv = self.c_attn(x) # (batch_size, seq_len, 3 * embedding_size)

# WAR for better accuracy and avoid TRT compilation error in fp16
if self.c_attn.quant_dtype == tp.int4:
if self.c_attn.quant_dtype in (tp.float8, tp.int4):
qkv = tp.cast(qkv, tp.float32)

q, k, v = tp.split(qkv, 3, dim=2)
@@ -156,8 +159,12 @@ def __init__(self, config):
), f"Cannot forward sequence of length {config.seq_len}, block size is only {config.block_size}"

self.transformer = Transformer(config)
# Quantization is disabled for `lm_head`
self.lm_head = tp.Linear(config.embedding_size, config.vocab_size, bias=False, dtype=config.dtype)

if config.quant_mode == "float8":
self.lm_head = linear_layer(config, config.embedding_size, config.vocab_size, bias=False)
else:
# Quantization is disabled for `lm_head` except for FP8.
self.lm_head = tp.Linear(config.embedding_size, config.vocab_size, bias=False, dtype=config.dtype)

def __call__(self, idx):
x = self.transformer(idx)
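In the `linear_layer` change above, the float8 branch sets the same quantization kwargs the int4 branch uses: `quant_dtype` selects the storage type and `weight_quant_dim=None` requests a single per-tensor scale rather than one scale per output channel. A minimal sketch of the layer that branch would produce, assuming `import tripy as tp` and that `tp.Linear` accepts these keyword arguments as the surrounding code implies:

```python
import tripy as tp  # import name assumed

# Sketch only: a float8 weight-quantized projection as linear_layer builds it
# when config.quant_mode == "float8". The sizes and dtype are illustrative.
qkv_proj = tp.Linear(
    768,        # in_feat, e.g. embedding_size
    3 * 768,    # out_feat, e.g. the fused QKV projection
    bias=True,
    dtype=tp.float16,
    quant_dtype=tp.float8,   # storage type for the quantized weights
    weight_quant_dim=None,   # None => one per-tensor scale
)
```

The second hunk widens the existing fp16 workaround so a float8-quantized `c_attn` also computes `qkv` in float32, and the last hunk quantizes `lm_head` only in float8 mode.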
2 changes: 2 additions & 0 deletions tripy/examples/nanogpt/quantization.py
@@ -40,6 +40,8 @@ def modelopt_quantize(model_hf, quant_mode):
}
elif quant_mode == "int4-weight-only":
quant_cfg = mtq.INT4_AWQ_CFG
elif quant_mode == "float8":
quant_cfg = mtq.FP8_DEFAULT_CFG
else:
raise NotImplementedError(f"Unsupported quantization mode: {quant_mode}")

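`FP8_DEFAULT_CFG` is ModelOpt's stock float8 recipe (per-tensor FP8 quantizers on weights and activations). As a rough sketch of how the selected config is typically applied, with the calibration loop as an assumed placeholder that is not part of this diff:

```python
import modelopt.torch.quantization as mtq

def calibration_forward_loop(model):
    # Placeholder: run a handful of representative batches so ModelOpt can
    # collect the amax statistics used to derive the float8 scales.
    for batch in calib_batches:  # calib_batches is assumed, not defined in this diff
        model(batch)

# model_hf is the Hugging Face model passed into modelopt_quantize above.
model_hf = mtq.quantize(model_hf, mtq.FP8_DEFAULT_CFG, calibration_forward_loop)
```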
2 changes: 1 addition & 1 deletion tripy/tests/conftest.py
@@ -29,7 +29,7 @@
from tripy.common.datatype import DATA_TYPES

skip_if_older_than_sm89 = pytest.mark.skipif(
torch.cuda.get_device_capability() < (8, 9), reason="Some features (e.g. fp8) are not available before SM90"
torch.cuda.get_device_capability() < (8, 9), reason="Some features (e.g. float8) are not available before SM89"
)

skip_if_older_than_sm80 = pytest.mark.skipif(
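Only the wording of the skip reason changes here; the marker itself is unchanged and is applied as an ordinary pytest decorator. A hypothetical example (the marker is presumably imported from this conftest by the test modules):

```python
@skip_if_older_than_sm89
def test_some_float8_feature():
    # Hypothetical: runs only on GPUs with compute capability 8.9 (Ada) or newer,
    # which is where hardware float8 support begins.
    ...
```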
6 changes: 3 additions & 3 deletions tripy/tests/integration/test_dequantize.py
@@ -63,12 +63,12 @@ def func(input):
output = torch.from_dlpack(dequantized)
assert torch.allclose(expected, output.to("cpu"))

# TODO(#161): Update fp8 test to use frontend representation
# TODO(#161): Update float8 test to use frontend representation
@pytest.mark.parametrize(
"dtype", [tp.float32, tp.float16, pytest.param(tp.bfloat16, marks=skip_if_older_than_sm80)]
)
@skip_if_older_than_sm89
def test_dequantize_fp8_per_tensor(self, dtype):
def test_dequantize_float8_per_tensor(self, dtype):
data_value = [1.0, 1.0]
input_tp = tp.Tensor(data_value, dtype=tp.float8)
scale = torch.tensor(0.5, dtype=TORCH_DTYPES[dtype])
@@ -84,7 +84,7 @@ def test_dequantize_fp8_per_tensor(self, dtype):
"dtype", [tp.float32, tp.float16, pytest.param(tp.bfloat16, marks=skip_if_older_than_sm80)]
)
@skip_if_older_than_sm89
def test_dequantize_fp8_per_channel(self, dtype):
def test_dequantize_float8_per_channel(self, dtype):
data_value = [[1.0, 1.0], [1.0, 1.0]]
input_tp = tp.Tensor(data_value, dtype=tp.float8)
scale = torch.tensor([0.8, 0.9], dtype=TORCH_DTYPES[dtype])
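The changes here are purely a rename from `fp8` to `float8` in the TODO and test names. For readers unfamiliar with the test shape, a condensed sketch of the per-tensor path these tests exercise; the exact `tp.dequantize` signature and the scale conversion are assumed from the surrounding test code:

```python
import torch
import tripy as tp  # import name assumed

# float8 values [1.0, 1.0] with a per-tensor scale of 0.5 dequantize to [0.5, 0.5].
input_tp = tp.Tensor([1.0, 1.0], dtype=tp.float8)
scale_tp = tp.Tensor(torch.tensor(0.5, dtype=torch.float16))
dequantized = tp.dequantize(input_tp, scale_tp, tp.float16)  # signature assumed

expected = torch.tensor([0.5, 0.5], dtype=torch.float16)
assert torch.allclose(expected, torch.from_dlpack(dequantized).to("cpu"))
```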
4 changes: 2 additions & 2 deletions tripy/tests/integration/test_quantize.py
@@ -73,7 +73,7 @@ def func(input):
"dtype", [tp.float32, tp.float16, pytest.param(tp.bfloat16, marks=skip_if_older_than_sm80)]
)
@skip_if_older_than_sm89
def test_quantize_fp8_per_tensor(self, dtype, eager_or_compiled):
def test_quantize_float8_per_tensor(self, dtype, eager_or_compiled):
input = torch.tensor([1.0, 2.0], dtype=TORCH_DTYPES[dtype])
scale = torch.tensor(0.5, dtype=TORCH_DTYPES[dtype])
input_tp = tp.Tensor(input, dtype=dtype)
@@ -96,7 +96,7 @@ def func(input):
"dtype", [tp.float32, tp.float16, pytest.param(tp.bfloat16, marks=skip_if_older_than_sm80)]
)
@skip_if_older_than_sm89
def test_quantize_fp8_per_channel(self, dtype, eager_or_compiled):
def test_quantize_float8_per_channel(self, dtype, eager_or_compiled):
input = torch.tensor([[1.0, 2.0], [3.0, 4.0]], dtype=TORCH_DTYPES[dtype])
scale = torch.tensor([0.2, 0.1], dtype=TORCH_DTYPES[dtype])
input_tp = tp.Tensor(input, dtype=dtype)
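Same rename on the quantize side. A mirror-image sketch of the per-tensor round trip (again, the `tp.quantize`/`tp.dequantize` signatures are assumed from how the tests use them):

```python
import torch
import tripy as tp  # import name assumed

# Quantizing [1.0, 2.0] with a per-tensor scale of 0.5 stores [2.0, 4.0] in float8;
# dequantizing multiplies the scale back in, recovering the original values exactly.
input_tp = tp.Tensor(torch.tensor([1.0, 2.0], dtype=torch.float16), dtype=tp.float16)
scale_tp = tp.Tensor(torch.tensor(0.5, dtype=torch.float16))

quantized = tp.quantize(input_tp, scale_tp, tp.float8)      # signature assumed
roundtrip = tp.dequantize(quantized, scale_tp, tp.float16)

expected = torch.tensor([1.0, 2.0], dtype=torch.float16)
assert torch.allclose(expected, torch.from_dlpack(roundtrip).to("cpu"))
```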