[PyTorch] Non-reentrant mode for activation recompute (#670)
* added non-reentrant mode support to TE checkpoint

Signed-off-by: Alp Dener <[email protected]>

* renamed the get_cuda_rng_tracker kwarg to get_rng_state_tracker for consistency with the rest of the TE API

Signed-off-by: Alp Dener <[email protected]>

* docstring cleanup

Signed-off-by: Alp Dener <[email protected]>

* added mechanism to disable bias_gelu_nvfusion in LayerNormMLP when checkpointing in non-reentrant mode

Signed-off-by: Alp Dener <[email protected]>

* refactored checkpoint and recompute hook names to match PyTorch implementation

Signed-off-by: Alp Dener <[email protected]>

* fixed a reference-before-assignment error

Signed-off-by: Alp Dener <[email protected]>

* fixed argument error in calling native PyTorch checkpoint

Signed-off-by: Alp Dener <[email protected]>

* fixed linting errors for missing docstrings

Signed-off-by: Alp Dener <[email protected]>

* Fix lint

Signed-off-by: Kirthi Shankar Sivamani <[email protected]>

* made bias+GELU fusion settings consistent between the checkpoint test and the reference comparison

Signed-off-by: Alp Dener <[email protected]>

---------

Signed-off-by: Alp Dener <[email protected]>
Signed-off-by: Kirthi Shankar Sivamani <[email protected]>
Co-authored-by: Kirthi Shankar Sivamani <[email protected]>
denera and ksivaman authored Feb 24, 2024
1 parent 9b2fed5 commit 82bc797
Showing 4 changed files with 358 additions and 65 deletions.
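
For orientation, a minimal sketch of the new mode from the caller's side, mirroring the updated call sites in this diff. The module configuration and tensor shapes are illustrative, and the import alias follows the one used by the tests (assumed):

import torch
import transformer_engine.pytorch as te
from transformer_engine.pytorch.distributed import checkpoint as te_checkpoint

# Illustrative module and input; sizes are arbitrary.
block = te.TransformerLayer(
    hidden_size=1024, ffn_hidden_size=4096, num_attention_heads=16
).cuda()
hidden_states = torch.randn(128, 2, 1024, device="cuda")  # (seq, batch, hidden)

# use_reentrant=False selects the new non-reentrant recompute path;
# the checkpoint-specific options are now passed by keyword.
out = te_checkpoint(
    block,
    hidden_states,
    checkpoint_core_attention=False,
    distribute_saved_activations=False,
    tp_group=None,
    get_rng_state_tracker=None,
    use_reentrant=False,
)
out.sum().backward()

When a checkpointed stack contains LayerNormMLP, non-reentrant mode also disables the bias+GELU nvfusion internally, since the fused kernel does not recompute bitwise-identically; the test below pins NVTE_BIAS_GELU_NVFUSION=0 for the same reason.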
67 changes: 49 additions & 18 deletions tests/pytorch/test_numerics.py
@@ -4,6 +4,7 @@

 import math
 import os
+import sys
 from typing import List, Optional
 import pytest
 import copy
@@ -72,22 +73,27 @@ def get_causal_attn_mask(sq: int) -> torch.Tensor:
     return torch.triu(torch.ones(sq, sq, device="cuda"), diagonal=1).bool()


-def assert_all_equal(l1: List[torch.Tensor], l2: List[torch.Tensor]) -> bool:
+def assert_all_equal(l1: List[torch.Tensor], l2: List[torch.Tensor], names=None) -> bool:
     """Ensures two lists are equal."""
     assert len(l1) == len(l2), "Unequal number of outputs."
-    for t1, t2 in zip(l1, l2):
-        assert torch.equal(t1, t2), "Output mismatch."
+    failed = False
+    failed_tensors = ""
+    for i, (t1, t2) in enumerate(zip(l1, l2)):
+        if not torch.equal(t1, t2):
+            failed = True
+            failed_tensors += f" {names[i]}\n" if names is not None else f" tensor at idx={i}\n"
+    assert not failed, "Output mismatches in:\n" + failed_tensors


 def assert_allclose(l1: List[torch.Tensor], l2: List[torch.Tensor], atol: float) -> bool:
     """Ensures two lists are equal."""
     assert len(l1) == len(l2), "Unequal number of outputs."
-    for t1, t2 in zip(l1, l2):
+    for i, (t1, t2) in enumerate(zip(l1, l2)):
         result = torch.allclose(t1, t2, atol=atol)
         if not result:
             diff = torch.abs(t1 - t2).flatten()
             m = torch.argmax(diff)
-            msg = (f"Outputs not close enough."
+            msg = (f"Outputs not close enough in tensor at idx={i}. "
                    f"Location of the maximum difference: {m.item()} "
                    f"with {t1.flatten()[m].item()} vs {t2.flatten()[m].item()} "
                    f"(diff {diff[m].item()})."
@@ -457,7 +463,12 @@ def test_gpt_selective_activation_recompute(dtype, bs, model, fp8, fp8_model_par
     assert_all_equal(outputs, outputs_recompute)


-def _test_e2e_full_recompute(bs, dtype, config, fp8, fp8_model_params=False, recompute=False):
+def _test_e2e_full_recompute(
+    bs, dtype, config, fp8,
+    fp8_model_params=False,
+    recompute=False,
+    use_reentrant=True
+):
     reset_rng_states()
     FP8GlobalStateManager.reset()
@@ -494,21 +505,23 @@ def get_dummy_cuda_rng_tracker():
     )

     te_inp_hidden_states = torch.randn(
-        config.seq_len, bs, config.hidden_size, dtype=dtype, requires_grad=True
+        config.seq_len, bs, config.hidden_size, dtype=dtype, requires_grad=use_reentrant
     ).cuda()
-    te_inp_hidden_states.retain_grad()
+    if use_reentrant:
+        te_inp_hidden_states.retain_grad()
     te_inp_attn_mask = get_causal_attn_mask(config.seq_len)

     with fp8_autocast(enabled=fp8):
         if recompute:
             te_out = te_checkpoint(
                 block,
-                False,  # distribute_saved_activations
-                get_dummy_cuda_rng_tracker,
-                None,  # tp_group
                 te_inp_hidden_states,
                 attention_mask=te_inp_attn_mask,
                 checkpoint_core_attention=False,
+                distribute_saved_activations=False,
+                tp_group=None,
+                get_rng_state_tracker=get_dummy_cuda_rng_tracker,
+                use_reentrant=use_reentrant,
             )
         else:
             te_out = block(
@@ -520,27 +533,45 @@ def get_dummy_cuda_rng_tracker():
     loss.backward()
     torch.cuda.synchronize()

-    outputs = [te_out, te_inp_hidden_states.grad]
-    for p in block.parameters():
+    outputs = [te_out]
+    names = ["output"]
+    if use_reentrant:
+        outputs.append(te_inp_hidden_states.grad)
+        names.append("input")
+    for name, p in block.named_parameters():
         if p.requires_grad:
             outputs.append(p.grad)
-    return outputs
+            names.append(name)
+
+    return outputs, names


 @pytest.mark.parametrize("dtype", param_types)
 @pytest.mark.parametrize("bs", batch_sizes)
 @pytest.mark.parametrize("model", model_configs.keys())
 @pytest.mark.parametrize("fp8", all_boolean)
 @pytest.mark.parametrize("fp8_model_params", all_boolean)
-def test_gpt_full_activation_recompute(dtype, bs, model, fp8, fp8_model_params):
+@pytest.mark.parametrize("use_reentrant", all_boolean)
+def test_gpt_full_activation_recompute(dtype, bs, model, fp8, fp8_model_params, use_reentrant):
     if fp8 and not fp8_available:
         pytest.skip(reason_for_no_fp8)

     config = model_configs[model]

-    outputs = _test_e2e_full_recompute(bs, dtype, config, fp8, fp8_model_params, recompute=False)
-    outputs_recompute = _test_e2e_full_recompute(bs, dtype, config, fp8, fp8_model_params, recompute=True)
-    assert_all_equal(outputs, outputs_recompute)
+    if not use_reentrant:
+        # Non-reentrant checkpoint becomes non-deterministic with bias+GELU fusion
+        os.environ["NVTE_BIAS_GELU_NVFUSION"] = "0"
+
+    outputs, names = _test_e2e_full_recompute(bs, dtype, config, fp8, fp8_model_params,
+                                              recompute=False, use_reentrant=use_reentrant)
+    outputs_recompute, _ = _test_e2e_full_recompute(bs, dtype, config, fp8, fp8_model_params,
+                                                    recompute=True, use_reentrant=use_reentrant)
+
+    if not use_reentrant:
+        # Reset bias+GELU fusion flag to avoid contaminating other tests
+        del os.environ["NVTE_BIAS_GELU_NVFUSION"]
+
+    assert_all_equal(outputs, outputs_recompute, names=names)


 def _test_e2e_checkpointing_get_model(config, dtype):
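
The requires_grad=use_reentrant switch in the helper above mirrors a genuine behavioral difference between the two modes in PyTorch itself: the reentrant implementation only propagates gradients when at least one checkpoint input requires them, while the non-reentrant implementation also captures gradients flowing in through module parameters. A standalone sketch with plain torch.utils.checkpoint (not TE-specific):

import torch
from torch.utils.checkpoint import checkpoint

layer = torch.nn.Linear(8, 8)
x = torch.randn(4, 8)  # requires_grad=False

# Non-reentrant checkpointing records the graph through the module's
# parameters, so parameter gradients are produced even though no
# checkpoint input requires grad.
y = checkpoint(layer, x, use_reentrant=False)
y.sum().backward()
print(layer.weight.grad is not None)  # True

# With use_reentrant=True the same call warns that none of the inputs
# require grad, and no gradients flow back through the checkpoint.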
6 changes: 3 additions & 3 deletions transformer_engine/pytorch/attention.py
@@ -2472,9 +2472,9 @@ def custom_forward(*input_args, **input_kwargs):

         hidden_states = checkpoint(
             custom_forward,
-            False,
-            self.get_rng_state_tracker,
-            self.tp_group,
+            distribute_saved_activations=False,
+            get_rng_state_tracker=self.get_rng_state_tracker,
+            tp_group=self.tp_group,
             *forward_args,
             **forward_kwargs,
         )
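
Note on the change above: passing distribute_saved_activations, get_rng_state_tracker, and tp_group by keyword instead of positionally keeps the checkpoint-specific options clearly separated from custom_forward's own forward_args, and lets this call site adopt new options such as use_reentrant without depending on argument order.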
[2 more changed files not shown]
