[bug] FP8+PP+Recompute+GA>1, loss = nan #539

Closed · jingjie01ai opened this issue Nov 27, 2023 · 6 comments
Labels: bug
jingjie01ai commented Nov 27, 2023

Describe the bug
FP8 + PP + Recompute + GA>1: loss = nan
FP8 + PP + GA>1: loss is normal
FP8 + PP + Recompute + GA=1: loss is normal
FP8 + TP + Recompute + GA>1: loss is normal
(PP = pipeline parallelism, TP = tensor parallelism, GA = gradient accumulation steps, Recompute = activation recomputation.)

jingjie01ai changed the title from "when training with fp8, pipeline parallel and recompute enabled in megatron (GA > 1), the loss is nan" to "[bug] FP8+PP+Recompute+GA>1, loss = nan" on Nov 28, 2023

jingjie01ai (Author) commented:
Proposed hotfix: hotfix_pp_recompute_nan.diff.txt

PP (1F1B or interleaved) runs a few warm-up forward passes.
During the first micro-batch warm-up forward (is_first_microbatch=True), the layer executes under a no-grad context (recompute phase 1), and the linear module's self.weight_fp8 is not updated because cast_to_fp8 creates and returns a new tensor.
During the second micro-batch warm-up forward (is_first_microbatch=False), the linear layer uses self.weight_fp8 for the GEMM and produces NaN, because self.weight_fp8 is still an empty placeholder tensor.
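
A minimal, self-contained sketch of the caching pattern described above, editor-added for illustration only. The class and the clone() call standing in for cast_to_fp8 are hypothetical stand-ins, not Transformer Engine's actual implementation; the point is that the first micro-batch binds the cast result to a local name while the module-level cache stays empty.

import torch

class ToyCachedLinear(torch.nn.Module):
    """Toy stand-in for the caching pattern described above (not TE's real code)."""

    def __init__(self, n):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.randn(n, n))
        # Cached "FP8" weight placeholder, meant to be filled on the first micro-batch.
        # Zeros here for determinism; in practice it would be uninitialized storage.
        self.weight_fp8 = torch.zeros(n, n)

    def forward(self, x, is_first_microbatch):
        if is_first_microbatch:
            # Bug pattern: the cast result only binds a local name, so the
            # module-level cache self.weight_fp8 is never filled.
            weight_fp8 = self.weight.detach().clone()  # stand-in for cast_to_fp8(...)
        else:
            # Later micro-batches reuse the cache, which is still the empty placeholder.
            weight_fp8 = self.weight_fp8
        return x @ weight_fp8.t()

layer = ToyCachedLinear(4)
x = torch.randn(2, 4)
with torch.no_grad():  # mirrors the no-grad warm-up / recompute phase-1 context
    out1 = layer(x, is_first_microbatch=True)   # correct: uses the freshly casted local
    out2 = layer(x, is_first_microbatch=False)  # wrong: all zeros from the never-filled cache
print(out1)
print(out2)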

ptrendx commented Dec 1, 2023

@jingjie01ai Thank you for the bug report and the proposed fix. @ksivaman could you take a look?

codecaution commented:

I have also run into this problem when using bfloat16.

jinzex commented Jan 22, 2024

@jingjie01ai @codecaution Thank you for reporting the problem. Could you please share a minimal, reproducible example so we can investigate further?

jinzex commented Jan 30, 2024

Confirming that we could reproduce the issue with the script below, with changes from 493e9ef to print out the data_ptr.

repro.py

import torch
from transformer_engine.pytorch import Linear
from transformer_engine.common import recipe
from transformer_engine.pytorch.fp8 import fp8_autocast
from transformer_engine.pytorch.utils import scaled_init_method_normal
from transformer_engine.pytorch.distributed import activation_recompute_forward

num_layers = 2
seq_len = 32
batch_size = 2
hidden_size = 64
num_attention_heads = 2

torch.manual_seed(0)
dtype = torch.float32
sigma = 0.023
output_layer_init_method = scaled_init_method_normal(sigma, num_layers)

block = (
    Linear(
        hidden_size, hidden_size, init_method=output_layer_init_method
    )
    .to(dtype=dtype)
    .cuda()
)

te_inp = torch.randn(
    seq_len, batch_size, hidden_size, dtype=dtype, requires_grad=True
).cuda()

use_fp8 = True
fp8_recipe = recipe.DelayedScaling(0, 1, recipe.Format.HYBRID)

# Two back-to-back forward passes under no_grad with activation recompute
# enabled, mirroring the PP warm-up: the first call (is_first_microbatch=True)
# is expected to cache the FP8-cast weight, the second call reuses that cache.
with fp8_autocast(enabled=use_fp8, fp8_recipe=fp8_recipe):
    with torch.no_grad():
        with activation_recompute_forward(activation_recompute=True, recompute_phase=False):
            te_out1 = block(te_inp, is_first_microbatch=True)
            te_out2 = block(te_inp, is_first_microbatch=False)
            print("te_out1", te_out1)
            print("te_out2", te_out2)

output

before data_ptr weight_fp8_bak: 22573943414272, weight_fp8: 22573943414272
after  data_ptr weight_fp8_bak: 22573943414272, weight_fp8: 22573943441920
te_out1 tensor([[[ 0.1578,  0.0800,  0.0902,  ...,  0.0439,  0.0244,  0.0424],
         [-0.0949, -0.0265, -0.0230,  ..., -0.0503,  0.1298,  0.0212]],
        ...,
        [[ 0.0183, -0.0587,  0.1722,  ...,  0.0945, -0.0697, -0.0029],
         [-0.0288, -0.0141, -0.0333,  ..., -0.0888,  0.0073,  0.0359]]],
       device='cuda:0')
te_out2 tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],
        ...,
        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]], device='cuda:0')

te_out2 is zero because cast_to_fp8 returns a new tensor that gets bound to the local variable weight_fp8; the original weight_fp8 tensor passed in as an argument is never updated with the casted output and remains empty. This causes a problem later, when recompute uses this empty weight_fp8 tensor in its calculations.

weight_fp8._data = cast_to_fp8(
    weight,
    fp8_meta["scaling_fwd"],
    tex.FP8FwdTensors.GEMM1_WEIGHT,
    fp8_dtype_forward,
)
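
A framework-free, editor-added illustration of the rebinding pitfall described above; the helper names (fake_cast_to_fp8, forward_rebind, forward_mutate) are hypothetical. Whatever form the actual fix takes, the principle is the same: mutate the shared tensor object rather than rebind the local name.

import torch

def fake_cast_to_fp8(weight):
    # Hypothetical stand-in for cast_to_fp8: always returns a brand-new tensor.
    return weight.clone()

weight = torch.randn(4)
cached_weight_fp8 = torch.zeros(4)  # placeholder cached outside the function

def forward_rebind(weight_fp8):
    # Pitfall: rebinding the local name leaves the caller's cached tensor untouched.
    weight_fp8 = fake_cast_to_fp8(weight)

def forward_mutate(weight_fp8):
    # Mutating the object that was passed in updates the caller's cache.
    weight_fp8.copy_(fake_cast_to_fp8(weight))

forward_rebind(cached_weight_fp8)
print(cached_weight_fp8)  # still zeros

forward_mutate(cached_weight_fp8)
print(cached_weight_fp8)  # now holds the casted values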

ksivaman (Member) commented:

Confirming that #646 fixes this issue with the following output from the repro script:

before data_ptr weight_fp8_bak: 140304877411840, weight_fp8: 140304877411840
after  data_ptr weight_fp8_bak: 140304877411840, weight_fp8: 140304877411840
te_out1 tensor([[[ 0.1578,  0.0800,  0.0902,  ...,  0.0439,  0.0244,  0.0424],
         [-0.0949, -0.0265, -0.0230,  ..., -0.0503,  0.1298,  0.0212]],

        [[-0.0033,  0.0299, -0.0311,  ..., -0.0723, -0.0873, -0.0293],
         [-0.0157, -0.1337, -0.0591,  ..., -0.0443, -0.1072, -0.0702]],

        [[ 0.1131, -0.0860, -0.1050,  ..., -0.0586, -0.0582, -0.0546],
         [ 0.0446, -0.0304,  0.0873,  ..., -0.0381,  0.0410, -0.0685]],

        ...,

        [[ 0.0462, -0.0806,  0.0091,  ...,  0.0105, -0.0785,  0.1738],
         [ 0.0816, -0.0390, -0.1187,  ...,  0.0513, -0.1007, -0.0127]],

        [[-0.0562,  0.1853, -0.0528,  ..., -0.0971, -0.0886, -0.0164],
         [ 0.1194, -0.0728,  0.0328,  ..., -0.1134, -0.0909, -0.1145]],

        [[ 0.0183, -0.0587,  0.1722,  ...,  0.0945, -0.0697, -0.0029],
         [-0.0288, -0.0141, -0.0333,  ..., -0.0888,  0.0073,  0.0359]]],
       device='cuda:0')
te_out2 tensor([[[ 0.1585,  0.0787,  0.0928,  ...,  0.0437,  0.0221,  0.0424],
         [-0.0944, -0.0302, -0.0220,  ..., -0.0452,  0.1324,  0.0208]],

        [[-0.0044,  0.0275, -0.0294,  ..., -0.0762, -0.0922, -0.0330],
         [-0.0151, -0.1364, -0.0602,  ..., -0.0442, -0.1054, -0.0684]],

        [[ 0.1180, -0.0868, -0.1061,  ..., -0.0548, -0.0579, -0.0546],
         [ 0.0389, -0.0377,  0.0840,  ..., -0.0370,  0.0360, -0.0705]],

        ...,

        [[ 0.0404, -0.0819,  0.0097,  ...,  0.0114, -0.0800,  0.1729],
         [ 0.0826, -0.0326, -0.1153,  ...,  0.0528, -0.1023, -0.0123]],

        [[-0.0574,  0.1777, -0.0486,  ..., -0.0939, -0.0868, -0.0146],
         [ 0.1259, -0.0797,  0.0349,  ..., -0.1093, -0.0883, -0.1099]],

        [[ 0.0140, -0.0621,  0.1714,  ...,  0.0969, -0.0645, -0.0081],
         [-0.0265, -0.0140, -0.0352,  ..., -0.0846,  0.0017,  0.0350]]],
       device='cuda:0')
