Update CONTRIBUTING.md (#47)
## Summary
Update CONTRIBUTING.md to cover corner cases.

<!---
## Details
This is an optional section; is there anything specific that reviewers
should be aware of?
--->

## Testing Done
<!--- This is a required section; please describe how this change was
tested. --->

On an RTX 3070:
```
❯ export PYTHONPATH="${PYTHONPATH}:/mnt/d/ML/fork/Liger-Kernel"
❯ make test
pytest --disable-warnings test/ --ignore=test/convergence
=================================================================================================== test session starts ====================================================================================================
platform linux -- Python 3.11.7, pytest-8.2.1, pluggy-1.5.0
rootdir: /mnt/d/ML/fork/Liger-Kernel
collected 114 items

test/transformers/test_cross_entropy.py .FF.FFF.FFF.FFFFFFFFFFFF.FF.FFF..FF.FFF.FFFFFFFFFFF.FFF.FF                                                                                                                   [ 50%]
test/transformers/test_fused_linear_cross_entropy.py ....FF                                                                                                                 [ 56%]
test/transformers/test_geglu.py F.......                                                                                                                                    [ 63%]
test/transformers/test_rms_norm.py ................                                                                                                                         [ 77%]
test/transformers/test_rope.py ............                                                                                                                                 [ 87%]
test/transformers/test_swiglu.py ........                                                                                                                                   [ 94%]
test/transformers/test_trainer_integration.py ...                                                                                                                           [ 97%]
test/transformers/test_transformers_monkey_patch.py .                                                                                                                       [ 98%]
test/triton/test_triton_monkey_patch.py ..                                                                                                                                  [100%]

==================================================================================== FAILURES =====================================================================================
______________________________________________________________ test_correctness[0.1-dtype0-1e-08-0.05-8-4096-32000] _______________________________________________________________

B = 8, T = 4096, V = 32000, scalar = 0.1, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 8, T = 4096, V = 32000, scalar = 0.1, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

        _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
        _input = _tensor.detach().clone().requires_grad_(True)
        _input2 = _tensor.detach().clone().requires_grad_(True)

        target = torch.randint(0, V, (B * T,), device="cuda", dtype=torch.long)

        output = torch_ce(_input, target)
        output2 = target_ce(_input2, target)
        assert torch.allclose(output, output2, atol=atol, rtol=rtol)

        output.backward()
        output2.backward()
>       assert torch.allclose(_input.grad, _input2.grad, atol=atol, rtol=rtol)
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.95 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 12.70 GiB is allocated by PyTorch, and 1001.75 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:26: OutOfMemoryError
______________________________________________________________ test_correctness[0.1-dtype0-1e-08-0.05-4-4096-128256] ______________________________________________________________

B = 4, T = 4096, V = 128256, scalar = 0.1, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 4, T = 4096, V = 128256, scalar = 0.1, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

>       _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.91 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 11.73 GiB is allocated by PyTorch, and 1.75 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:14: OutOfMemoryError
______________________________________________________________ test_correctness[1.0-dtype1-1e-08-0.05-4-4096-32000] _______________________________________________________________

B = 4, T = 4096, V = 32000, scalar = 1.0, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 4, T = 4096, V = 32000, scalar = 1.0, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

        _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
        _input = _tensor.detach().clone().requires_grad_(True)
        _input2 = _tensor.detach().clone().requires_grad_(True)

        target = torch.randint(0, V, (B * T,), device="cuda", dtype=torch.long)

        output = torch_ce(_input, target)
        output2 = target_ce(_input2, target)
        assert torch.allclose(output, output2, atol=atol, rtol=rtol)

        output.backward()
        output2.backward()
>       assert torch.allclose(_input.grad, _input2.grad, atol=atol, rtol=rtol)
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1000.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 13.18 GiB is allocated by PyTorch, and 509.62 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:26: OutOfMemoryError
______________________________________________________________ test_correctness[1.0-dtype1-1e-08-0.05-8-4096-32000] _______________________________________________________________

B = 8, T = 4096, V = 32000, scalar = 1.0, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 8, T = 4096, V = 32000, scalar = 1.0, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

>       _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.95 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 11.72 GiB is allocated by PyTorch, and 1009.62 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:14: OutOfMemoryError
______________________________________________________________ test_correctness[1.0-dtype1-1e-08-0.05-4-4096-128256] ______________________________________________________________

B = 4, T = 4096, V = 128256, scalar = 1.0, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 4, T = 4096, V = 128256, scalar = 1.0, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

>       _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.91 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 11.72 GiB is allocated by PyTorch, and 1009.62 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:14: OutOfMemoryError
______________________________________________________________ test_correctness[10.0-dtype2-1e-08-0.05-4-4096-32000] ______________________________________________________________

B = 4, T = 4096, V = 32000, scalar = 10.0, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 4, T = 4096, V = 32000, scalar = 10.0, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

        _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
        _input = _tensor.detach().clone().requires_grad_(True)
>       _input2 = _tensor.detach().clone().requires_grad_(True)
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1000.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 13.67 GiB is allocated by PyTorch, and 9.62 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:16: OutOfMemoryError
______________________________________________________________ test_correctness[10.0-dtype2-1e-08-0.05-8-4096-32000] ______________________________________________________________

B = 8, T = 4096, V = 32000, scalar = 10.0, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 8, T = 4096, V = 32000, scalar = 10.0, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

>       _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.95 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 13.67 GiB is allocated by PyTorch, and 9.62 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:14: OutOfMemoryError
_____________________________________________________________ test_correctness[10.0-dtype2-1e-08-0.05-4-4096-128256] ______________________________________________________________

B = 4, T = 4096, V = 128256, scalar = 10.0, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 4, T = 4096, V = 128256, scalar = 10.0, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

>       _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.91 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 11.73 GiB is allocated by PyTorch, and 1.75 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:14: OutOfMemoryError
______________________________________________________________ test_correctness[0.1-dtype3-1e-08-1e-06-4-4096-32000] ______________________________________________________________

B = 4, T = 4096, V = 32000, scalar = 0.1, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
test/transformers/test_cross_entropy.py:20: in _test_correctness_once
    output = torch_ce(_input, target)
/home/lancerts/miniconda3/envs/wsl-dev/lib/python3.11/site-packages/torch/nn/modules/module.py:1511: in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
/home/lancerts/miniconda3/envs/wsl-dev/lib/python3.11/site-packages/torch/nn/modules/module.py:1520: in _call_impl
    return forward_call(*args, **kwargs)
/home/lancerts/miniconda3/envs/wsl-dev/lib/python3.11/site-packages/torch/nn/modules/loss.py:1179: in forward
    return F.cross_entropy(input, target, weight=self.weight,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

input = tensor([[-0.0925, -0.0425, -0.2644,  ...,  0.0010,  0.1883, -0.0227],
        [ 0.0096, -0.0009, -0.0456,  ...,  0.062...98],
        [ 0.1096,  0.1418, -0.0479,  ..., -0.0130,  0.0280, -0.0784]],
       device='cuda:0', requires_grad=True)
target = tensor([11911,  6795, 20998,  ..., 13584, 24371,   414], device='cuda:0'), weight = None, size_average = None, ignore_index = -100, reduce = None, reduction = 'mean'
label_smoothing = 0.0

    def cross_entropy(
        input: Tensor,
        target: Tensor,
        weight: Optional[Tensor] = None,
        size_average: Optional[bool] = None,
        ignore_index: int = -100,
        reduce: Optional[bool] = None,
        reduction: str = "mean",
        label_smoothing: float = 0.0,
    ) -> Tensor:
        r"""Compute the cross entropy loss between input logits and target.

        See :class:`~torch.nn.CrossEntropyLoss` for details.

        Args:
            input (Tensor) : Predicted unnormalized logits;
                see Shape section below for supported shapes.
            target (Tensor) : Ground truth class indices or class probabilities;
                see Shape section below for supported shapes.
            weight (Tensor, optional): a manual rescaling weight given to each
                class. If given, has to be a Tensor of size `C`
            size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
                the losses are averaged over each loss element in the batch. Note that for
                some losses, there multiple elements per sample. If the field :attr:`size_average`
                is set to ``False``, the losses are instead summed for each minibatch. Ignored
                when reduce is ``False``. Default: ``True``
            ignore_index (int, optional): Specifies a target value that is ignored
                and does not contribute to the input gradient. When :attr:`size_average` is
                ``True``, the loss is averaged over non-ignored targets. Note that
                :attr:`ignore_index` is only applicable when the target contains class indices.
                Default: -100
            reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
                losses are averaged or summed over observations for each minibatch depending
                on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
                batch element instead and ignores :attr:`size_average`. Default: ``True``
            reduction (str, optional): Specifies the reduction to apply to the output:
                ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
                ``'mean'``: the sum of the output will be divided by the number of
                elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
                and :attr:`reduce` are in the process of being deprecated, and in the meantime,
                specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
            label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount
                of smoothing when computing the loss, where 0.0 means no smoothing. The targets
                become a mixture of the original ground truth and a uniform distribution as described in
                `Rethinking the Inception Architecture for Computer Vision <https://arxiv.org/abs/1512.00567>`__. Default: :math:`0.0`.

        Shape:
            - Input: Shape :math:`(C)`, :math:`(N, C)` or :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
              in the case of `K`-dimensional loss.
            - Target: If containing class indices, shape :math:`()`, :math:`(N)` or :math:`(N, d_1, d_2, ..., d_K)` with
              :math:`K \geq 1` in the case of K-dimensional loss where each value should be between :math:`[0, C)`.
              If containing class probabilities, same shape as the input and each value should be between :math:`[0, 1]`.

            where:

            .. math::
                \begin{aligned}
                    C ={} & \text{number of classes} \\
                    N ={} & \text{batch size} \\
                \end{aligned}

        Examples::

            >>> # Example of target with class indices
            >>> input = torch.randn(3, 5, requires_grad=True)
            >>> target = torch.randint(5, (3,), dtype=torch.int64)
            >>> loss = F.cross_entropy(input, target)
            >>> loss.backward()
            >>>
            >>> # Example of target with class probabilities
            >>> input = torch.randn(3, 5, requires_grad=True)
            >>> target = torch.randn(3, 5).softmax(dim=1)
            >>> loss = F.cross_entropy(input, target)
            >>> loss.backward()
        """
        if has_torch_function_variadic(input, target, weight):
            return handle_torch_function(
                cross_entropy,
                (input, target, weight),
                input,
                target,
                weight=weight,
                size_average=size_average,
                ignore_index=ignore_index,
                reduce=reduce,
                reduction=reduction,
                label_smoothing=label_smoothing,
            )
        if size_average is not None or reduce is not None:
            reduction = _Reduction.legacy_get_string(size_average, reduce)
>       return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.95 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 13.67 GiB is allocated by PyTorch, and 9.62 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

/home/lancerts/miniconda3/envs/wsl-dev/lib/python3.11/site-packages/torch/nn/functional.py:3059: OutOfMemoryError
______________________________________________________________ test_correctness[0.1-dtype3-1e-08-1e-06-8-4096-32000] ______________________________________________________________

B = 8, T = 4096, V = 32000, scalar = 0.1, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 8, T = 4096, V = 32000, scalar = 0.1, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

>       _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.91 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 13.67 GiB is allocated by PyTorch, and 9.62 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:14: OutOfMemoryError
_____________________________________________________________ test_correctness[0.1-dtype3-1e-08-1e-06-4-4096-128256] ______________________________________________________________

B = 4, T = 4096, V = 128256, scalar = 0.1, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 4, T = 4096, V = 128256, scalar = 0.1, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

>       _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 7.83 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 13.67 GiB is allocated by PyTorch, and 9.62 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:14: OutOfMemoryError
______________________________________________________________ test_correctness[0.1-dtype3-1e-08-1e-06-3-423-32000] _______________________________________________________________

B = 3, T = 423, V = 32000, scalar = 0.1, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 3, T = 423, V = 32000, scalar = 0.1, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

        _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
        _input = _tensor.detach().clone().requires_grad_(True)
        _input2 = _tensor.detach().clone().requires_grad_(True)

        target = torch.randint(0, V, (B * T,), device="cuda", dtype=torch.long)

        output = torch_ce(_input, target)
        output2 = target_ce(_input2, target)
        assert torch.allclose(output, output2, atol=atol, rtol=rtol)

        output.backward()
        output2.backward()
>       assert torch.allclose(_input.grad, _input2.grad, atol=atol, rtol=rtol)
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 156.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 14.47 GiB is allocated by PyTorch, and 132.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:26: OutOfMemoryError
______________________________________________________________ test_correctness[1.0-dtype4-1e-08-1e-06-4-4096-32000] ______________________________________________________________

B = 4, T = 4096, V = 32000, scalar = 1.0, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 4, T = 4096, V = 32000, scalar = 1.0, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

>       _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.95 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 14.28 GiB is allocated by PyTorch, and 13.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:14: OutOfMemoryError
______________________________________________________________ test_correctness[1.0-dtype4-1e-08-1e-06-8-4096-32000] ______________________________________________________________

B = 8, T = 4096, V = 32000, scalar = 1.0, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 8, T = 4096, V = 32000, scalar = 1.0, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

>       _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.91 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 14.28 GiB is allocated by PyTorch, and 13.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:14: OutOfMemoryError
_____________________________________________________________ test_correctness[1.0-dtype4-1e-08-1e-06-4-4096-128256] ______________________________________________________________

B = 4, T = 4096, V = 128256, scalar = 1.0, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 4, T = 4096, V = 128256, scalar = 1.0, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

>       _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 7.83 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 14.28 GiB is allocated by PyTorch, and 13.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:14: OutOfMemoryError
______________________________________________________________ test_correctness[1.0-dtype4-1e-08-1e-06-3-423-32000] _______________________________________________________________

B = 3, T = 423, V = 32000, scalar = 1.0, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 3, T = 423, V = 32000, scalar = 1.0, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

        _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
        _input = _tensor.detach().clone().requires_grad_(True)
>       _input2 = _tensor.detach().clone().requires_grad_(True)
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 156.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 14.58 GiB is allocated by PyTorch, and 16.17 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:16: OutOfMemoryError
_____________________________________________________________ test_correctness[10.0-dtype5-1e-08-1e-06-4-4096-32000] ______________________________________________________________

B = 4, T = 4096, V = 32000, scalar = 10.0, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 4, T = 4096, V = 32000, scalar = 10.0, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

>       _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.95 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 13.97 GiB is allocated by PyTorch, and 11.81 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:14: OutOfMemoryError
_____________________________________________________________ test_correctness[10.0-dtype5-1e-08-1e-06-8-4096-32000] ______________________________________________________________

B = 8, T = 4096, V = 32000, scalar = 10.0, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 8, T = 4096, V = 32000, scalar = 10.0, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

>       _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.91 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 13.97 GiB is allocated by PyTorch, and 11.81 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:14: OutOfMemoryError
_____________________________________________________________ test_correctness[10.0-dtype5-1e-08-1e-06-4-4096-128256] _____________________________________________________________

B = 4, T = 4096, V = 128256, scalar = 10.0, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

target_ce = LigerCrossEntropyLoss(), B = 4, T = 4096, V = 128256, scalar = 10.0, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    def _test_correctness_once(target_ce, B, T, V, scalar, dtype, atol, rtol):
        torch.manual_seed(0)
        torch_ce = CrossEntropyLoss()

>       _tensor = torch.randn(B * T, V, device="cuda", dtype=dtype) * scalar
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 7.83 GiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 13.97 GiB is allocated by PyTorch, and 11.81 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

test/transformers/test_cross_entropy.py:14: OutOfMemoryError
______________________________________________________________ test_correctness[10.0-dtype5-1e-08-1e-06-3-423-32000] ______________________________________________________________

B = 3, T = 423, V = 32000, scalar = 10.0, dtype = torch.float32, atol = 1e-08, rtol = 1e-06

    @pytest.mark.parametrize(
        "B, T, V",
        [
            (4, 4096, 32000),  # llama2, mistral
            (8, 4096, 32000),  # llama2, mistral
            (4, 4096, 128256),  # llama3
            # # weird shapes
            (3, 423, 32000),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness(B, T, V, scalar, dtype, atol, rtol):
        liger_ce = LigerCrossEntropyLoss()
>       _test_correctness_once(liger_ce, B, T, V, scalar, dtype, atol, rtol)

test/transformers/test_cross_entropy.py:112:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
test/transformers/test_cross_entropy.py:24: in _test_correctness_once
    output.backward()
/home/lancerts/miniconda3/envs/wsl-dev/lib/python3.11/site-packages/torch/_tensor.py:522: in backward
    torch.autograd.backward(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

tensors = (tensor(41.5440, device='cuda:0', grad_fn=<NllLossBackward0>),), grad_tensors = None, retain_graph = False, create_graph = False, grad_variables = None, inputs = ()

    def backward(
        tensors: _TensorOrTensors,
        grad_tensors: Optional[_TensorOrTensors] = None,
        retain_graph: Optional[bool] = None,
        create_graph: bool = False,
        grad_variables: Optional[_TensorOrTensors] = None,
        inputs: Optional[_TensorOrTensorsOrGradEdge] = None,
    ) -> None:
        r"""Computes the sum of gradients of given tensors with respect to graph
        leaves.

        The graph is differentiated using the chain rule. If any of ``tensors``
        are non-scalar (i.e. their data has more than one element) and require
        gradient, then the Jacobian-vector product would be computed, in this
        case the function additionally requires specifying ``grad_tensors``.
        It should be a sequence of matching length, that contains the "vector"
        in the Jacobian-vector product, usually the gradient of the differentiated
        function w.r.t. corresponding tensors (``None`` is an acceptable value for
        all tensors that don't need gradient tensors).

        This function accumulates gradients in the leaves - you might need to zero
        ``.grad`` attributes or set them to ``None`` before calling it.
        See :ref:`Default gradient layouts<default-grad-layouts>`
        for details on the memory layout of accumulated gradients.

        .. note::
            Using this method with ``create_graph=True`` will create a reference cycle
            between the parameter and its gradient which can cause a memory leak.
            We recommend using ``autograd.grad`` when creating the graph to avoid this.
            If you have to use this function, make sure to reset the ``.grad`` fields of your
            parameters to ``None`` after use to break the cycle and avoid the leak.

        .. note::

            If you run any forward ops, create ``grad_tensors``, and/or call ``backward``
            in a user-specified CUDA stream context, see
            :ref:`Stream semantics of backward passes<bwd-cuda-stream-semantics>`.

        .. note::

            When ``inputs`` are provided and a given input is not a leaf,
            the current implementation will call its grad_fn (even though it is not strictly needed to get this gradients).
            It is an implementation detail on which the user should not rely.
            See https://github.com/pytorch/pytorch/pull/60521#issuecomment-867061780 for more details.

        Args:
            tensors (Sequence[Tensor] or Tensor): Tensors of which the derivative will be
                computed.
            grad_tensors (Sequence[Tensor or None] or Tensor, optional): The "vector" in
                the Jacobian-vector product, usually gradients w.r.t. each element of
                corresponding tensors. None values can be specified for scalar Tensors or
                ones that don't require grad. If a None value would be acceptable for all
                grad_tensors, then this argument is optional.
            retain_graph (bool, optional): If ``False``, the graph used to compute the grad
                will be freed. Note that in nearly all cases setting this option to ``True``
                is not needed and often can be worked around in a much more efficient
                way. Defaults to the value of ``create_graph``.
            create_graph (bool, optional): If ``True``, graph of the derivative will
                be constructed, allowing to compute higher order derivative products.
                Defaults to ``False``.
            inputs (Sequence[Tensor] or Tensor or Sequence[GradientEdge], optional): Inputs w.r.t. which the gradient
                be will accumulated into ``.grad``. All other Tensors will be ignored. If
                not provided, the gradient is accumulated into all the leaf Tensors that
                were used to compute the attr::tensors.
        """
        if torch._C._are_functorch_transforms_active():
            raise RuntimeError(
                "backward() called inside a functorch transform. This is not "
                "supported, please use functorch.grad or functorch.vjp instead "
                "or call backward() outside of functorch transforms."
            )

        if grad_variables is not None:
            warnings.warn("'grad_variables' is deprecated. Use 'grad_tensors' instead.")
            if grad_tensors is None:
                grad_tensors = grad_variables
            else:
                raise RuntimeError(
                    "'grad_tensors' and 'grad_variables' (deprecated) "
                    "arguments both passed to backward(). Please only "
                    "use 'grad_tensors'."
                )
        if inputs is not None and len(inputs) == 0:
            raise RuntimeError("'inputs' argument to backward() cannot be empty.")

        tensors = (tensors,) if isinstance(tensors, torch.Tensor) else tuple(tensors)
        inputs = (
            (inputs,)
            if isinstance(inputs, (torch.Tensor, graph.GradientEdge))
            else tuple(inputs)
            if inputs is not None
            else tuple()
        )

        grad_tensors_ = _tensor_or_tensors_to_tuple(grad_tensors, len(tensors))
        grad_tensors_ = _make_grads(tensors, grad_tensors_, is_grads_batched=False)
        if retain_graph is None:
            retain_graph = create_graph

        # The reason we repeat the same comment below is that
        # some Python versions print out the first line of a multi-line function
        # calls in the traceback and some print out the last line
>       Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
            tensors,
            grad_tensors_,
            retain_graph,
            create_graph,
            inputs,
            allow_unreachable=True,
            accumulate_grad=True,
        )  # Calls into the C++ engine to run the backward pass
E       torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 156.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 14.58 GiB is allocated by PyTorch, and 16.17 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

/home/lancerts/miniconda3/envs/wsl-dev/lib/python3.11/site-packages/torch/autograd/__init__.py:266: OutOfMemoryError
____________________________________________________ test_correctness_with_ignore_index[0.1-dtype0-1e-08-0.05-8-4096-32000-2] _____________________________________________________

B = 8, T = 4096, V = 32000, ignore_index = 2, scalar = 0.1, dtype = torch.bfloat16, atol = 1e-08, rtol = 0.05

    @pytest.mark.parametrize(
        "B, T, V, ignore_index",
        [
            (4, 4096, 32000, -100),  # llama2, mistral
            (8, 4096, 32000, 2),  # llama2, mistral
            (4, 4096, 128256, -300),  # llama3
            # weird shapes
            (3, 423, 32000, -123),
        ],
    )
    @pytest.mark.parametrize(
        "scalar, dtype, atol, rtol",
        [
            (0.1, torch.bfloat16, 1e-8, 5e-2),
            (1.0, torch.bfloat16, 1e-8, 5e-2),
            (10.0, torch.bfloat16, 1e-8, 5e-2),
            (0.1, torch.float32, 1e-8, 1e-6),
            (1.0, torch.float32, 1e-8, 1e-6),
            (10.0, torch.float32, 1e-8, 1e-6),
        ],
    )
    def test_correctness_with_ignore_index(
     …
lancerts authored Aug 20, 2024
1 parent 0c8aba8 commit b09ce29
Showing 1 changed file with 9 additions and 0 deletions.
CONTRIBUTING.md
@@ -13,6 +13,10 @@ Thank you for your interest in contributing to Liger-Kernel! This guide will hel
```
pip install -e .[dev]
```
If you encounter the error `no matches found: .[dev]` (some shells, e.g. zsh, treat the brackets as a glob pattern), please use
```
pip install -e .'[dev]'
```

## Adding a New Kernel
To get familiar with the folder structure, please refer to https://github.com/linkedin/Liger-Kernel?tab=readme-ov-file#structure.
@@ -30,6 +34,11 @@ Create unit tests and convergence tests for your kernel in the tests directory.
2. run `make checkstyle` to ensure code style.
3. run `make test-convergence` to ensure convergence.

If you encounter the import error `ModuleNotFoundError: No module named 'test.utils'`, please add the `Liger-Kernel` folder to the path:
```
export PYTHONPATH="${PYTHONPATH}:your-path-here/Liger-Kernel"
```
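For example (an illustrative check of ours, assuming the tests import helpers via `test.utils` as the error message suggests), you can verify the fix from the repository root:
```
cd your-path-here/Liger-Kernel
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
python -c "import test.utils"  # should no longer raise ModuleNotFoundError
```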

### Run pytest on a single file
`pytest test_sample.py::test_function_name`
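To narrow a run further to a single parametrized case, pytest's `-k` filter can help; for example (file and filter substring chosen for illustration from the test output above):
```
pytest test/transformers/test_cross_entropy.py -k "423"
```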

