ADLR/megatron-lm!2088 - Add MoE interface tests and move other tests to internal
yanring authored and ko3n1g committed Sep 11, 2024
1 parent fe1640a commit 6e4e9df
Showing 5 changed files with 91 additions and 0 deletions.
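
Note: the new @pytest.mark.internal markers in this commit only take effect if the marker is registered and if test selection filters on it. The sketch below is an assumption about how that is typically wired up with pytest; it is not code from this repository or this commit.

# conftest.py -- hypothetical sketch, not part of this commit
def pytest_configure(config):
    # Register the custom marker so pytest does not emit an "unknown mark" warning.
    config.addinivalue_line("markers", "internal: tests intended for internal CI only")

# A public CI job could then deselect the internal-only tests, for example:
#   pytest tests/unit_tests/transformer/moe -m "not internal"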
2 changes: 2 additions & 0 deletions tests/unit_tests/transformer/moe/test_aux_loss.py
@@ -57,6 +57,7 @@ def teardown_method(self, method):
Utils.destroy_model_parallel()

@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
@pytest.mark.parametrize(
"tp_size,ep_size,cp_size", [(8, 1, 1), (4, 2, 1), (1, 1, 8), (2, 1, 4), (2, 2, 2)]
)
@@ -75,6 +76,7 @@ def test_allgather_dispatcher(self, tp_size, ep_size, cp_size):
container.aux_loss_test(self.input, self.baseline_grad)

@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
@pytest.mark.parametrize(
"tp_size,ep_size,cp_size", [(8, 1, 1), (4, 2, 1), (1, 1, 8), (2, 1, 4), (2, 2, 2)]
)
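Note: every (tp_size, ep_size, cp_size) combination parametrized above multiplies to the same world size of 8, which suggests these cases all map onto one 8-GPU test job. A one-line sanity check of that invariant:

# All parametrized parallel layouts cover the same 8-GPU world size.
for tp, ep, cp in [(8, 1, 1), (4, 2, 1), (1, 1, 8), (2, 1, 4), (2, 2, 2)]:
    assert tp * ep * cp == 8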
8 changes: 8 additions & 0 deletions tests/unit_tests/transformer/moe/test_grouped_mlp.py
@@ -92,6 +92,7 @@ def setup_method(self, method, use_cpu_initialization=False, swiglu=True):
def teardown_method(self, method):
Utils.destroy_model_parallel()

@pytest.mark.internal
def test_constructor(self):
assert isinstance(self.sequential_mlp, MoELayer)
assert isinstance(self.grouped_mlp, MoELayer)
@@ -130,6 +131,7 @@ def test_constructor(self):
self.grouped_mlp.experts.weight1.shape == self.grouped_mlp.experts.weight2.t().shape
)

@pytest.mark.internal
def test_weight_init_value_the_same(self):
gmm_w1 = self.grouped_mlp.experts.weight1.view(self.num_experts, -1, self.hidden_size)
gmm_w2 = self.grouped_mlp.experts.weight2.view(self.num_experts, self.hidden_size, -1)
@@ -153,6 +155,7 @@ def test_weight_init_value_the_same(self):
assert torch.equal(gmm_expert2_fc2, smm_expert2_fc2)

@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
@pytest.mark.skipif(
not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8,
reason='GroupedGEMM kernels are not supported on this device.',
@@ -175,6 +178,7 @@ def test_gpu_forward(self):
# assert torch.equal(output_smm, output_gmm)

@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
@pytest.mark.skipif(
not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8,
reason='GroupedGEMM kernels are not supported on this device.',
@@ -193,6 +197,7 @@ def test_gpu_forward_with_no_tokens_allocated(self):
assert str(e) == "Input batch_sizes should not be all zeros!"

@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
@pytest.mark.skipif(
not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8,
reason='GroupedGEMM kernels are not supported on this device.',
@@ -274,6 +279,7 @@ def setup_method(self, method, use_cpu_initialization=False, swiglu=True):
def teardown_method(self, method):
Utils.destroy_model_parallel()

@pytest.mark.internal
def test_constructor(self):
assert isinstance(self.sequential_mlp, MoELayer)
assert isinstance(self.grouped_mlp, MoELayer)
@@ -308,6 +314,7 @@ def test_constructor(self):
)

@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
def test_gpu_forward_backward(self):
self.sequential_mlp.cuda()
self.grouped_mlp.cuda()
@@ -350,6 +357,7 @@ def test_gpu_forward_backward(self):
torch.testing.assert_close(smm_result, gmm_result)

@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
def test_gpu_forward_backward_with_no_tokens_allocated(self):
"""Test the case when no token is allocated for groupedGEMM kernels."""
self.grouped_mlp.cuda()
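Note: the DEVICE_CAPABILITY guards above skip the GroupedGEMM tests on GPUs with compute capability below 8.0 (pre-Ampere). A minimal sketch of how such a constant is commonly defined; the exact definition used by this repository may differ:

import torch

# Hypothetical sketch of the DEVICE_CAPABILITY constant referenced by the skipif guards.
DEVICE_CAPABILITY = (
    torch.cuda.get_device_capability() if torch.cuda.is_available() else None
)
# The guards then skip when DEVICE_CAPABILITY is None or DEVICE_CAPABILITY[0] < 8.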
73 changes: 73 additions & 0 deletions tests/unit_tests/transformer/moe/test_moe_layer.py
@@ -0,0 +1,73 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

import pytest
import torch

from megatron.core.models.gpt.gpt_layer_specs import (
get_gpt_layer_local_spec,
get_gpt_layer_with_transformer_engine_spec,
)
from megatron.core.transformer.moe.moe_layer import MoELayer
from megatron.core.transformer.moe.router import Router
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.training.initialize import _set_random_seed
from tests.unit_tests.test_utilities import Utils


class TestMoELayerInit:
def setup_method(self, method):
pass

@pytest.mark.parametrize("moe_token_dispatcher_type", ["allgather", "alltoall"])
@pytest.mark.parametrize("num_moe_experts", [1, 2])
@pytest.mark.parametrize("grouped_gemm", [True, False])
def test_te_moe_layer(self, num_moe_experts, moe_token_dispatcher_type, grouped_gemm):
Utils.initialize_model_parallel(1, 1)
_set_random_seed(seed_=123, data_parallel_random_init=False)
self.transformer_config = TransformerConfig(
num_layers=1,
hidden_size=12,
num_attention_heads=4,
num_moe_experts=num_moe_experts,
use_cpu_initialization=True,
moe_token_dispatcher_type=moe_token_dispatcher_type,
moe_router_topk=2,
moe_aux_loss_coeff=0.01,
moe_grouped_gemm=grouped_gemm,
add_bias_linear=False,
)
transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
num_experts=num_moe_experts, moe_grouped_gemm=grouped_gemm
)
moe_layer = MoELayer(
self.transformer_config, transformer_layer_spec.submodules.mlp.submodules
)
Utils.destroy_model_parallel()

@pytest.mark.parametrize("moe_token_dispatcher_type", ["allgather", "alltoall"])
@pytest.mark.parametrize("num_moe_experts", [1, 2])
def test_legacy_moe_layer(self, num_moe_experts, moe_token_dispatcher_type):
Utils.initialize_model_parallel(1, 1)
_set_random_seed(seed_=123, data_parallel_random_init=False)
num_moe_experts = 4
self.transformer_config = TransformerConfig(
num_layers=1,
hidden_size=12,
num_attention_heads=4,
num_moe_experts=num_moe_experts,
use_cpu_initialization=True,
moe_router_load_balancing_type="aux_loss",
moe_router_topk=2,
moe_aux_loss_coeff=0.01,
add_bias_linear=False,
)
transformer_layer_spec = get_gpt_layer_local_spec(
num_experts=num_moe_experts, moe_grouped_gemm=False
)
moe_layer = MoELayer(
self.transformer_config, transformer_layer_spec.submodules.mlp.submodules
)
Utils.destroy_model_parallel()

def teardown_method(self, method):
Utils.destroy_model_parallel()
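
Note: the new test_moe_layer.py above only exercises MoELayer construction (TE spec and legacy local spec) across dispatcher types, expert counts, and grouped-GEMM settings; it does not run a forward pass. Assuming the usual setup for these distributed unit tests, the file could be run in isolation along these lines (command is illustrative, not taken from this repository's CI):

# Illustrative command, assuming a single-GPU environment and a standard pytest setup:
#   torchrun --nproc_per_node=1 -m pytest tests/unit_tests/transformer/moe/test_moe_layer.py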
2 changes: 2 additions & 0 deletions tests/unit_tests/transformer/moe/test_routers.py
@@ -45,6 +45,7 @@ def test_constructor(self):
assert num_weights == 12 * 4, num_weights

@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
@pytest.mark.parametrize("moe_router_pre_softmax", [(True), (False)])
def test_router_forward(self, moe_router_pre_softmax):
with torch.no_grad():
@@ -56,6 +57,7 @@ def test_router_forward(self, moe_router_pre_softmax):
scores, indices = self.router(hidden_states)

@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.internal
def test_aux_loss(self):
self.sequential_mlp = self.sequential_mlp.cuda()

6 changes: 6 additions & 0 deletions tests/unit_tests/transformer/moe/test_sequential_mlp.py
@@ -50,12 +50,14 @@ def setup_method(self, method):
def teardown_method(self, method):
Utils.destroy_model_parallel()

@pytest.mark.internal
def test_constructor(self):
assert isinstance(self.sequential_mlp, MoELayer)

num_weights = sum([p.numel() for p in self.sequential_mlp.parameters()])
assert num_weights == 3696

@pytest.mark.internal
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_gpu_forward(self):
sequential_mlp = self.sequential_mlp
@@ -118,6 +120,7 @@ def setup_method(self, method):
te_version < packaging.version.Version("1.7.0"),
reason="Transformer Engine under v1.7.0 doesn't support MoE training.",
)
@pytest.mark.internal
def test_constructor(self):
for i in range(self.num_local_experts):
assert torch.equal(
@@ -133,6 +136,7 @@ def test_constructor(self):
te_version < packaging.version.Version("1.7.0"),
reason="Transformer Engine under v1.7.0 doesn't support MoE training.",
)
@pytest.mark.internal
def test_gpu_forward(self):
self.local_sequential_mlp.cuda()
self.te_sequential_mlp.cuda()
@@ -154,6 +158,7 @@ def test_gpu_forward(self):
te_version < packaging.version.Version("1.7.0"),
reason="Transformer Engine under v1.7.0 doesn't support MoE training.",
)
@pytest.mark.internal
def test_gpu_forward_with_one_local_expert(self):
model_parallel_cuda_manual_seed(123)
local_sequential_mlp = SequentialMLP(1, self.transformer_config, self.local_mlp_spec)
@@ -177,6 +182,7 @@ def test_gpu_forward_with_one_local_expert(self):
te_version < packaging.version.Version("1.7.0"),
reason="Transformer Engine under v1.7.0 doesn't support MoE training.",
)
@pytest.mark.internal
def test_gpu_forward_with_no_tokens_allocated(self):
self.local_sequential_mlp.cuda()
self.te_sequential_mlp.cuda()
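Note: the te_version guards above gate the Transformer Engine SequentialMLP tests on TE >= 1.7.0. A hedged sketch of how te_version is commonly derived in test code like this; the exact import used by the file may differ, and the installed distribution name is an assumption:

from importlib.metadata import version

import packaging.version

# Hypothetical sketch of the te_version value referenced by the skipif guards; the
# Transformer Engine distribution name is assumed to be "transformer-engine".
te_version = packaging.version.Version(version("transformer-engine"))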
