From a6ba070c37bb35b87672ec39738903fda419df09 Mon Sep 17 00:00:00 2001
From: Dennis Liu
Date: Fri, 3 Jan 2025 16:48:29 -0800
Subject: [PATCH] ADLR/megatron-lm!2460 - Add NeMo MoE test.

Co-authored-by: Oliver Koenig
---
 .../model_config.yaml                  | 43 +++++++++++++++++++
 tests/test_utils/recipes/gpt-nemo.yaml |  6 ++-
 2 files changed, 47 insertions(+), 2 deletions(-)
 create mode 100644 tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp1_dgx_a100_1N8G/model_config.yaml

diff --git a/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp1_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp1_dgx_a100_1N8G/model_config.yaml
new file mode 100644
index 0000000000..20bacd5029
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt-nemo/gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp1_dgx_a100_1N8G/model_config.yaml
@@ -0,0 +1,43 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  SKIP_PYTEST: 1
+MODEL_ARGS:
+  trainer.num_nodes: 1
+  trainer.devices: 8
+  trainer.max_steps: 50
+  trainer.val_check_interval: 50
+  trainer.limit_val_batches: 50
+  trainer.max_epochs: 'null'
+  trainer.precision: bf16
+  model.num_layers: 12
+  model.hidden_size: 768
+  model.num_attention_heads: 12
+  model.micro_batch_size: 1
+  model.global_batch_size: 8
+  model.tensor_model_parallel_size: 2
+  model.pipeline_model_parallel_size: 1
+  model.expert_model_parallel_size: 2
+  model.virtual_pipeline_model_parallel_size: 'null'
+  model.encoder_seq_length: 2048
+  model.max_position_embeddings: 2048
+  model.ffn_hidden_size: 3072
+  model.mcore_gpt: 'True'
+  model.apply_query_key_layer_scaling: 'True'
+  model.megatron_amp_O2: 'True'
+  model.data.data_prefix: '[]'
+  model.data.data_impl: mock
+  model.data.splits_string: '[99990,8,2]'
+  model.optim.name: mcore_distributed_optim
+  model.optim.weight_decay: 0.1
+  exp_manager.create_checkpoint_callback: 'False'
+  model.sequence_parallel: 'True'
+  model.overlap_p2p_comm: 'True'
+  model.batch_p2p_comm: 'False'
+  model.bias: 'False'
+  model.bias_activation_fusion: 'False'
+  ++model.num_moe_experts: 8
+  ++model.moe_grouped_gemm: 'True'
+  ++model.moe_router_load_balancing_type: aux_loss
+  ++model.moe_router_topk: 2
+  ++model.moe_aux_loss_coeff: 1e-2
+TEST_TYPE: regular
diff --git a/tests/test_utils/recipes/gpt-nemo.yaml b/tests/test_utils/recipes/gpt-nemo.yaml
index b07d710e16..fa71a5e7b9 100644
--- a/tests/test_utils/recipes/gpt-nemo.yaml
+++ b/tests/test_utils/recipes/gpt-nemo.yaml
@@ -34,5 +34,7 @@ products:
     scope: [mr]
     n_repeat: [5]
     test_case:
-      - gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G
-      - gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G
+      - gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G
+      - gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G
+      - gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_8experts_tp2_ep2_pp1_dgx_a100_1N8G
+
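
Note for reviewers: the `++`-prefixed keys under MODEL_ARGS use Hydra's
force-add override syntax, which adds a key even when it is absent from the
base config (a plain `key=value` override would be rejected for unknown keys).
Below is a minimal sketch of how these overrides would look on a NeMo launch
command; the entry-point script and the exact forwarding done by the test
harness are assumptions for illustration, not part of this patch:

    # Sketch only: MODEL_ARGS rendered as Hydra CLI overrides. "++" force-adds
    # the MoE keys, which do not exist in the base GPT config.
    python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
        trainer.devices=8 \
        model.tensor_model_parallel_size=2 \
        model.expert_model_parallel_size=2 \
        ++model.num_moe_experts=8 \
        ++model.moe_grouped_gemm=True \
        ++model.moe_router_load_balancing_type=aux_loss \
        ++model.moe_router_topk=2 \
        ++model.moe_aux_loss_coeff=1e-2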