diff --git a/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml
new file mode 100644
index 0000000000..c5dbbb35ea
--- /dev/null
+++ b/tests/functional_tests/test_cases/t5/t5_release/model_config.yaml
@@ -0,0 +1,67 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: '1'
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: '1'
+  NVTE_FLASH_ATTN: '0'
+  NVTE_FUSED_ATTN: '0'
+
+TEST_TYPE: 'release'
+
+MODEL_ARGS:
+  # T5 model args
+  --encoder-num-layers: 12
+  --decoder-num-layers: 12
+  --hidden-size: 768
+  --num-attention-heads: 12
+  --kv-channels: 64
+  --ffn-hidden-size: 3072
+  --encoder-seq-length: 512
+  --decoder-seq-length: 128
+  --max-position-embeddings: 512
+  --init-method-std: 0.015
+
+  # Training args
+  --micro-batch-size: 32
+  --global-batch-size: 512
+  --train-iters: 100000
+  --weight-decay: 1e-2
+  --clip-grad: 1.0
+  --bf16: true
+  --lr: 0.0001
+  --lr-decay-style: linear
+  --min-lr: 1.0e-5
+  --lr-warmup-fraction: .01
+  --distributed-backend: nccl
+
+  # Transformer Engine args
+  --use-mcore-models: true
+  --transformer-impl: transformer_engine
+
+  # Model parallel
+  --tensor-model-parallel-size: 4
+  --pipeline-model-parallel-size: 1
+  --encoder-pipeline-model-parallel-size: 0
+
+  # Data args
+  --data-path: ${DATA_BLEND}
+  --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt
+  --tokenizer-type: BertWordPieceCase
+  --split: 99982,9,9
+  --data-cache-path: ${OUTPUT_PATH}/cache
+  --vocab-extra-ids: 100
+
+  # EVAL_AND_LOGGING_ARGS
+  --log-interval: 100
+  --save-interval: 2000
+  --eval-interval: 1000
+  --save: ${CHECKPOINT_PATH}
+  --load: ${CHECKPOINT_PATH}
+  --eval-iters: 10
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --log-timers-to-tensorboard: true
+  --log-memory-to-tensorboard: true
+  --log-num-zeros-in-grad: true
+  --log-params-norm: true
+  --log-validation-ppl-to-tensorboard: true
+  --timing-log-level: 2
+  --wandb-project: megatron-core-release-runs
+  --wandb-exp-name: ${WANDB_EXPERIMENT}
\ No newline at end of file