From 8b2a0e4acda2aaf76d39eb22c2c910de9e845d90 Mon Sep 17 00:00:00 2001
From: michelle-yooh
Date: Mon, 9 Dec 2024 18:04:51 +0000
Subject: [PATCH] Add mixtral 8x7b config for gpu

---
 MaxText/configs/models/gpu/mixtral_8x7b.yml | 49 +++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 MaxText/configs/models/gpu/mixtral_8x7b.yml

diff --git a/MaxText/configs/models/gpu/mixtral_8x7b.yml b/MaxText/configs/models/gpu/mixtral_8x7b.yml
new file mode 100644
index 000000000..a73deece3
--- /dev/null
+++ b/MaxText/configs/models/gpu/mixtral_8x7b.yml
@@ -0,0 +1,49 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# model config for mixtral-8x7b
+
+base_emb_dim: 4096
+base_num_query_heads: 32
+base_num_kv_heads: 8
+base_mlp_dim: 14336
+base_num_decoder_layers: 32
+head_dim: 128
+mlp_activations: ["silu","linear"]
+vocab_size: 32000
+enable_dropout: False
+logits_via_embedding: False
+normalization_layer_epsilon: 1.0e-5
+num_experts: 8
+num_experts_per_tok: 2
+rope_max_timescale: 1_000_000
+decoder_block: "mistral"
+ici_fsdp_parallelism: 1
+ici_tensor_parallelism: 1
+ici_expert_parallelism: -1
+dcn_data_parallelism: -1
+dcn_fsdp_parallelism: 1
+capacity_factor: 1.0
+hardware: "gpu"
+steps: 30
+
+per_device_batch_size: 8.0
+max_target_length: 4096
+attention: "cudnn_flash_te"
+remat_policy: "minimal"
+use_iota_embed: True
+dataset_type: "synthetic"
+reuse_example_batch: 1
+enable_checkpointing: False
+megablox: False
\ No newline at end of file
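
Usage note (a minimal sketch, not part of the patch): assuming this file can be passed to MaxText/train.py as the top-level config, the same way the other configs under MaxText/configs/models/gpu/ are exercised, a smoke-test launch would look roughly like the command below. The run_name and base_output_directory values are illustrative placeholders, not values taken from this change.

  # Hypothetical launch command; the config path is the file added above,
  # and the key=value pairs after it are ordinary MaxText overrides.
  python3 MaxText/train.py MaxText/configs/models/gpu/mixtral_8x7b.yml \
    run_name=mixtral_8x7b_gpu_smoke_test \
    base_output_directory=/tmp/maxtext_output

Because the config sets dataset_type "synthetic", enable_checkpointing False, and steps 30, such a run needs no input dataset or checkpoint bucket and stops after 30 training steps.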