From 8b2a0e4acda2aaf76d39eb22c2c910de9e845d90 Mon Sep 17 00:00:00 2001
From: michelle-yooh
Date: Mon, 9 Dec 2024 18:04:51 +0000
Subject: [PATCH] Add mixtral 8x7b config for gpu

---
 MaxText/configs/models/gpu/mixtral_8x7b.yml | 49 +++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 MaxText/configs/models/gpu/mixtral_8x7b.yml

diff --git a/MaxText/configs/models/gpu/mixtral_8x7b.yml b/MaxText/configs/models/gpu/mixtral_8x7b.yml
new file mode 100644
index 000000000..a73deece3
--- /dev/null
+++ b/MaxText/configs/models/gpu/mixtral_8x7b.yml
@@ -0,0 +1,49 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# model config for mixtral-8x7b
+
+base_emb_dim: 4096
+base_num_query_heads: 32
+base_num_kv_heads: 8
+base_mlp_dim: 14336
+base_num_decoder_layers: 32
+head_dim: 128
+mlp_activations: ["silu","linear"]
+vocab_size: 32000
+enable_dropout: False
+logits_via_embedding: False
+normalization_layer_epsilon: 1.0e-5
+num_experts: 8
+num_experts_per_tok: 2
+rope_max_timescale: 1_000_000
+decoder_block: "mistral"
+ici_fsdp_parallelism: 1
+ici_tensor_parallelism: 1
+ici_expert_parallelism: -1
+dcn_data_parallelism: -1
+dcn_fsdp_parallelism: 1
+capacity_factor: 1.0
+hardware: "gpu"
+steps: 30
+
+per_device_batch_size: 8.0
+max_target_length: 4096
+attention: "cudnn_flash_te"
+remat_policy: "minimal"
+use_iota_embed: True
+dataset_type: "synthetic"
+reuse_example_batch: 1
+enable_checkpointing: False
+megablox: False
\ No newline at end of file
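
Usage note (a minimal sketch, not part of the patch): assuming this file can be passed to MaxText/train.py as the top-level config, the same way the other configs under MaxText/configs/models/gpu/ are exercised, a smoke-test launch would look roughly like the command below. The run_name and base_output_directory values are illustrative placeholders, not values taken from this change.

  # Hypothetical launch command; the config path is the file added above,
  # and the key=value pairs after it are ordinary MaxText overrides.
  python3 MaxText/train.py MaxText/configs/models/gpu/mixtral_8x7b.yml \
    run_name=mixtral_8x7b_gpu_smoke_test \
    base_output_directory=/tmp/maxtext_output

Because the config sets dataset_type "synthetic", enable_checkpointing False, and steps 30, such a run needs no input dataset or checkpoint bucket and stops after 30 training steps.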