Add Mixtral 8x7B config for GPU #1090

Open: wants to merge 1 commit into base: main
49 changes: 49 additions & 0 deletions MaxText/configs/models/gpu/mixtral_8x7b.yml
@@ -0,0 +1,49 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# model config for mixtral-8x7b

base_emb_dim: 4096
Collaborator (review comment on `base_emb_dim: 4096`):

Do we need to repeat these configs, or could we directly use `model_name: "mixtral-8x7b"`?
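For reference, a minimal sketch of what the suggested alternative might look like, assuming MaxText's `model_name` lookup merges the shared per-model config so only GPU-specific overrides need to be restated (the exact merge behavior is an assumption here, not something this PR shows):

```yaml
# Hypothetical slimmer config: reuse the shared mixtral-8x7b model
# definition via model_name and keep only the GPU-specific settings.
# Assumes MaxText resolves model_name to the existing per-model YAML.
model_name: "mixtral-8x7b"
hardware: "gpu"
attention: "cudnn_flash_te"
```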

base_num_query_heads: 32
base_num_kv_heads: 8
base_mlp_dim: 14336
base_num_decoder_layers: 32
head_dim: 128
mlp_activations: ["silu","linear"]
vocab_size: 32000
enable_dropout: False
logits_via_embedding: False
normalization_layer_epsilon: 1.0e-5
num_experts: 8
num_experts_per_tok: 2
rope_max_timescale: 1_000_000
decoder_block: "mistral"
ici_fsdp_parallelism: 1
ici_tensor_parallelism: 1
ici_expert_parallelism: -1
dcn_data_parallelism: -1
dcn_fsdp_parallelism: 1
capacity_factor: 1.0
hardware: "gpu"
steps: 30

per_device_batch_size: 8.0
max_target_length: 4096
attention: "cudnn_flash_te"
remat_policy: "minimal"
use_iota_embed: True
dataset_type: "synthetic"
reuse_example_batch: 1
enable_checkpointing: False
megablox: False
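Below the diff, a minimal sketch that loads the new config and sanity-checks its mixture-of-experts fields. It assumes PyYAML is installed and the script is run from the repository root; neither is part of this PR:

```python
# Minimal sketch: load the config added in this PR and sanity-check
# the MoE fields against Mixtral 8x7B's known shape (8 experts, top-2 routing).
import yaml  # assumes PyYAML is available

with open("MaxText/configs/models/gpu/mixtral_8x7b.yml") as f:
    cfg = yaml.safe_load(f)

assert cfg["num_experts"] == 8
assert cfg["num_experts_per_tok"] == 2

# Rough per-layer MoE parameter count: 8 experts, each a gated MLP with
# three weight matrices of shape (emb x mlp) or (mlp x emb).
moe_params_per_layer = cfg["num_experts"] * 3 * cfg["base_emb_dim"] * cfg["base_mlp_dim"]
print(f"MoE params per layer: {moe_params_per_layer / 1e9:.2f}B")  # ~1.41B
```

Assuming MaxText's `train.py` accepts a config path as its first positional argument (as the README shows for `base.yml`), this file would presumably be launched with something like `python3 MaxText/train.py MaxText/configs/models/gpu/mixtral_8x7b.yml run_name=<your_run>`; the exact invocation for GPU model configs is an assumption here.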