numerics: add some tests

togethercomputer · Jan 9, 2024 · 2095a87 · 2095a87
1 parent d19d3f4
commit 2095a87
Showing 1 changed file with 52 additions and 0 deletions.
diff --git a/configs/sh-stem-test.yml b/configs/sh-stem-test.yml
@@ -0,0 +1,52 @@
+model_name: sh-7b-32k-v1
+vocab_size: 32000
+hidden_size: 4096
+# Number of long convolution filters in each hyena block. Can be smaller than `hidden_size`
+num_filters: 4096
+attn_layer_idxs: []
+hyena_layer_idxs: [0, 1, 2, 3]
+num_layers: 4
+# Length of the short, depthwise FIR applied to input projections
+short_filter_length: 3 
+num_attention_heads: 32
+short_filter_bias: true # add bias to FIR
+mlp_init_method: torch.nn.init.normal_
+mlp_output_init_method: torch.nn.init.normal_
+eps: 0.00001
+# Number of states in the recurrence (per channel)
+state_size: 2
+rotary_emb_base: 500000 
+make_vocab_size_divisible_by: 8
+log_intermediate_values: False
+# Number of groups in GQA
+proj_groups: 4
+# Number of groups in grouped hyena
+hyena_filter_groups: 1
+# Split strategy for channels
+column_split_hyena: True
+column_split: False
+# Legacy options for MP / PP inference
+model_parallel_size: 1
+pipe_parallel_size: 1
+tie_embeddings: False
+inner_mlp_size: 14336
+mha_out_proj_bias: False
+qkv_proj_bias: False
+max_seqlen: 32768
+max_batch_size: 1
+final_norm: True 
+# Custom kernels for StripedHyena:
+# 1) FlashAttention-v2: required
+# 2) FusedRMSNorm: required
+# 3) FlashDepthwise (FIR): compatible
+# 4) FlashFFTConv (prefill_style=fft and parallel inference): compatible
+# 5) LaughingHyena (decoding): unreleased
+use_flash_attn: True
+use_flash_rmsnorm: True
+use_flash_depthwise: False
+use_flashfft: False
+use_laughing_hyena: False
+inference_mode: False
+tokenizer_type: HFAutoTokenizer
+vocab_file: tokenizer/tokenizer.json
+prefill_style: fft