# Decoder-only models
# Schema per entry (presumably consumed to hook attention internals — confirm against loader):
#   self_attention_module: attribute name of the self-attention submodule on each layer
#   value_vector: name of the value tensor variable inside that attention module
BioGptForCausalLM:
  self_attention_module: "self_attn"
  value_vector: "value_states"
BloomForCausalLM:
  self_attention_module: "self_attention"
  value_vector: "value_layer"
CodeGenForCausalLM:
  self_attention_module: "attn"
  value_vector: "value"
FalconForCausalLM:
  self_attention_module: "self_attention"
  value_vector: "value_layer"
GemmaForCausalLM:
  self_attention_module: "self_attn"
  value_vector: "value_states"
GPTBigCodeForCausalLM:
  self_attention_module: "attn"
  value_vector: "value"
GPTJForCausalLM:
  self_attention_module: "attn"
  value_vector: "value"
GPT2LMHeadModel:
  self_attention_module: "attn"
  value_vector: "value"
GPTNeoForCausalLM:
  self_attention_module: "attn"
  value_vector: "value"
GPTNeoXForCausalLM:
  # NOTE(review): this entry's self_attention_module / value_vector lines fall inside the
  # hidden diff hunk boundary (@@ -14,40 +32,80 @@) and are not visible here — restore them
  # from the full file rather than guessing.
LlamaForCausalLM:
  self_attention_module: "self_attn"
  value_vector: "value_states"
MistralForCausalLM:
  self_attention_module: "self_attn"
  value_vector: "value_states"
MixtralForCausalLM:
  self_attention_module: "self_attn"
  value_vector: "value_states"
MptForCausalLM:
  self_attention_module: "attn"
  value_vector: "value_states"
OpenAIGPTLMHeadModel:
  self_attention_module: "attn"
  value_vector: "value"
OPTForCausalLM:
  self_attention_module: "self_attn"
  value_vector: "value_states"
PhiForCausalLM:
  self_attention_module: "self_attn"
  value_vector: "value_states"
Qwen2ForCausalLM:
  self_attention_module: "self_attn"
  value_vector: "value_states"
StableLmForCausalLM:
  self_attention_module: "self_attn"
  value_vector: "value_states"
XGLMForCausalLM:
  self_attention_module: "self_attn"
  value_vector: "value_states"

# Encoder-decoder models
# Same schema as the decoder-only section, plus:
#   cross_attention_module: attribute name of the encoder-decoder (cross) attention submodule
BartForConditionalGeneration:
  self_attention_module: "self_attn"
  cross_attention_module: "encoder_attn"
  value_vector: "value_states"
MarianMTModel:
  self_attention_module: "self_attn"
  cross_attention_module: "encoder_attn"
  value_vector: "value_states"
FSMTForConditionalGeneration:
  self_attention_module: "self_attn"
  cross_attention_module: "encoder_attn"
  value_vector: "v"
M2M100ForConditionalGeneration:
  self_attention_module: "self_attn"
  cross_attention_module: "encoder_attn"
  value_vector: "value_states"
MBartForConditionalGeneration:
  self_attention_module: "self_attn"
  cross_attention_module: "encoder_attn"
  value_vector: "value_states"
MT5ForConditionalGeneration:
  self_attention_module: "SelfAttention"
  cross_attention_module: "EncDecAttention"
  value_vector: "value_states"
NllbMoeForConditionalGeneration:
  self_attention_module: "self_attn"
  cross_attention_module: "cross_attention"
  value_vector: "value_states"
PegasusForConditionalGeneration:
  self_attention_module: "self_attn"
  cross_attention_module: "encoder_attn"
  value_vector: "value_states"
SeamlessM4TForTextToText:
  self_attention_module: "self_attn"
  cross_attention_module: "cross_attention"
  value_vector: "value"
SeamlessM4Tv2ForTextToText:
  self_attention_module: "self_attn"
  cross_attention_module: "cross_attention"
  value_vector: "value"
T5ForConditionalGeneration:
  self_attention_module: "SelfAttention"
  cross_attention_module: "EncDecAttention"
  value_vector: "value_states"
UMT5ForConditionalGeneration:
  self_attention_module: "SelfAttention"
  cross_attention_module: "EncDecAttention"
  value_vector: "value_states"