Added mistral support.

Eron Gjoni · Eron Gjoni · commit 14cad7a6526f · 2023-12-31T08:17:36.000Z
Made Dgenerate conform somewhat better to the peculiarities of model.generate.
Fixed a bug that was handicapping A_dose_theta in llama2 models
moved away from the legacy cache format
diff --git a/base_ref.py b/base_ref.py
@@ -1,15 +1,13 @@
 #This file is just a sober baseline
-
 import time
 import bitsandbytes
 import sys
 import torch
 from transformers import AutoTokenizer, TextStreamer, GenerationConfig
 from transformers import AutoModelForCausalLM
 
-model_id = "NousResearch/Llama-2-7b-chat-hf"
+model_id = "cognitivecomputations/dolphin-2.2.1-mistral-7b"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-tokenizer.pad_token_id = tokenizer.eos_token_id
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
@@ -39,14 +37,13 @@
             input_ids=tokenized_start.to('cuda'),
             generation_config=GenerationConfig(
                 use_cache=True,
-                min_new_tokens=20,
+                min_new_tokens=2,
                 max_new_tokens=500,
                 temperature=1,
                 do_sample=False,
-                pad_token_id=tokenizer.pad_token_id,
                 eos_token_id=tokenizer.eos_token_id,
                 return_dict_in_generate=True,
-                output_hidden_states=True,
+                output_hidden_states=False,
                 output_scores = True
             ),
             streamer=streamer,    
diff --git a/drugs/__init__.py b/drugs/__init__.py
@@ -10,7 +10,10 @@
     AutoModelForTokenClassification,
     LlamaForCausalLM,
     LlamaForSequenceClassification,
-    LlamaModel
+    LlamaModel,
+    MistralForCausalLM,
+    MistralForSequenceClassification,
+    MistralModel,
 )
 
 """
@@ -31,9 +34,6 @@
     GPTNeoXForTokenClassification,
     GPTNeoXModel,
     GPTNeoXPreTrainedModel,
-    MistralForCausalLM,
-    MistralForSequenceClassification,
-    MistralModel,
     MptForCausalLM,
     MptForQuestionAnswering,
     MptForSequenceClassification,
diff --git a/drugs/dgenerate.py b/drugs/dgenerate.py
diff --git a/drugs/inject_mixin.py b/drugs/inject_mixin.py
@@ -17,7 +17,7 @@
     #"mpt": "MptModel",
     #"gpt_neox": "GPTNeoXModel",
     #"gptj": "GPTJModel",
-    #"mistral": "MistralModel",
+    "mistral": "MistralModel",
     #"qwen": "QWenModel",
     #"stablelm_epoch": "StableLMEpochModel"
 }
@@ -29,7 +29,7 @@
     #"mpt": "MptAttention",
     #"gpt_neox": "GPTNeoXAttention",
     #"gptj": "GPTJAttention",
-    #"mistral": "MistralAttention",
+    "mistral": "MistralAttention",
     #"qwen": "QWenAttention",
     #"stablelm_epoch": "Attention",
 }
@@ -56,7 +56,7 @@ def _inject_drugged_attention(cls, model: PreTrainedModel, **kwargs) -> Optional
             #mpt_drugged_attention_forward,
             #gptj_drugged_attention_forward,
             llama_drugged_attention_forward,
-            #mistral_drugged_attention_forward,
+            mistral_drugged_attention_forward,
             #qwen_drugged_attention_forward,
             #stablelm_epoch_drugged_attention_forward,
         )
@@ -67,14 +67,14 @@ def _inject_drugged_attention(cls, model: PreTrainedModel, **kwargs) -> Optional
             #"mpt": None,
             #"gpt_neox": gpt_neox_drugged_attention_forward,
             #"gptj": gptj_drugged_attention_forward,
-            #"mistral": mistral_drugged_attention_forward,
+            "mistral": mistral_drugged_attention_forward,
             #"qwen": qwen_drugged_attention_forward,
             #"stablelm_epoch": stablelm_epoch_drugged_attention_forward,
         }
         
 
         # Not all models require updated attention forwards
-        if ATTENTION_FORWARD_MAPPING[model_type] is None:
+        if model_type not in ATTENTION_FORWARD_MAPPING or ATTENTION_FORWARD_MAPPING[model_type] is None:
             return
 
         #TODO: support flash attention 2 and sdpa 
diff --git a/drugs/models/__init__.py b/drugs/models/__init__.py
@@ -37,13 +37,13 @@
     LlamaModel,
     llama_drugged_attention_forward
 )
-"""from .mistral import (
+from .mistral import (
     MistralForCausalLM,
     MistralForSequenceClassification,
     MistralModel,
     mistral_drugged_attention_forward,
 )
-from .mpt import (
+"""from .mpt import (
     MptAttention,
     MptForCausalLM,
     MptForQuestionAnswering,
diff --git a/drugs/models/llama/drugged.py b/drugs/models/llama/drugged.py
@@ -53,8 +53,9 @@ def llama_drugged_attention_forward(
         
         """applying noise before rotatry embeddings because that just feels right (expecially with RoPE)
         and not touching the attention sink because that just feels wrong"""
-        if self.quelude_theta > 0:
-            query_states[:,:,6:,:] = get_perturbed_vectors(query_states[:,:,6:,:], self.quelude_theta)
+        sink_protect = (position_ids < 6).sum().item()
+        if self.quaalude_theta > 0:
+            query_states[:,:,sink_protect:, :] = get_perturbed_vectors(query_states[:,:,sink_protect:,:], self.quaalude_theta)
 
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
@@ -77,6 +78,7 @@ def llama_drugged_attention_forward(
         dkey_states = key_states
         dvalue_states = value_states
         
+        """sliced at 6th sequence vector to not touch any attention sinks. bad juju"""
         if self.ketamine_theta > 0:
             dkey_states = torch.clone(key_states)          
             dkey_states[:,:,6:,:] = get_perturbed_vectors(key_states[:,:,6:,:], self.ketamine_theta)
@@ -105,17 +107,16 @@ def llama_drugged_attention_forward(
         attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
         attn_output = torch.matmul(attn_weights, dvalue_states)
 
+        if self.adderall_theta > 0:
+            attn_output[:,:,sink_protect:, :] = get_perturbed_vectors(attn_output[:,:,sink_protect:, :], self.adderall_theta)
+
         if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
             raise ValueError(
                 f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                 f" {attn_output.size()}"
             )
 
         attn_output = attn_output.transpose(1, 2).contiguous()
-        
-        if self.adderall_theta > 0:
-            attn_output[:,:,6:,:] = get_perturbed_vectors(attn_output[:,:,6:,:], self.adderall_theta)
-
         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
 
         if self.config.pretraining_tp > 1:
diff --git a/drugs/models/llama/modeling_llama.py b/drugs/models/llama/modeling_llama.py
@@ -10,8 +10,6 @@
 import torch
 from typing import Optional, Tuple, List, Union
 
-#TODO: self.model._modules['layers']._modules exposes decoder layers, maybe better way to access layer idx?
-
 class LlamaPreTrainedModel(InjectDrugsMixin, TLlamaPreTrainedModel):
     pass
 
diff --git a/drugs/models/mistral/__init__.py b/drugs/models/mistral/__init__.py
@@ -0,0 +1,6 @@
+from .modeling_mistral import (
+    MistralForCausalLM,
+    MistralForSequenceClassification,
+    MistralModel,
+)
+from .drugged import mistral_drugged_attention_forward
diff --git a/drugs/models/mistral/drugged.py b/drugs/models/mistral/drugged.py
@@ -0,0 +1,141 @@
+import math
+from typing import Optional, Tuple
+import warnings
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from transformers.models.mistral.modeling_mistral import repeat_kv, rotate_half, apply_rotary_pos_emb
+from drugs.generation.utils import get_perturbed_vectors
+from transformers.cache_utils import Cache
+
+__all__ = ["mistral_drugged_attention_forward"]
+
+
+def mistral_drugged_attention_forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_value: Optional[Cache] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+    padding_mask: Optional[torch.Tensor] = None,
+     **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Eager Mistral attention forward with optional noise injection.
+
+        Meant to be installed as the forward of a Mistral attention module; it
+        reads the projections, rotary embedding and ``*_theta`` attributes from
+        ``self``. Each theta > 0 enables one ``get_perturbed_vectors`` call:
+
+        * ``quaalude_theta``: queries, before rotary embeddings are applied
+        * ``ketamine_theta``: a clone of the keys (after the cache update)
+        * ``valium_theta``: a clone of the values (after the cache update)
+        * ``adderall_theta``: the per-head attention output
+
+        Sequence indices below 6 are treated as attention sinks and skipped.
+
+        Returns ``(attn_output, attn_weights, past_key_value)``; ``attn_weights``
+        is ``None`` unless ``output_attentions`` is true.
+
+        Raises ``ValueError`` if a cache is passed while ``self.layer_idx`` is
+        ``None``, or if the attention weights, mask or output have an
+        unexpected shape.
+        """
+        # `padding_mask` is a named parameter, so it can never show up in
+        # kwargs; check the parameter too or the deprecation warning is dead.
+        if padding_mask is not None or "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        # Number of query positions in this call whose position id is < 6.
+        # NOTE(review): summed over the whole tensor, so this presumably
+        # over-counts when bsz > 1 - confirm before batching.
+        sink_protect = (position_ids < 6).sum().item()
+        if self.quaalude_theta > 0:
+            query_states[:,:,sink_protect:, :] = get_perturbed_vectors(query_states[:,:,sink_protect:,:], self.quaalude_theta)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        dkey_states = key_states
+        dvalue_states = value_states
+
+        # sliced at 6th sequence vector to not touch any attention sinks. bad juju
+        if self.ketamine_theta > 0:
+            dkey_states = torch.clone(key_states)
+            dkey_states[:,:,6:,:] = get_perturbed_vectors(key_states[:,:,6:,:], self.ketamine_theta)
+
+        if self.valium_theta > 0:
+            dvalue_states = torch.clone(value_states)
+            dvalue_states[:,:,6:,:] = get_perturbed_vectors(value_states[:,:,6:,:], self.valium_theta)
+
+        attn_weights = torch.matmul(query_states, dkey_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+
+            attn_weights = attn_weights + attention_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        # Weight the (possibly perturbed) values; using `value_states` here
+        # would silently discard the valium_theta perturbation.
+        attn_output = torch.matmul(attn_weights, dvalue_states)
+
+        if self.adderall_theta > 0:
+            attn_output[:,:,sink_protect:, :] = get_perturbed_vectors(attn_output[:,:,sink_protect:, :], self.adderall_theta)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
diff --git a/drugs/models/mistral/modeling_mistral.py b/drugs/models/mistral/modeling_mistral.py
@@ -0,0 +1,22 @@
+from transformers import MistralForCausalLM as TMistralForCausalLM
+from transformers import MistralForSequenceClassification as TMistralForSequenceClassification
+from transformers import MistralModel as TMistralModel
+from transformers import MistralPreTrainedModel as TMistralPreTrainedModel
+
+from drugs.inject_mixin import InjectDrugsMixin
+
+
+class MistralPreTrainedModel(InjectDrugsMixin, TMistralPreTrainedModel):
+    """transformers' MistralPreTrainedModel with InjectDrugsMixin listed first among its bases."""
+
+
+class MistralModel(MistralPreTrainedModel, TMistralModel):
+    """transformers' MistralModel layered on the mixin-enabled MistralPreTrainedModel."""
+
+
+class MistralForCausalLM(MistralPreTrainedModel, TMistralForCausalLM):
+    """transformers' MistralForCausalLM layered on the mixin-enabled MistralPreTrainedModel."""
+
+
+class MistralForSequenceClassification(MistralPreTrainedModel, TMistralForSequenceClassification):
+    """transformers' MistralForSequenceClassification layered on the mixin-enabled MistralPreTrainedModel."""
diff --git a/just_chat.ipynb b/just_chat.ipynb
diff --git a/just_chat.py b/just_chat.py