diff --git a/mlx_vlm/models/idefics2/idefics2.py b/mlx_vlm/models/idefics2/idefics2.py index 1703aff..1c78365 100644 --- a/mlx_vlm/models/idefics2/idefics2.py +++ b/mlx_vlm/models/idefics2/idefics2.py @@ -251,7 +251,12 @@ def _prepare_inputs_for_multimodal(self, image_features, inputs_embeds, input_id return mx.concatenate(final_embeddings, axis=1) def __call__( - self, input_ids: mx.array, pixel_values: mx.array, mask: mx.array, cache=None + self, + input_ids: mx.array, + pixel_values: mx.array, + mask: mx.array, + cache=None, + **kwargs, ): input_embeddings = self.get_input_embeddings(input_ids, pixel_values) logits = self.language_model( diff --git a/mlx_vlm/models/idefics2/language.py b/mlx_vlm/models/idefics2/language.py index ff67afe..66f1bb8 100644 --- a/mlx_vlm/models/idefics2/language.py +++ b/mlx_vlm/models/idefics2/language.py @@ -6,6 +6,8 @@ import mlx.core as mx import mlx.nn as nn +from ..base import KVCache, create_attention_mask + @dataclass class TextConfig: @@ -62,7 +64,7 @@ def __call__( self, x: mx.array, mask: Optional[mx.array] = None, - cache: Optional[Tuple[mx.array, mx.array]] = None, + cache: Optional[KVCache] = None, ) -> mx.array: B, L, D = x.shape @@ -116,7 +118,7 @@ def __call__( self, x: mx.array, mask: Optional[mx.array] = None, - cache: Optional[Tuple[mx.array, mx.array]] = None, + cache: Optional[KVCache] = None, ) -> mx.array: r = self.self_attn(self.input_layernorm(x), mask, cache) h = x + r @@ -153,10 +155,7 @@ def __call__( else: h = inputs_embeds - mask = None - if h.shape[1] > 1: - mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1]) - mask = mask.astype(h.dtype) + mask = create_attention_mask(h) if cache is None: cache = [None] * len(self.layers) diff --git a/mlx_vlm/models/llava/language.py b/mlx_vlm/models/llava/language.py index 8492002..732b636 100644 --- a/mlx_vlm/models/llava/language.py +++ b/mlx_vlm/models/llava/language.py @@ -5,6 +5,8 @@ import mlx.core as mx import mlx.nn as nn +from ..base import KVCache, create_attention_mask + @dataclass class TextConfig: @@ -78,7 +80,7 @@ def __call__( self, x: mx.array, mask: Optional[mx.array] = None, - cache: Optional[Tuple[mx.array, mx.array]] = None, + cache: Optional[KVCache] = None, ) -> mx.array: B, L, D = x.shape @@ -132,7 +134,7 @@ def __call__( self, x: mx.array, mask: Optional[mx.array] = None, - cache: Optional[Tuple[mx.array, mx.array]] = None, + cache: Optional[KVCache] = None, ) -> mx.array: r = self.self_attn(self.input_layernorm(x), mask, cache) h = x + r @@ -166,10 +168,7 @@ def __call__( else: h = inputs_embeds - mask = None - if h.shape[1] > 1: - mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1]) - mask = mask.astype(h.dtype) + mask = create_attention_mask(h) if cache is None: cache = [None] * len(self.layers) diff --git a/mlx_vlm/models/llava/llava.py b/mlx_vlm/models/llava/llava.py index 7a7209a..39aae4a 100644 --- a/mlx_vlm/models/llava/llava.py +++ b/mlx_vlm/models/llava/llava.py @@ -132,7 +132,12 @@ def _merge_input_ids_with_image_features( return mx.concatenate(final_embeddings, axis=1) def __call__( - self, input_ids: mx.array, pixel_values: mx.array, mask: mx.array, cache=None + self, + input_ids: mx.array, + pixel_values: mx.array, + mask: mx.array, + cache=None, + **kwargs, ): input_embddings = self.get_input_embeddings(input_ids, pixel_values) logits = self.language_model( diff --git a/mlx_vlm/models/llava_bunny/language.py b/mlx_vlm/models/llava_bunny/language.py index e4010da..153a650 100644 --- 
a/mlx_vlm/models/llava_bunny/language.py +++ b/mlx_vlm/models/llava_bunny/language.py @@ -5,7 +5,7 @@ import mlx.core as mx import mlx.nn as nn -from ..base import KVCache +from ..base import KVCache, create_attention_mask @dataclass @@ -174,10 +174,7 @@ def __call__( else: h = inputs_embeds - mask = None - if h.shape[1] > 1: - mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1]) - mask = mask.astype(h.dtype) + mask = create_attention_mask(h) if cache is None: cache = [None] * len(self.layers) diff --git a/mlx_vlm/models/llava_bunny/llava_bunny.py b/mlx_vlm/models/llava_bunny/llava_bunny.py index cf210f9..0e145a2 100644 --- a/mlx_vlm/models/llava_bunny/llava_bunny.py +++ b/mlx_vlm/models/llava_bunny/llava_bunny.py @@ -183,6 +183,7 @@ def __call__( pixel_values: mx.array, mask: Optional[mx.array] = None, cache: Optional[Tuple[mx.array, mx.array]] = None, + **kwargs, ): input_embeddings = self.get_input_embeddings(input_ids, pixel_values) logits = self.language_model( diff --git a/mlx_vlm/models/llava_next/language.py b/mlx_vlm/models/llava_next/language.py index a374a4e..497e3b3 100644 --- a/mlx_vlm/models/llava_next/language.py +++ b/mlx_vlm/models/llava_next/language.py @@ -5,6 +5,8 @@ import mlx.core as mx import mlx.nn as nn +from ..base import KVCache, create_attention_mask + @dataclass class TextConfig: @@ -78,7 +80,7 @@ def __call__( self, x: mx.array, mask: Optional[mx.array] = None, - cache: Optional[Tuple[mx.array, mx.array]] = None, + cache: Optional[KVCache] = None, ) -> mx.array: B, L, D = x.shape @@ -132,7 +134,7 @@ def __call__( self, x: mx.array, mask: Optional[mx.array] = None, - cache: Optional[Tuple[mx.array, mx.array]] = None, + cache: Optional[KVCache] = None, ) -> mx.array: r = self.self_attn(self.input_layernorm(x), mask, cache) h = x + r @@ -166,10 +168,7 @@ def __call__( else: h = inputs_embeds - mask = None - if h.shape[1] > 1: - mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1]) - mask = mask.astype(h.dtype) + mask = create_attention_mask(h) if cache is None: cache = [None] * len(self.layers) diff --git a/mlx_vlm/models/llava_next/llava_next.py b/mlx_vlm/models/llava_next/llava_next.py index dcc6018..878d7ca 100644 --- a/mlx_vlm/models/llava_next/llava_next.py +++ b/mlx_vlm/models/llava_next/llava_next.py @@ -136,7 +136,12 @@ def _merge_input_ids_with_image_features( return mx.concatenate(final_embeddings, axis=1) def __call__( - self, input_ids: mx.array, pixel_values: mx.array, mask: mx.array, cache=None + self, + input_ids: mx.array, + pixel_values: mx.array, + mask: mx.array, + cache=None, + **kwargs, ): input_embddings = self.get_input_embeddings(input_ids, pixel_values) diff --git a/mlx_vlm/models/multi_modality/language.py b/mlx_vlm/models/multi_modality/language.py index e598bf8..22a85d8 100644 --- a/mlx_vlm/models/multi_modality/language.py +++ b/mlx_vlm/models/multi_modality/language.py @@ -5,6 +5,8 @@ import mlx.core as mx import mlx.nn as nn +from ..base import KVCache, create_attention_mask + @dataclass class TextConfig: @@ -78,7 +80,7 @@ def __call__( self, x: mx.array, mask: Optional[mx.array] = None, - cache: Optional[Tuple[mx.array, mx.array]] = None, + cache: Optional[KVCache] = None, ) -> mx.array: B, L, D = x.shape @@ -132,7 +134,7 @@ def __call__( self, x: mx.array, mask: Optional[mx.array] = None, - cache: Optional[Tuple[mx.array, mx.array]] = None, + cache: Optional[KVCache] = None, ) -> mx.array: r = self.self_attn(self.input_layernorm(x), mask, cache) h = x + r @@ -166,10 +168,7 @@ def __call__( else: h = 
inputs_embeds - mask = None - if h.shape[1] > 1: - mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1]) - mask = mask.astype(h.dtype) + mask = create_attention_mask(h) if cache is None: cache = [None] * len(self.layers) diff --git a/mlx_vlm/models/multi_modality/multi_modality.py b/mlx_vlm/models/multi_modality/multi_modality.py index de86be8..52a0bc9 100644 --- a/mlx_vlm/models/multi_modality/multi_modality.py +++ b/mlx_vlm/models/multi_modality/multi_modality.py @@ -360,7 +360,12 @@ def _merge_input_ids_with_image_features( return mx.concatenate(final_embeddings, axis=1) def __call__( - self, input_ids: mx.array, pixel_values: mx.array, mask: mx.array, cache=None + self, + input_ids: mx.array, + pixel_values: mx.array, + mask: mx.array, + cache=None, + **kwargs, ): input_embeddings = self.get_input_embeddings(input_ids, pixel_values) diff --git a/mlx_vlm/models/paligemma/language.py b/mlx_vlm/models/paligemma/language.py index 6c8c57e..32cdecd 100644 --- a/mlx_vlm/models/paligemma/language.py +++ b/mlx_vlm/models/paligemma/language.py @@ -5,6 +5,8 @@ import mlx.core as mx import mlx.nn as nn +from ..base import KVCache, create_attention_mask + @dataclass class TextConfig: @@ -66,7 +68,7 @@ def __call__( self, x: mx.array, mask: Optional[mx.array] = None, - cache: Optional[Tuple[mx.array, mx.array]] = None, + cache: Optional[KVCache] = None, ) -> mx.array: B, L, D = x.shape @@ -120,7 +122,7 @@ def __call__( self, x: mx.array, mask: Optional[mx.array] = None, - cache: Optional[Tuple[mx.array, mx.array]] = None, + cache: Optional[KVCache] = None, ) -> mx.array: r = self.self_attn(self.input_layernorm(x), mask, cache) h = x + r @@ -155,11 +157,9 @@ def __call__( else: h = inputs_embeds - h = h * (self.config.hidden_size**0.5) + h *= self.config.hidden_size**0.5 - if cache is not None: - mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1]) - mask = mask.astype(h.dtype) + mask = create_attention_mask(h) if cache is None: cache = [None] * len(self.layers) diff --git a/mlx_vlm/models/phi3_v/phi3_v.py b/mlx_vlm/models/phi3_v/phi3_v.py index 07fa579..770f001 100644 --- a/mlx_vlm/models/phi3_v/phi3_v.py +++ b/mlx_vlm/models/phi3_v/phi3_v.py @@ -8,6 +8,7 @@ import mlx.nn as nn import numpy as np +from ..base import KVCache, create_attention_mask from .language import LanguageModel, TextConfig from .su_rope import Phi3SuScaledRotaryEmbedding from .vision import VisionConfig, VisionModel @@ -90,7 +91,7 @@ def __call__( self, x: mx.array, mask: Optional[mx.array] = None, - cache: Optional[Tuple[mx.array, mx.array]] = None, + cache: Optional[KVCache] = None, ) -> mx.array: B, L, D = x.shape @@ -148,7 +149,7 @@ def __call__( self, x: mx.array, mask: Optional[mx.array] = None, - cache: Optional[Tuple[mx.array, mx.array]] = None, + cache: Optional[KVCache] = None, ) -> mx.array: r = self.self_attn(self.input_layernorm(x), mask, cache) h = x + r @@ -179,16 +180,18 @@ def __call__( ): h = self.embed_tokens(inputs) p = np.argwhere(inputs < 0).tolist() + if pixel_values is not None: h = self.vision_embed_tokens(pixel_values, h, image_sizes, p) - mask = None - if h.shape[1] > 1: - mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1]) - mask = mask.astype(h.dtype) + + mask = create_attention_mask(h) + if cache is None: cache = [None] * len(self.layers) + for layer, c in zip(self.layers, cache): h = layer(h, mask, c) + return self.norm(h) @@ -206,6 +209,7 @@ def __call__( pixel_values=None, mask=None, cache=None, + **kwargs, ): out = self.model(inputs, pixel_values, 
mask, cache) return self.lm_head(out).astype(self.lm_head.weight.dtype) diff --git a/mlx_vlm/models/pixtral/__init__.py b/mlx_vlm/models/pixtral/__init__.py new file mode 100644 index 0000000..41a9815 --- /dev/null +++ b/mlx_vlm/models/pixtral/__init__.py @@ -0,0 +1,8 @@ +from .pixtral import ( + LanguageModel, + Model, + ModelConfig, + TextConfig, + VisionConfig, + VisionModel, +) diff --git a/mlx_vlm/models/pixtral/language.py b/mlx_vlm/models/pixtral/language.py new file mode 100644 index 0000000..da8482c --- /dev/null +++ b/mlx_vlm/models/pixtral/language.py @@ -0,0 +1,220 @@ +import inspect +from dataclasses import dataclass +from typing import Dict, Optional, Tuple, Union + +import mlx.core as mx +import mlx.nn as nn + +from ..base import KVCache, create_attention_mask + + +@dataclass +class TextConfig: + model_type: str + hidden_size: int = 5120 + head_dim: int = 128 + num_hidden_layers: int = 40 + intermediate_size: int = 14336 + num_attention_heads: int = 32 + rms_norm_eps: float = 1e-06 + vocab_size: int = 131072 + num_key_value_heads: int = 8 + rope_theta: float = 1000000000.0 + rope_traditional: bool = False + rope_scaling: Optional[Dict[str, Union[float, str]]] = None + + @classmethod + def from_dict(cls, params): + return cls( + **{ + k: v + for k, v in params.items() + if k in inspect.signature(cls).parameters + } + ) + + def __post_init__(self): + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + if self.rope_scaling: + required_keys = {"factor", "type"} + if not all(key in self.rope_scaling for key in required_keys): + raise ValueError(f"rope_scaling must contain keys {required_keys}") + + if self.rope_scaling["type"] != "linear": + raise ValueError("rope_scaling 'type' currently only supports 'linear'") + + +class Attention(nn.Module): + def __init__(self, config: TextConfig): + super().__init__() + + dim = config.hidden_size + self.n_heads = n_heads = config.num_attention_heads + self.n_kv_heads = n_kv_heads = config.num_key_value_heads + + head_dim = config.head_dim + self.scale = head_dim**-0.5 + + self.q_proj = nn.Linear(dim, n_heads * head_dim, bias=False) + self.k_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False) + self.v_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False) + self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=False) + + rope_scale = ( + 1 / config.rope_scaling["factor"] + if config.rope_scaling is not None + and config.rope_scaling["type"] == "linear" + else 1 + ) + self.rope = nn.RoPE( + head_dim, + traditional=config.rope_traditional, + base=config.rope_theta, + scale=rope_scale, + ) + + def __call__( + self, + x: mx.array, + mask: Optional[mx.array] = None, + cache: Optional[KVCache] = None, + ) -> mx.array: + B, L, D = x.shape + + queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x) + + # Prepare the queries, keys and values for the attention computation + queries = queries.reshape(B, L, self.n_heads, -1).transpose(0, 2, 1, 3) + keys = keys.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3) + values = values.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3) + + if cache is not None: + queries = self.rope(queries, offset=cache.offset) + keys = self.rope(keys, offset=cache.offset) + keys, values = cache.update_and_fetch(keys, values) + else: + queries = self.rope(queries) + keys = self.rope(keys) + + output = mx.fast.scaled_dot_product_attention( + queries, keys, values, scale=self.scale, mask=mask + ) + output = output.transpose(0, 2, 1, 3).reshape(B, L, -1) 
+ return self.o_proj(output) + + +class MLP(nn.Module): + def __init__(self, dim, hidden_dim): + super().__init__() + self.gate_proj = nn.Linear(dim, hidden_dim, bias=False) + self.down_proj = nn.Linear(hidden_dim, dim, bias=False) + self.up_proj = nn.Linear(dim, hidden_dim, bias=False) + + def __call__(self, x) -> mx.array: + return self.down_proj(nn.silu(self.gate_proj(x)) * self.up_proj(x)) + + +class TransformerBlock(nn.Module): + def __init__(self, config: TextConfig): + super().__init__() + self.num_attention_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.self_attn = Attention(config) + self.mlp = MLP(config.hidden_size, config.intermediate_size) + self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = nn.RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + self.config = config + + def __call__( + self, + x: mx.array, + mask: Optional[mx.array] = None, + cache: Optional[KVCache] = None, + ) -> mx.array: + r = self.self_attn(self.input_layernorm(x), mask, cache) + h = x + r + r = self.mlp(self.post_attention_layernorm(h)) + out = h + r + return out + + +class Mistral(nn.Module): + def __init__(self, config: TextConfig): + super().__init__() + self.config = config + self.vocab_size = config.vocab_size + self.num_hidden_layers = config.num_hidden_layers + assert self.vocab_size > 0 + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size) + self.layers = [ + TransformerBlock(config=config) for _ in range(config.num_hidden_layers) + ] + self.norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def __call__( + self, + inputs: mx.array, + cache=None, + inputs_embeds=None, + ): + # for passing merged input embeddings + if inputs_embeds is None: + h = self.embed_tokens(inputs) + else: + h = inputs_embeds + + mask = create_attention_mask(h, cache) + + if cache is None: + cache = [None] * len(self.layers) + + for layer, c in zip(self.layers, cache): + h = layer(h, mask, c) + + return self.norm(h) + + +class LanguageModel(nn.Module): + def __init__(self, config: TextConfig): + super().__init__() + self.config = config + self.model_type = config.model_type + if self.model_type != "mistral": + raise ValueError( + f"Model type {self.model_type} not supported. 
Currently only 'mistral' is supported" + ) + self.model = Mistral(config) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + def __call__( + self, + inputs: mx.array, + cache=None, + inputs_embeds=None, + mask: Optional[mx.array] = None, + ): + out = self.model(inputs, cache, inputs_embeds) + return self.lm_head(out) + + @staticmethod + def sanitize(weights): + # Remove unused precomputed rotary freqs + return { + k: v for k, v in weights.items() if "self_attn.rotary_emb.inv_freq" not in k + } + + @property + def layers(self): + return self.model.layers + + @property + def head_dim(self): + return self.config.head_dim + + @property + def n_kv_heads(self): + return self.config.num_key_value_heads diff --git a/mlx_vlm/models/pixtral/pixtral.py b/mlx_vlm/models/pixtral/pixtral.py new file mode 100644 index 0000000..b49397b --- /dev/null +++ b/mlx_vlm/models/pixtral/pixtral.py @@ -0,0 +1,193 @@ +import glob +import inspect +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +import mlx.core as mx +import mlx.nn as nn +import numpy as np +from huggingface_hub import snapshot_download + +from .language import LanguageModel, TextConfig +from .vision import VisionConfig, VisionModel + + +@dataclass +class ModelConfig: + text_config: TextConfig + vision_config: VisionConfig + model_type: str + ignore_index: int = -100 + image_token_index: int = 10 + vision_feature_select_strategy: str = "full" + vision_feature_layer: int = -1 + vocab_size: int = 32000 + + @classmethod + def from_dict(cls, params): + return cls( + **{ + k: v + for k, v in params.items() + if k in inspect.signature(cls).parameters + } + ) + + +class LlavaMultiModalProjector(nn.Module): + def __init__(self, config: ModelConfig): + super().__init__() + self.linear_1 = nn.Linear( + config.vision_config.hidden_size, config.text_config.hidden_size, bias=True + ) + self.gelu = nn.GELU() + self.linear_2 = nn.Linear( + config.text_config.hidden_size, config.text_config.hidden_size, bias=True + ) + + def __call__(self, x: mx.array) -> mx.array: + x = self.linear_1(x) + x = self.gelu(x) + x = self.linear_2(x) + return x + + +class Model(nn.Module): + def __init__(self, config: ModelConfig): + self.config = config + self.vision_tower = VisionModel(config.vision_config) + self.language_model = LanguageModel(config.text_config) + self.multi_modal_projector = LlavaMultiModalProjector(config) + self.vision_feature_layer = config.vision_feature_layer + self.vision_feature_select_strategy = config.vision_feature_select_strategy + + def get_input_embeddings( + self, + input_ids: Optional[mx.array] = None, + pixel_values: Optional[mx.array] = None, + ): + if pixel_values is None: + return self.language_model(input_ids) + + # Get the input embeddings from the language model + inputs_embeds = self.language_model.model.embed_tokens(input_ids) + + # Get the output hidden states from the vision model + *_, hidden_states = self.vision_tower( + pixel_values.transpose(0, 2, 3, 1), output_hidden_states=True + ) + + # Select the hidden states from the desired layer + selected_image_feature = hidden_states[self.vision_feature_layer] + + if self.vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif self.vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + else: + raise ValueError( + "Unexpected feature selection strategy: " + f"{self.vision_feature_select_strategy}" + ) + + # Pass image 
features through the multi-modal projector + image_features = self.multi_modal_projector(selected_image_feature) + + # Insert special image tokens in the input_ids + final_inputs_embeds = self._merge_input_ids_with_image_features( + image_features, inputs_embeds, input_ids + ) + return final_inputs_embeds + + def _merge_input_ids_with_image_features( + self, image_features, inputs_embeds, input_ids + ): + image_token_index = self.config.image_token_index + num_images, num_image_patches, embed_dim = image_features.shape + + # Positions of tokens in input_ids, assuming batch size is 1 + image_positions = np.where(input_ids[0] == image_token_index)[0].tolist() + + text_segments = [] + start_idx = 0 + + for position in image_positions: + text_segments.append(inputs_embeds[:, start_idx:position]) + start_idx = position + 1 + + image_embeddings = mx.split(image_features, image_features.shape[0]) + final_embeddings = [v for p in zip(text_segments, image_embeddings) for v in p] + final_embeddings += [inputs_embeds[:, start_idx:]] + + # Create a final embedding of shape + # (1, num_image_patches*num_images + sequence_len, embed_dim) + return mx.concatenate(final_embeddings, axis=1) + + def __call__( + self, + input_ids: mx.array, + pixel_values: mx.array, + mask: mx.array, + cache=None, + **kwargs, + ): + input_embeddings = self.get_input_embeddings(input_ids, pixel_values) + logits = self.language_model( + input_ids, cache=cache, inputs_embeds=input_embeddings + ) + return logits + + @staticmethod + def from_pretrained(path_or_hf_repo: str): + path = Path(path_or_hf_repo) + if not path.exists(): + path = Path( + snapshot_download( + repo_id=path_or_hf_repo, + allow_patterns=[ + "*.json", + "*.safetensors", + "*.py", + "tokenizer.model", + "*.tiktoken", + ], + ) + ) + + with open(path / "config.json", "r") as f: + model_config = json.load(f) + + model_config = ModelConfig.from_dict(model_config) + + model_config.vision_config = VisionConfig.from_dict(model_config.vision_config) + model_config.text_config = TextConfig.from_dict(model_config.text_config) + + model = Model(model_config) + weight_files = glob.glob(str(path / "*.safetensors")) + if not weight_files: + raise FileNotFoundError(f"No safetensors found in {path}") + + weights = {} + for wf in weight_files: + weights.update(mx.load(wf)) + + weights = VisionModel.sanitize(weights) + weights = LanguageModel.sanitize(weights) + + model.load_weights(list(weights.items())) + return model + + def sanitize(self, weights): + def transform_key(key): + if "vision_tower" in key: + if "transformer" in key: + key = key.replace("vision_tower", "vision_tower.vision_model") + if "patch_conv" in key: + key = key.replace("vision_tower", "vision_tower.vision_model") + if "ln_pre" in key: + key = key.replace("vision_tower", "vision_tower.vision_model") + return key + + return {transform_key(k): v for k, v in weights.items()} diff --git a/mlx_vlm/models/pixtral/vision.py b/mlx_vlm/models/pixtral/vision.py new file mode 100644 index 0000000..2db77f4 --- /dev/null +++ b/mlx_vlm/models/pixtral/vision.py @@ -0,0 +1,324 @@ +import inspect +from dataclasses import dataclass +from typing import Optional + +import mlx.core as mx +import mlx.nn as nn + + +@dataclass +class VisionConfig: + model_type: str + num_hidden_layers: int = 24 + hidden_size: int = 1024 + head_dim: int = 64 + intermediate_size: int = 4096 + num_attention_heads: int = 16 + image_size: int = 336 + patch_size: int = 14 + projection_dim: int = 768 + vocab_size: int = 32000 + num_channels: int = 3 + 
rms_norm_eps: float = 1e-5 + rope_theta: float = 10000.0 + + @classmethod + def from_dict(cls, params): + return cls( + **{ + k: v + for k, v in params.items() + if k in inspect.signature(cls).parameters + } + ) + + +def check_array_shape(arr): + shape = arr.shape + + # Check if the shape has 4 dimensions + if len(shape) != 4: + return False + + out_channels, kH, KW, _ = shape + + # Check if out_channels is the largest, and kH and KW are the same + if (out_channels >= kH) and (out_channels >= KW) and (kH == KW): + return True + else: + return False + + +def position_ids_in_meshgrid(patch_embeds_list, max_width): + positions = [] + for patch in patch_embeds_list: + height, width = patch.shape[1], patch.shape[2] + h_grid, v_grid = mx.meshgrid(mx.arange(height), mx.arange(width), indexing="ij") + h_grid = h_grid.reshape(-1, 1) + v_grid = v_grid.reshape(-1, 1) + ids = h_grid * max_width + v_grid + positions.append(ids.flatten()) + return mx.concatenate(positions) + + +def generate_block_attention_mask(patch_embeds_list, tensor): + seq_len = tensor.shape[1] + d_min = -1e9 # Using a large negative value as MLX doesn't have finfo + + causal_mask = mx.full((seq_len, seq_len), vals=d_min) + + block_end_idx = mx.cumsum(mx.array(patch_embeds_list)) + block_start_idx = mx.concatenate([mx.array([0]), mx.array(patch_embeds_list[:-1])]) + block_start_idx = mx.cumsum(block_start_idx) + + for start, end in zip(block_start_idx, block_end_idx): + start, end = int(start), int(end) # Convert to integers for indexing + causal_mask[start:end, start:end] = 0 + + causal_mask = mx.broadcast_to( + causal_mask[None, None, :, :], (tensor.shape[0], 1, seq_len, seq_len) + ) + return causal_mask + + +def rotate_half(x): + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return mx.concatenate((-x2, x1), axis=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1): + cos = mx.expand_dims(cos, axis=unsqueeze_dim) + sin = mx.expand_dims(sin, axis=unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Attention(nn.Module): + def __init__( + self, + dims: int, + num_heads: int, + query_input_dims: Optional[int] = None, + key_input_dims: Optional[int] = None, + value_input_dims: Optional[int] = None, + value_dims: Optional[int] = None, + value_output_dims: Optional[int] = None, + bias: bool = False, + ): + super().__init__() + + if (dims % num_heads) != 0: + raise ValueError( + "The input feature dimensions should be divisible by the " + f"number of heads ({dims} % {num_heads}) != 0" + ) + + query_input_dims = query_input_dims or dims + key_input_dims = key_input_dims or dims + value_input_dims = value_input_dims or key_input_dims + value_dims = value_dims or dims + value_output_dims = value_output_dims or dims + + self.embed_dim = dims + self.num_heads = num_heads + self.head_dim = self.embed_dim // self.num_heads + + self.scale = self.head_dim**-0.5 + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + self.o_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + + def __call__(self, queries, keys, values, position_embeddings, mask=None): + queries = self.q_proj(queries) + keys = self.k_proj(keys) + values = self.v_proj(values) + + num_heads = self.num_heads + B, L, D = queries.shape + _, S, _ = keys.shape + queries = queries.reshape(B, L, 
num_heads, -1).transpose(0, 2, 1, 3) + keys = keys.reshape(B, S, num_heads, -1).transpose(0, 2, 1, 3) + values = values.reshape(B, S, num_heads, -1).transpose(0, 2, 1, 3) + + cos, sin = position_embeddings + queries, keys = apply_rotary_pos_emb(queries, keys, cos, sin, unsqueeze_dim=0) + + attn_weights = mx.matmul(queries, keys.transpose(0, 1, 3, 2)) * self.scale + + if mask is not None: + attn_weights = attn_weights + mask + + attn_weights = mx.softmax(attn_weights, axis=-1) + output = mx.matmul(attn_weights, values) + + output = output.transpose(0, 2, 1, 3).reshape(B, L, -1) + + return self.o_proj(output) + + +class MLP(nn.Module): + def __init__(self, config: VisionConfig): + super().__init__() + dim = config.hidden_size + hidden_dim = config.intermediate_size + self.gate_proj = nn.Linear(dim, hidden_dim, bias=False) + self.down_proj = nn.Linear(hidden_dim, dim, bias=False) + self.up_proj = nn.Linear(dim, hidden_dim, bias=False) + + def __call__(self, x) -> mx.array: + return self.down_proj(nn.silu(self.gate_proj(x)) * self.up_proj(x)) + + +class EncoderLayer(nn.Module): + def __init__(self, config: VisionConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.attention = Attention( + config.hidden_size, config.num_attention_heads, bias=True + ) + self.attention_norm = nn.RMSNorm(self.embed_dim, eps=config.rms_norm_eps) + self.feed_forward = MLP(config) + self.ffn_norm = nn.RMSNorm(self.embed_dim, eps=config.rms_norm_eps) + + def __call__( + self, + x: mx.array, + position_embeddings: mx.array, + mask: Optional[mx.array] = None, + ) -> mx.array: + y = self.attention_norm(x) + y = self.attention(y, y, y, position_embeddings, mask) + x = x + y + y = self.ffn_norm(x) + y = self.feed_forward(y) + return x + y + + +class Encoder(nn.Module): + def __init__(self, config: VisionConfig): + super().__init__() + self.layers = [EncoderLayer(config) for _ in range(config.num_hidden_layers)] + + +class PixtralRotaryEmbedding: + def __init__(self, config): + self.dim = config.head_dim + self.base = config.rope_theta + max_patches_per_side = config.image_size // config.patch_size + freqs = 1.0 / ( + self.base ** (mx.arange(0, self.dim, 2).astype(mx.float32) / self.dim) + ) + + h = mx.arange(max_patches_per_side) + w = mx.arange(max_patches_per_side) + + freqs_h = mx.outer(h, freqs[::2]).astype(mx.float32) + freqs_w = mx.outer(w, freqs[1::2]).astype(mx.float32) + inv_freq = mx.concatenate( + [ + mx.tile(freqs_h[:, None, :], (1, max_patches_per_side, 1)), + mx.tile(freqs_w[None, :, :], (max_patches_per_side, 1, 1)), + ], + axis=-1, + ).reshape(-1, self.dim // 2) + + self.inv_freq = mx.concatenate((inv_freq, inv_freq), axis=-1) + + def __call__(self, x, position_ids): + freqs = self.inv_freq[position_ids] + emb = freqs + cos = mx.cos(emb) + sin = mx.sin(emb) + return cos.astype(x.dtype), sin.astype(x.dtype) + + +class PixtralVisionModel(nn.Module): + def __init__(self, config: VisionConfig): + super().__init__() + self.config = config + self.patch_conv = nn.Conv2d( + in_channels=config.num_channels, + out_channels=config.hidden_size, + kernel_size=config.patch_size, + stride=config.patch_size, + bias=False, + ) + self.ln_pre = nn.RMSNorm(config.hidden_size) + self.transformer = Encoder(config) + self.patch_positional_embedding = PixtralRotaryEmbedding(config) + + def __call__( + self, + x: mx.array, + output_hidden_states: Optional[bool] = None, + ) -> mx.array: + B, H, W, C = x.shape + patch_embeds_list = [self.patch_conv(img[None, :]) for img in x] + + patch_embeds = mx.concatenate( + 
[p.reshape(B, -1, p.shape[-1]) for p in patch_embeds_list], axis=1 + ) + + patch_embeds = self.ln_pre(patch_embeds) + + position_ids = position_ids_in_meshgrid( + patch_embeds_list, + max_width=self.config.image_size // self.config.patch_size, + ) + + position_embedding = self.patch_positional_embedding(patch_embeds, position_ids) + + mask = generate_block_attention_mask( + [p.shape[2] * p.shape[1] for p in patch_embeds_list], patch_embeds + ) + + encoder_states = (patch_embeds,) if output_hidden_states else None + + for l in self.transformer.layers: + patch_embeds = l( + patch_embeds, mask=mask, position_embeddings=position_embedding + ) + if output_hidden_states: + encoder_states = encoder_states + (patch_embeds,) + + return patch_embeds, encoder_states + + +class VisionModel(nn.Module): + def __init__(self, config: VisionConfig): + super().__init__() + + self.model_type = config.model_type + if self.model_type not in ["clip_vision_model", "pixtral"]: + raise ValueError(f"Unsupported model type: {self.model_type}") + + self.vision_model = PixtralVisionModel(config) + + def __call__( + self, x: mx.array, output_hidden_states: Optional[bool] = None + ) -> mx.array: + return self.vision_model(x, output_hidden_states) + + def sanitize(self, weights): + sanitized_weights = {} + for k, v in weights.items(): + if "position_ids" in k: + # Remove unused position_ids + continue + elif "patch_conv.weight" in k: + # PyTorch conv2d weight tensors have shape: + # [out_channels, in_channels, kH, KW] + # MLX conv2d expects the weight be of shape: + # [out_channels, kH, KW, in_channels] + if check_array_shape(v): + sanitized_weights[k] = v + else: + sanitized_weights[k] = v.transpose(0, 2, 3, 1) + else: + sanitized_weights[k] = v + + return sanitized_weights diff --git a/mlx_vlm/models/qwen2_vl/vision.py b/mlx_vlm/models/qwen2_vl/vision.py index ddc43ec..f07f880 100644 --- a/mlx_vlm/models/qwen2_vl/vision.py +++ b/mlx_vlm/models/qwen2_vl/vision.py @@ -310,9 +310,9 @@ def rot_pos_emb(self, grid_thw): max_grid_size = mx.max(grid_thw[:, 1:]) rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) - rotary_pos_emb_np = rotary_pos_emb_full[pos_ids] + rotary_pos_emb_full = rotary_pos_emb_full[pos_ids] - return rotary_pos_emb_np.reshape(pos_ids.shape[0], -1) + return rotary_pos_emb_full.reshape(pos_ids.shape[0], -1) def __call__( self, diff --git a/mlx_vlm/prompt_utils.py b/mlx_vlm/prompt_utils.py index e8ce18d..b47a9a1 100644 --- a/mlx_vlm/prompt_utils.py +++ b/mlx_vlm/prompt_utils.py @@ -11,13 +11,13 @@ def get_message_json(model_name, prompt): Returns: dict: A dictionary representing the JSON message for the specified model. 
""" - if model_name.lower() in ["idefics2", "qwen2_vl"]: + if model_name.lower() in ["idefics2", "qwen2_vl", "llava"]: message = { "role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}], } - elif model_name.lower() in ["llava-qwen2", "llava", "llava_next", "bunny-llama"]: + elif model_name.lower() in ["llava-qwen2", "llava_next", "bunny-llama"]: message = {"role": "user", "content": f"\n{prompt}"} elif model_name.lower() == "phi3_v": message = {"role": "user", "content": f"<|image_1|>\n{prompt}"} @@ -25,6 +25,11 @@ def get_message_json(model_name, prompt): message = {"role": "user", "content": f"{prompt}"} elif model_name.lower() == "paligemma": message = prompt + elif model_name.lower() == "pixtral": + message = { + "role": "user", + "content": [{"type": "image"}, {"type": "text", "content": prompt}], + } else: raise ValueError(f"Unsupported model: {model_name}") diff --git a/mlx_vlm/utils.py b/mlx_vlm/utils.py index 7adb01a..9545311 100644 --- a/mlx_vlm/utils.py +++ b/mlx_vlm/utils.py @@ -711,21 +711,37 @@ def prepare_inputs(image_processor, processor, image, prompt, image_token_index) if isinstance(image, str): image = load_image(image) + image_grid_thw = None if image_processor is not None: text_chunks = [processor(chunk).input_ids for chunk in prompt.split("")] input_ids = mx.array([text_chunks[0] + [image_token_index] + text_chunks[1]]) - pixel_values = image_processor.preprocess(images=[image])[0] - pixel_values = mx.array(np.expand_dims(pixel_values, axis=0)) + pixel_values = mx.array(image_processor.preprocess(images=[image])[0]) + pixel_values = mx.array(mx.expand_dims(pixel_values, axis=0)) else: - inputs = processor( - text=[prompt], images=[image], padding=True, return_tensors="np" - ) - pixel_values = mx.array(inputs["pixel_values"]) + processor.tokenizer.pad_token = processor.tokenizer.eos_token + try: + inputs = processor( + text=[prompt], images=[image], padding=True, return_tensors="mlx" + ) + except Exception as e: + inputs = processor( + text=prompt, images=[image], padding=True, return_tensors="mlx" + ) # for phi3_v model + + if isinstance(inputs["pixel_values"], list): + pixel_values = mx.array(inputs["pixel_values"][0][0])[None, :] + elif isinstance(inputs["pixel_values"], np.ndarray): + pixel_values = mx.array(inputs["pixel_values"]) + else: + raise ValueError( + f"Invalid pixel_values type: {type(inputs['pixel_values'])}" + ) + input_ids = mx.array(inputs["input_ids"]) - mask = mx.array(inputs["attention_mask"]) - if "image_sizes" in inputs: - return input_ids, pixel_values, inputs["image_sizes"] + mask = inputs["attention_mask"] image_grid_thw = inputs.get("image_grid_thw", None) + if "image_sizes" in inputs: + return input_ids, pixel_values, inputs["image_sizes"], image_grid_thw return input_ids, pixel_values, mask, image_grid_thw