diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index dd164593134..82a2ebbf810 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -6703,7 +6703,7 @@ class MLLMSpecs: sm8650_token_rate: float sm8750_token_rate: float encoder_pte_size: float - text_embedding_pte_size: float + tok_embedding_pte_size: float decoder_pte_size: float @dataclass(frozen=True) @@ -6719,7 +6719,7 @@ def setUp(self): sm8650_token_rate=50, sm8750_token_rate=55, encoder_pte_size=110_000_000, # 110MB - text_embedding_pte_size=100_000_000, # 100MB + tok_embedding_pte_size=100_000_000, # 100MB decoder_pte_size=400_000_000, # 400MB image_path="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg", # New York Bay golden_image_feature="city", @@ -6729,7 +6729,7 @@ def setUp(self): sm8650_token_rate=11, sm8750_token_rate=13, encoder_pte_size=425_000_000, # 425MB - text_embedding_pte_size=300_000_000, # 300MB + tok_embedding_pte_size=300_000_000, # 300MB decoder_pte_size=550_000_000, # 550 MB image_path="http://images.cocodataset.org/val2017/000000039769.jpg", # Two cats lying on a blanket golden_image_feature="cats", @@ -6801,16 +6801,16 @@ def test_static_vlm(self): print(f"Answer: {model_out}") if not self.enable_x86_64: encoder_pte_size = msg["encoder_pte_size"] - text_embedding_pte_size = msg["text_embedding_pte_size"] + tok_embedding_pte_size = msg["tok_embedding_pte_size"] decoder_pte_size = msg["pte_size"] self.assertLessEqual(encoder_pte_size, vlm_specs.encoder_pte_size) self.assertLessEqual( - text_embedding_pte_size, vlm_specs.text_embedding_pte_size + tok_embedding_pte_size, vlm_specs.tok_embedding_pte_size ) self.assertLessEqual(decoder_pte_size, vlm_specs.decoder_pte_size) print(f"Encoder PTE Size: {encoder_pte_size} bytes") - print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes") - print(f"Decoder PTE Size: 
{decoder_pte_size} bytes") + print(f"Token Embedding PTE Size: {tok_embedding_pte_size} bytes") + print(f"Text Decoder PTE Size: {decoder_pte_size} bytes") attr_name = f"{self.model.lower()}_token_rate" if ( diff --git a/examples/qualcomm/oss_scripts/llama/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt index 78a8410f034..5c6939546a6 100644 --- a/examples/qualcomm/oss_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt @@ -94,12 +94,15 @@ list( ${CMAKE_CURRENT_LIST_DIR}/qnn_multimodal_runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_embedding_merger.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_embedding_merger.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/utils.h ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.h - ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.cpp - ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.h - ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.cpp - ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_runner.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_processor.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_processor.h ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.h ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.cpp diff --git a/examples/qualcomm/oss_scripts/llama/README.md 
b/examples/qualcomm/oss_scripts/llama/README.md index 0a4629ac132..fb926e9f613 100644 --- a/examples/qualcomm/oss_scripts/llama/README.md +++ b/examples/qualcomm/oss_scripts/llama/README.md @@ -308,6 +308,37 @@ If you have already compiled a VLM model, you can run inference with pre-generat python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} ``` +### Multi-Turn Conversation with VLM + +The framework supports multi-turn conversations with VLMs, allowing you to conduct dialogues that can involve multiple images. + +- **Multi-Turn Prompts**: To engage in a conversation, provide multiple prompts sequentially using the `--prompt` argument. Each string will be treated as a separate turn. +- **Multiple Images**: You can supply multiple images (from URLs or local paths) using the `--image_path` argument. +- **Flexible Image Placement**: Use the `<image>` token within your prompt to specify exactly where each image's embeddings should be placed. The images provided via `--image_path` will replace the `<image>` tokens in the order they appear. + +**Example**: + +In this example, the first turn compares two images, the second turn asks a follow-up question about the first image, and the third turn asks for a caption for a third image. + +```bash +# Define image URLs and prompts for a 3-turn conversation +IMAGE1_URL="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" +IMAGE2_URL="http://images.cocodataset.org/val2017/000000039769.jpg" +IMAGE3_URL="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg" + +PROMPT1="Compare these images above and list the differences."
+PROMPT2="Answer the question: What's the main object in first image?" +PROMPT3="Caption this image." + +# Execute the multi-turn conversation +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 2048 --prompt "$PROMPT1" "$PROMPT2" "$PROMPT3" --image_path "$IMAGE1_URL" "$IMAGE2_URL" "$IMAGE3_URL" +``` + +**How it works:** +- **Turn 1**: The prompt `"Compare these images above and list the differences."` uses the first two images (`$IMAGE1_URL`, `$IMAGE2_URL`). +- **Turn 2**: The prompt `"Answer the question: What's the main object in first image?"` is a text-only follow-up. The conversation context is maintained from the previous turn. +- **Turn 3**: The prompt `"Caption this image."` uses the third image (`$IMAGE3_URL`). + ### VLM Processing Details The VLM inference pipeline consists of: diff --git a/examples/qualcomm/oss_scripts/llama/dataset.py b/examples/qualcomm/oss_scripts/llama/dataset.py index 72f9e5d766a..2994baaafaa 100644 --- a/examples/qualcomm/oss_scripts/llama/dataset.py +++ b/examples/qualcomm/oss_scripts/llama/dataset.py @@ -5,17 +5,15 @@ # LICENSE file in the root directory of this source tree. 
import argparse -import warnings -from typing import Callable, List, Optional +from typing import Callable, Dict, List, Optional from executorch.examples.qualcomm.oss_scripts.llama import LLMModelConfig from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import ( AUDIO_ENCODER, TEXT_DECODER, - TEXT_EMBEDDING, TEXT_ENCODER, + TOK_EMBEDDING, VISION_ENCODER, - VISION_ENCODER_INPUT_FILENAME, ) from executorch.examples.qualcomm.oss_scripts.llama.encoder.encoder_config import ( @@ -23,7 +21,6 @@ VisionModalityConfig, ) from executorch.examples.qualcomm.oss_scripts.llama.tokenizer import TokenizerWrapper - from transformers import AutoProcessor from transformers.image_utils import load_image @@ -43,35 +40,30 @@ def __init__( self.artifact = control_args.artifact self.repo_id = config.repo_id - def _build_vision_dataset(self, config: VisionModalityConfig, prompt: str): + def _build_vision_dataset( + self, config: VisionModalityConfig, prompt: str, files_path: List[str] + ): """ This will processes images using the HuggingFace processor and saves the processed pixel values for runtime evaluation. Args: config (VisionModalityConfig): containing image URL and resize parameters - prompt (str): Text prompt to be processed alongside the image + prompt (str): Text prompt + files_path (List[str]): List of file paths for images. Each path can be either a URL or a local file path. Returns: tuple of pixel values tensors """ - # Load image from user-specified path (URL or local file) - # fall back to the default image URL if no image is provided. - image_path = self.control_args.image_path or config.img_url - if not self.control_args.image_path: - warnings.warn( - f"No image path/URL provided, using default image URL: {config.img_url}", - UserWarning, - stacklevel=1, - ) - image = load_image(image_path) + + images = [load_image(image_path) for image_path in files_path] # Process image with text prompt using HuggingFace processor # Some HF processors (e.g. 
InternVL3) need to pass text arg or it will cause error and process failed processor = AutoProcessor.from_pretrained(self.repo_id) pixel_values = processor( text=prompt, - images=[image], + images=images, return_tensors="pt", crop_to_patches=False, size={ @@ -80,19 +72,26 @@ def _build_vision_dataset(self, config: VisionModalityConfig, prompt: str): }, ).pixel_values - # save image file for runtime evaluation - pixel_values.detach().numpy().tofile( - f"{self.artifact}/{VISION_ENCODER_INPUT_FILENAME}.raw" + assert pixel_values.dim() in (4, 5), ( + f"Unsupported pixel_values dim={pixel_values.dim()}; " + f"expected 5D (1,N,C,H,W) or 4D (N,C,H,W)." ) - return (pixel_values,) + + # HTP Prepare failed when pixel_values has 5D dimension, so we squeeze the batch dimension here. + if pixel_values.dim() == 5: + pixel_values = pixel_values.squeeze(0) # (N, C, H, W) + + # save image file for runtime evaluation + return [(pixel_values[i][None, ...],) for i in range(len(pixel_values))] def _build_dataset_for_encoder( self, config: MultiModalityConfig, prompt: str, + files_path: List[str], ) -> Optional[tuple]: if issubclass(config, VisionModalityConfig): - return self._build_vision_dataset(config, prompt) + return self._build_vision_dataset(config, prompt, files_path) else: # Audio and text encoder dataset building are not yet implemented # TODO: Add support for AudioModalityConfig and TextModalityConfig @@ -106,22 +105,33 @@ def prepare_calibration_dataset( prompts: List[str], chat_template: Callable, ): - calibration_data = { - AUDIO_ENCODER: [], - TEXT_ENCODER: [], - VISION_ENCODER: [], - TEXT_EMBEDDING: [], - TEXT_DECODER: [], + # 1. Initialize data + # Shape convention: (num_samples, num_turns). + # Currently, user prompt calibration is one-shot per prompt (num_samples = 1). + calibration_data: Dict[str, List[List]] = { + # Encoders / embeddings: initialize an empty turn list for each prompt.
+ AUDIO_ENCODER: [[] for _ in range(len(prompts))], + TEXT_ENCODER: [[] for _ in range(len(prompts))], + VISION_ENCODER: [[] for _ in range(len(prompts))], + TOK_EMBEDDING: [[] for _ in range(len(prompts))], + # Decoder targets: one string per prompt. + TEXT_DECODER: ["" for _ in range(len(prompts))], } + # 2. Prepare messages for multi-turn conversation + messages = self.tokenizer_wrapper.prepare_messages(prompts) + + # 3. build dataset by modality is_multimodal = any( [ hasattr(self.config, AUDIO_ENCODER), hasattr(self.config, VISION_ENCODER), ] ) - for prompt in prompts: - # Apply chat template formatting if available (for instruction-tuned/reasoning models) + for turn_idx, message in enumerate(messages): + prompt = message["text"] + + # 3.1. Apply chat template formatting if available (for instruction-tuned/reasoning models) prompt = ( self.tokenizer_wrapper.apply_prompt_template( chat_template, prompt, self.control_args.system_prompt @@ -130,16 +140,19 @@ def prepare_calibration_dataset( else prompt ) - # Build calibration datasets for each available encoder modality + # 3.2 Build calibration datasets for each available encoder modality for modality in [AUDIO_ENCODER, TEXT_ENCODER, VISION_ENCODER]: - if hasattr(self.config, modality): - data = self._build_dataset_for_encoder( - getattr(self.config, modality), - prompt, - ) - calibration_data[modality].append(data) - - # Expand multimodal tokens in prompt for decoder + if not hasattr(self.config, modality) or not message["files_path"]: + continue + + data = self._build_dataset_for_encoder( + getattr(self.config, modality), + prompt, + message["files_path"], + ) + calibration_data[modality][turn_idx] = data + + # 3.3. 
Expand multimodal tokens in prompt for decoder prompt = ( self.tokenizer_wrapper.prepare_multimodal_prompt(prompt) if is_multimodal @@ -147,6 +160,6 @@ def prepare_calibration_dataset( ) # Add prompt to decoder calibration data - calibration_data[TEXT_DECODER].append(prompt) + calibration_data[TEXT_DECODER][turn_idx] = prompt return calibration_data diff --git a/examples/qualcomm/oss_scripts/llama/decoder_constants.py b/examples/qualcomm/oss_scripts/llama/decoder_constants.py index 74e3959a86e..7a4c3e20be6 100644 --- a/examples/qualcomm/oss_scripts/llama/decoder_constants.py +++ b/examples/qualcomm/oss_scripts/llama/decoder_constants.py @@ -12,20 +12,21 @@ TASKS_EVAL = "tasks_eval" SQNR_EVAL = "sqnr_eval" -# filenames for vision model -VISION_ENCODER_INPUT_FILENAME = "vision_encoder_input" - - # Component identifiers AUDIO_ENCODER = "audio_encoder" VISION_ENCODER = "vision_encoder" TEXT_ENCODER = "text_encoder" -TEXT_EMBEDDING = "text_embedding" +TOK_EMBEDDING = "tok_embedding" TEXT_DECODER = "text_decoder" ATTENTION_SINK_EVICTOR = "attention_sink_evictor" +# Mapping of input flags for the runner +MODALITY_INPUT_FLAG_MAP = { + VISION_ENCODER: "image_path", +} + # Text embedding graph names -TEXT_EMBEDDING_GRAPH_NAMES = [ +TOK_EMBEDDING_GRAPH_NAMES = [ "tok_embedding_kv_forward", "tok_embedding_prefill_forward", ] diff --git a/examples/qualcomm/oss_scripts/llama/decoder_runtime_evaluator.py b/examples/qualcomm/oss_scripts/llama/decoder_runtime_evaluator.py index ff173b1f753..55d7409a1e6 100644 --- a/examples/qualcomm/oss_scripts/llama/decoder_runtime_evaluator.py +++ b/examples/qualcomm/oss_scripts/llama/decoder_runtime_evaluator.py @@ -20,16 +20,20 @@ ATTENTION_SINK_EVICTOR, DECODER_MODEL_VERSION, EVAL_MODE, + MODALITY_INPUT_FLAG_MAP, TEXT_DECODER, - TEXT_EMBEDDING, + TOK_EMBEDDING, VISION_ENCODER, - VISION_ENCODER_INPUT_FILENAME, ) from executorch.examples.qualcomm.oss_scripts.llama.decoder_utils import ( INFERENCE_REGISTRY, retrieve_info_from_pte, ) -from 
executorch.examples.qualcomm.utils import make_output_dir, SimpleADB +from executorch.examples.qualcomm.utils import ( + generate_inputs, + make_output_dir, + SimpleADB, +) from pytorch_tokenizers.hf_tokenizer import HuggingFaceTokenizer from pytorch_tokenizers.llama2c import Llama2cTokenizer as SentencePieceTokenizer from pytorch_tokenizers.tiktoken import TiktokenTokenizer @@ -82,19 +86,20 @@ def __init__( args: argparse.Namespace, pte_paths: Dict, runtime_tokenizer_path: str, - is_modality: bool, + is_multimodal: bool, + modality_inputs=None, ): self.args = args self.pte_paths = pte_paths self.runtime_tokenizer_path = runtime_tokenizer_path self.qnn_sdk = os.getenv("QNN_SDK_ROOT") - self.is_modality = is_modality + self.is_multimodal = is_multimodal self.device_workspace = ( f"/data/local/tmp/{getpass.getuser()}/executorch/static_llm" ) self.runner = ( - "qnn_multimodal_runner" if self.is_modality else "qnn_llama_runner" + "qnn_multimodal_runner" if self.is_multimodal else "qnn_llama_runner" ) device_output_path = self._get_adb().output_folder if args.enable_x86_64: @@ -130,14 +135,13 @@ def _init_runner_base_cmd(self): f"--performance_output_path {self.device_performance_path}", ] ) - if self.is_modality: + if self.is_multimodal: base_cmd = " ".join( [ base_cmd, f"--decoder_path {self.pte_paths[TEXT_DECODER]}", f"--encoder_path {self.pte_paths[VISION_ENCODER]}", - f"--embedding_path {self.pte_paths[TEXT_EMBEDDING]}", - f"--image_path {args.artifact}/{VISION_ENCODER_INPUT_FILENAME}.raw", + f"--tok_embedding_path {self.pte_paths[TOK_EMBEDDING]}", ] ) else: @@ -165,14 +169,13 @@ def _init_runner_base_cmd(self): ] ) - if self.is_modality: + if self.is_multimodal: base_cmd = " ".join( [ base_cmd, f"--decoder_path {os.path.basename(self.pte_paths[TEXT_DECODER])}", f"--encoder_path {os.path.basename(self.pte_paths[VISION_ENCODER])}", - f"--embedding_path {os.path.basename(self.pte_paths[TEXT_EMBEDDING])}", - f"--image_path {VISION_ENCODER_INPUT_FILENAME}.raw", + 
f"--tok_embedding_path {os.path.basename(self.pte_paths[TOK_EMBEDDING])}", ] ) else: @@ -218,11 +221,47 @@ def run(self) -> Any: class DefaultEval(EvalBase): - def __init__(self, args, pte_paths, runtime_tokenizer_path, is_modality): - super().__init__(args, pte_paths, runtime_tokenizer_path, is_modality) + def __init__( + self, + args, + pte_paths, + runtime_tokenizer_path, + is_multimodal, + modality_inputs=None, + ): + super().__init__( + args, pte_paths, runtime_tokenizer_path, is_multimodal, modality_inputs + ) self.adb = self._get_adb() self.inference_speed = 0 + modality_input_cmd = [] + self.modality_input_files = [] + for modality, data in (modality_inputs or {}).items(): + if ( + not modality_inputs[modality] + or modality not in MODALITY_INPUT_FLAG_MAP + or modality == TEXT_DECODER + ): + continue + + # Specify the input list filename by its modality. + # This helps distinguish inputs coming from different encoders, + # especially in models like OMI where vision and audio encoders coexist.
+ input_list_filename = f"{modality}_input_list.txt" + input_list_file, input_files = generate_inputs( + self.args.artifact, + input_list_filename=input_list_filename, + inputs=data, + prefix_input_filename=modality, + ) + self.modality_input_files.append(input_list_file) + self.modality_input_files.extend(input_files) + modality_input_cmd.append( + f"--{MODALITY_INPUT_FLAG_MAP[modality]} {input_list_filename}" + ) + self.modality_input_cmd = " ".join(modality_input_cmd) + lookahead_args = " ".join( [ f"--window {args.window}", @@ -282,10 +321,9 @@ def run(self, prompt): ) extra_files = [self.runtime_tokenizer_path] - if self.is_modality: - extra_files = extra_files + [ - f"{self.args.artifact}/{VISION_ENCODER_INPUT_FILENAME}.raw" - ] + if self.is_multimodal: + extra_files.extend(self.modality_input_files) + runner_cmd = " ".join([runner_cmd, self.modality_input_cmd]) self.adb.push(inputs=[], files=extra_files) self.adb.execute(custom_runner_cmd=runner_cmd) self.adb.pull( @@ -324,9 +362,9 @@ def __init__( pte_paths, tokenizer, runtime_tokenizer_path, - is_modality, + is_multimodal, ): - super().__init__(args, pte_paths, runtime_tokenizer_path, is_modality) + super().__init__(args, pte_paths, runtime_tokenizer_path, is_multimodal) self.inference_speed = 0 self.source_model = source_model self.get_example_inputs = get_example_inputs @@ -613,12 +651,14 @@ def _model_call(self, inps): self.inference_speed = output_performance_holder[0] return output_logits_holder[0][:, :seq_len, :] - def __init__(self, args, pte_paths, tokenizer, runtime_tokenizer_path, is_modality): + def __init__( + self, args, pte_paths, tokenizer, runtime_tokenizer_path, is_multimodal + ): super().__init__( args=args, pte_paths=pte_paths, runtime_tokenizer_path=runtime_tokenizer_path, - is_modality=is_modality, + is_multimodal=is_multimodal, ) self.inference_speed = None self.tasks = args.tasks diff --git a/examples/qualcomm/oss_scripts/llama/decoder_utils.py 
b/examples/qualcomm/oss_scripts/llama/decoder_utils.py index 20a7ab99c8d..d3261e1bb68 100644 --- a/examples/qualcomm/oss_scripts/llama/decoder_utils.py +++ b/examples/qualcomm/oss_scripts/llama/decoder_utils.py @@ -41,7 +41,7 @@ def _modality_inputs_merger( input_ids: torch.LongTensor, inputs_embeds: torch.Tensor, image_hidden_states: torch.Tensor, - modality_placeholder_token_id, + image_token_id, ): """ This method aims at merging the token embeddings with the image hidden states into one single sequence of vectors that are fed to the transformer LM. @@ -53,7 +53,7 @@ def _modality_inputs_merger( - To fit the format of that sequence, `input_ids`, `input_embeds`, `attention_mask` are all 3 adapted to insert the image hidden states. """ - special_image_mask = input_ids == modality_placeholder_token_id + special_image_mask = input_ids == image_token_id special_image_mask = ( special_image_mask.unsqueeze(-1) .expand_as(inputs_embeds) @@ -501,7 +501,7 @@ def _generate( pos, module: torch.fx.GraphModule, tokenizer, - text_embedding, + tok_embedding, ar_len: int, max_seq_len: int, k_caches, @@ -527,7 +527,7 @@ def _generate( if inputs.input_ids is None: # Get text_embedding - embedding = text_embedding(tmp_token_list) + embedding = tok_embedding(tmp_token_list) # Prepare tmp_pos (padded with zeros). 
tmp_pos = torch.zeros((1, ar_len), dtype=torch.int32) @@ -606,7 +606,7 @@ def _generate( ) else: logits, new_k_caches, new_v_caches = module( - text_embedding( + tok_embedding( torch.tensor( input_tokens, dtype=inputs.input_ids_dtype ).unsqueeze(0) @@ -667,8 +667,8 @@ def kv_inference( # noqa: C901 module: torch.fx.GraphModule, tokenizer, tok_embedding=None, - hidden_states=None, - modality_placeholder_token_id=None, + hidden_states: Tuple = (), + image_token_id=None, ar_len=1, max_seq_len=512, use_i64_token=False, @@ -679,8 +679,7 @@ def kv_inference( # noqa: C901 is_multimodal = all( [ tok_embedding is not None, - hidden_states is not None, - modality_placeholder_token_id is not None, + image_token_id is not None, ] ) @@ -706,8 +705,9 @@ def kv_inference( # noqa: C901 # pyre-ignore prompt_token_list = prompt.flatten().tolist() - # 2. forward text embedding + # 2. process embedding if is_multimodal: + # 2.1 forward text embedding input_ids = torch.tensor([prompt_token_list]) input_ids = ( input_ids.to(torch.int64) if use_i64_token else input_ids.to(torch.int32) @@ -716,11 +716,12 @@ def kv_inference( # noqa: C901 padded_seq_len = max(input_ids_len, ar_len) padded_seq_len = ((padded_seq_len + ar_len - 1) // ar_len) * ar_len + embedding_dim = [p for _, p in tok_embedding.named_parameters()][0].shape[-1] text_embeddings = torch.zeros( ( 1, padded_seq_len, - hidden_states[0].shape[-1], + embedding_dim, ), dtype=torch.float32, ) @@ -745,12 +746,18 @@ def kv_inference( # noqa: C901 :, chunk_start_idx : chunk_start_idx + actual_chunk_len, : ] = embedding - multimodal_embedding = _modality_inputs_merger( - input_ids, - text_embeddings[:, :input_ids_len, :], # Only use actual prompt length - torch.cat(hidden_states, dim=1), - modality_placeholder_token_id, - ) + # 2.2 merge text and multimodality embedding + if hidden_states: + multimodal_embedding = _modality_inputs_merger( + input_ids, + text_embeddings[ + :, :input_ids_len, : + ], # Only use actual prompt length + 
torch.cat(hidden_states, dim=1), + image_token_id, + ) + else: + multimodal_embedding = text_embeddings[:, :input_ids_len, :] # record total input tokens and generated tokens total_token_list = prompt_token_list @@ -809,7 +816,7 @@ def prefill_inference( tokenizer, tok_embedding=None, hidden_states=None, - modality_placeholder_token_id=None, + image_token_id=None, max_seq_len=512, use_i64_token=False, collect_logits=False, @@ -818,7 +825,7 @@ def prefill_inference( [ tok_embedding is not None, hidden_states is not None, - modality_placeholder_token_id is not None, + image_token_id is not None, ] ) @@ -863,7 +870,7 @@ def prefill_inference( tmp_token_list, text_embeddings, torch.cat(hidden_states, dim=1), - modality_placeholder_token_id, + image_token_id, ) results = module(multimodal_embedding, *atten_mask) else: @@ -891,8 +898,8 @@ def graph_module_inference( max_seq_len=512, prompt=None, tok_embedding=None, - hidden_states=None, - modality_placeholder_token_id=None, + hidden_states: Tuple = (), + image_token_id=None, tasks=None, tasks_limit=1, num_fewshot=None, @@ -923,7 +930,7 @@ def graph_module_inference( tokenizer, tok_embedding=tok_embedding, hidden_states=hidden_states, - modality_placeholder_token_id=modality_placeholder_token_id, + image_token_id=image_token_id, max_seq_len=max_seq_len, use_i64_token=use_i64_token, collect_logits=False, @@ -941,7 +948,6 @@ def graph_module_inference( use_i64_token=use_i64_token, seq_mse_candidates=seq_mse_candidates, ) - # Evaluate the model with torch.no_grad(): eval_results = simple_evaluate( model=calibration_wrapper, diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index 2e7ae6d57d4..3200cdcd728 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -36,9 +36,9 @@ SQNR_EVAL, TASKS_EVAL, TEXT_DECODER, - TEXT_EMBEDDING, - TEXT_EMBEDDING_GRAPH_NAMES, TEXT_ENCODER, + TOK_EMBEDDING, + TOK_EMBEDDING_GRAPH_NAMES, 
VISION_ENCODER, ) from executorch.examples.qualcomm.oss_scripts.llama.decoder_runtime_evaluator import ( @@ -101,68 +101,80 @@ def compile( os.makedirs(args.artifact, exist_ok=True) multi_modal_mgr = MultiModalManager(control_args=args, config=decoder_model_config) - # perform ptq - multi_modal_mgr.quantize( - calibration_data=calibration_data, - tokenizer=tokenizer, - backend=get_backend_type(args.backend), - soc_model=args.model, + skip_quantize = {} + is_multimodal = any( + [ + hasattr(decoder_model_config, VISION_ENCODER), + hasattr(decoder_model_config, AUDIO_ENCODER), + ] ) - # Prepare dataset + # Prepare ptq option and compile spec compile_specs = { AUDIO_ENCODER: None, TEXT_ENCODER: None, VISION_ENCODER: None, - TEXT_EMBEDDING: None, + TOK_EMBEDDING: None, TEXT_DECODER: None, } - is_modality = False - # compile spec for multimodality encoder for modality in compile_specs: - if not hasattr(decoder_model_config, modality): - continue - - backend_options = generate_htp_compiler_spec( - use_fp16=False, - ) - encoder_compile_specs = generate_qnn_executorch_compiler_spec( - soc_model=get_soc_to_chipset_map()[args.model], - backend_options=backend_options, - ) - compile_specs[modality] = encoder_compile_specs - is_modality = True - - # text embedding compilation spec: default we use quantization version, since embedding is huge - if is_modality: - backend_options = generate_htp_compiler_spec( - use_fp16=False, - # x86 emulator does not support weight sharing - use_weight_sharing=not args.enable_x86_64, - ) - compile_specs[TEXT_EMBEDDING] = [ - generate_qnn_executorch_compiler_spec( + if is_multimodal and modality in {AUDIO_ENCODER, TEXT_ENCODER, VISION_ENCODER}: + # Encoder quantization is enabled only when the input contains a single image in each conversation. 
+ # In multi‑image scenarios, we skip encoder quantization by default to preserve modality feature quality, + # because the encoder is quite sensitive and quantization can make it harder for the model to distinguish + # between images within the same conversation. + to_skip = len(args.image_path) > 1 + backend_options = generate_htp_compiler_spec( + use_fp16=to_skip, + ) + encoder_compile_specs = generate_qnn_executorch_compiler_spec( soc_model=get_soc_to_chipset_map()[args.model], backend_options=backend_options, - shared_buffer=not args.enable_x86_64, # x86 emulator does not support shared buffer + # x86 emulator does not support shared buffer + shared_buffer=not args.enable_x86_64, ) - ] * len(TEXT_EMBEDDING_GRAPH_NAMES) - - # compile spec for text decoder - backend_options = generate_htp_compiler_spec( - use_fp16=False, - use_multi_contexts=decoder_model_config.num_sharding > 1, - # x86 emulator does not support weight sharing - use_weight_sharing=not args.enable_x86_64, - ) - compile_specs[TEXT_DECODER] = [ - generate_qnn_executorch_compiler_spec( - soc_model=get_soc_to_chipset_map()[args.model], - backend_options=backend_options, - shared_buffer=not args.enable_x86_64, - use_mha2sha=True, - ) - ] * len(DECODER_GRAPH_NAMES) + skip_quantize[modality] = to_skip + compile_specs[modality] = encoder_compile_specs + elif is_multimodal and modality == TOK_EMBEDDING: + backend_options = generate_htp_compiler_spec( + use_fp16=False, + # x86 emulator does not support weight sharing + use_weight_sharing=not args.enable_x86_64, + ) + compile_specs[modality] = [ + generate_qnn_executorch_compiler_spec( + soc_model=get_soc_to_chipset_map()[args.model], + backend_options=backend_options, + # x86 emulator does not support shared buffer + shared_buffer=not args.enable_x86_64, + ) + ] * len(TOK_EMBEDDING_GRAPH_NAMES) + elif modality == TEXT_DECODER: + # compile spec for text decoder + backend_options = generate_htp_compiler_spec( + use_fp16=False, + 
use_multi_contexts=decoder_model_config.num_sharding > 1, + # x86 emulator does not support weight sharing + use_weight_sharing=not args.enable_x86_64, + ) + compile_specs[modality] = [ + generate_qnn_executorch_compiler_spec( + soc_model=get_soc_to_chipset_map()[args.model], + backend_options=backend_options, + # x86 emulator does not support shared buffer + shared_buffer=not args.enable_x86_64, + use_mha2sha=True, + ) + ] * len(DECODER_GRAPH_NAMES) + + # perform ptq + multi_modal_mgr.quantize( + calibration_data=calibration_data, + skip_quantize=skip_quantize, + tokenizer=tokenizer, + backend=get_backend_type(args.backend), + soc_model=args.model, + ) # perform compilation multi_modal_mgr.compile(compile_specs=compile_specs, pte_filenames=pte_filenames) @@ -176,14 +188,18 @@ def inference( chat_template, text_decoder_pte_path: str, encoder_pte_path: str, - text_embedding_pte_path: str, + tok_embedding_pte_path: str, attention_sink_evictor_pte_path: str, + calibration_data, ): assert args.model_mode in EVAL_MODE, f"Unknown model_mode: {args.model_mode}." 
- is_modality = hasattr(decoder_model_config, VISION_ENCODER) or hasattr( - decoder_model_config, AUDIO_ENCODER + is_multimodal = any( + [ + hasattr(decoder_model_config, VISION_ENCODER), + hasattr(decoder_model_config, AUDIO_ENCODER), + ] ) pte_paths = {TEXT_DECODER: text_decoder_pte_path} eval_results = { @@ -200,17 +216,17 @@ def inference( } ) - if is_modality: + if is_multimodal: eval_results.update( { "encoder_pte_size": os.path.getsize(encoder_pte_path), - "text_embedding_pte_size": os.path.getsize(text_embedding_pte_path), + "tok_embedding_pte_size": os.path.getsize(tok_embedding_pte_path), } ) pte_paths.update( { VISION_ENCODER: encoder_pte_path, - TEXT_EMBEDDING: text_embedding_pte_path, + TOK_EMBEDDING: tok_embedding_pte_path, } ) @@ -219,7 +235,8 @@ def inference( args=args, pte_paths=pte_paths, runtime_tokenizer_path=runtime_tokenizer_path, - is_modality=is_modality, + is_multimodal=is_multimodal, + modality_inputs=calibration_data, ) output_prompt = prompt_evaluator.run(prompt=args.prompt) eval_results.update( @@ -232,7 +249,7 @@ def inference( logging.info(f"Device Inference Results[{idx}]:\n{output}") if SQNR_EVAL in args.eval_methods: - assert not is_modality, "Modality Model does not support SQNR_EVAL." + assert not is_multimodal, "Multimodal does not support SQNR_EVAL."
tokenizer_wrapper = TokenizerWrapper( args, decoder_model_config, @@ -255,7 +272,7 @@ def inference( pte_paths=pte_paths, tokenizer=tokenizer, runtime_tokenizer_path=runtime_tokenizer_path, - is_modality=is_modality, + is_multimodal=is_multimodal, ) sqnr, golden_logits, _ = sqnr_evaluator.run(prompt=prompt) logging.info(f"SQNR Eval Score between FP32 nn.Module and QNN: {sqnr}") @@ -280,7 +297,7 @@ def inference( pte_paths=pte_paths, tokenizer=tokenizer, runtime_tokenizer_path=runtime_tokenizer_path, - is_modality=is_modality, + is_multimodal=is_multimodal, ) qdq_sqnr, cpu_qdq_logits, _ = qdq_sqnr_evaluator.run(prompt=prompt) eval_results["qdq_sqnr"] = qdq_sqnr @@ -294,14 +311,14 @@ def inference( ) if TASKS_EVAL in args.eval_methods: - assert not is_modality, "Modality Model does not support TASKS_EVAL." + assert not is_multimodal, "Multimodal does not support TASKS_EVAL." # Generate the eval wrapper ppl_evaluator = TaskEval( args=args, pte_paths=pte_paths, tokenizer=tokenizer, runtime_tokenizer_path=runtime_tokenizer_path, - is_modality=is_modality, + is_multimodal=is_multimodal, ) ppl_eval_result = ppl_evaluator.run() eval_results["inference_speed"] = ppl_evaluator.inference_speed @@ -470,8 +487,9 @@ def _build_parser(): parser.add_argument( "--image_path", help="Path to the image file for multimodal language models (MLLM). If not specified, the default image from encoder/encoder_config.py will be used. 
The image should be preprocessed and saved in raw binary format.", - default=None, + default=[], type=str, + nargs="+", ) parser.add_argument( @@ -563,7 +581,7 @@ def export_llama(args) -> None: AUDIO_ENCODER: f"{AUDIO_ENCODER}_qnn", TEXT_ENCODER: f"{TEXT_ENCODER}_qnn", VISION_ENCODER: f"{VISION_ENCODER}_qnn", - TEXT_EMBEDDING: f"{TEXT_EMBEDDING}_qnn", + TOK_EMBEDDING: f"{TOK_EMBEDDING}_qnn", } # Prepare tokenizer tokenizer_wrapper = TokenizerWrapper( @@ -584,7 +602,7 @@ def export_llama(args) -> None: text_decoder_pte_path = f"{args.artifact}/{pte_filenames[TEXT_DECODER]}.pte" attention_sink_evictor_pte_path = f"{args.artifact}/{ATTENTION_SINK_EVICTOR}.pte" encoder_pte_path = f"{args.artifact}/{pte_filenames[VISION_ENCODER]}.pte" - text_embedding_pte_path = f"{args.artifact}/{pte_filenames[TEXT_EMBEDDING]}.pte" + tok_embedding_pte_path = f"{args.artifact}/{pte_filenames[TOK_EMBEDDING]}.pte" # TODO: Implement attention sink support for multimodal models (vision/audio). assert ( @@ -596,25 +614,14 @@ def export_llama(args) -> None: "Multimodal models currently do not support attention sink feature." ) - # TODO: Implement multi-turn conversation support for multimodal models (vision/audio). - assert ( - not ( - hasattr(decoder_model_config, VISION_ENCODER) - or hasattr(decoder_model_config, AUDIO_ENCODER) - ) - ) or (len(args.prompt) <= 1), ( - "Multimodal models currently do not support multi-turn. " - "Please set `--prompt` to 1 or switch to a unimodal (text-only) decoder." 
- ) - if args.pre_gen_pte: text_decoder_pte_path = f"{args.pre_gen_pte}/{pte_filenames[TEXT_DECODER]}.pte" attention_sink_evictor_pte_path = ( f"{args.pre_gen_pte}/{ATTENTION_SINK_EVICTOR}.pte" ) encoder_pte_path = f"{args.pre_gen_pte}/{pte_filenames[VISION_ENCODER]}.pte" - text_embedding_pte_path = ( - f"{args.pre_gen_pte}/{pte_filenames[TEXT_EMBEDDING]}.pte" + tok_embedding_pte_path = ( + f"{args.pre_gen_pte}/{pte_filenames[TOK_EMBEDDING]}.pte" ) if args.use_attention_sink: @@ -632,8 +639,9 @@ def export_llama(args) -> None: chat_template, text_decoder_pte_path, encoder_pte_path, - text_embedding_pte_path, + tok_embedding_pte_path, attention_sink_evictor_pte_path, + calibration_data, ) print(f"Finish the running pre_gen_pte from {args.pre_gen_pte}") return @@ -655,16 +663,36 @@ def export_llama(args) -> None: if args.compile_only: if args.ip and args.port != -1: - pte_path = f"{args.artifact}/{pte_filename}.pte" - pte_size = os.path.getsize(pte_path) - with Client((args.ip, args.port)) as conn: - conn.send( - json.dumps( - { - "pte_size": pte_size, - } - ) + # Prepare validation results for CI system + text_decoder_pte_path = f"{args.artifact}/{pte_filename}.pte" + text_decoder_pte_path = os.path.getsize(text_decoder_pte_path) + validation_results = { + "pte_size": text_decoder_pte_path, + } + if any( + [ + hasattr(decoder_model_config, VISION_ENCODER), + hasattr(decoder_model_config, AUDIO_ENCODER), + ] + ): + encoder_pte_path = ( + f"{args.artifact}/{pte_filenames[VISION_ENCODER]}.pte" + ) + tok_embedding_pte_path = ( + f"{args.artifact}/{pte_filenames[TOK_EMBEDDING]}.pte" ) + validation_results.update( + { + "encoder_pte_size": os.path.getsize(encoder_pte_path), + "tok_embedding_pte_size": os.path.getsize( + tok_embedding_pte_path + ), + } + ) + + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps(validation_results)) + print(f"Finish compile_only and save to {args.artifact}") return @@ -676,8 +704,9 @@ def export_llama(args) -> None: 
chat_template, text_decoder_pte_path, encoder_pte_path, - text_embedding_pte_path, + tok_embedding_pte_path, attention_sink_evictor_pte_path, + calibration_data, ) diff --git a/examples/qualcomm/oss_scripts/llama/model/embedding.py b/examples/qualcomm/oss_scripts/llama/model/embedding.py index 4956012baf0..8d1c92913c1 100644 --- a/examples/qualcomm/oss_scripts/llama/model/embedding.py +++ b/examples/qualcomm/oss_scripts/llama/model/embedding.py @@ -8,7 +8,7 @@ from torch import nn -class TextEmbedding(nn.Module): +class TokenEmbedding(nn.Module): def __init__( self, input_embedding_module, diff --git a/examples/qualcomm/oss_scripts/llama/model/static_llama.py b/examples/qualcomm/oss_scripts/llama/model/static_llama.py index e90e69e4114..3ccee2d7749 100755 --- a/examples/qualcomm/oss_scripts/llama/model/static_llama.py +++ b/examples/qualcomm/oss_scripts/llama/model/static_llama.py @@ -842,18 +842,9 @@ def __init__( use_i64_token=use_i64_token, ) - # Initialize modality placeholder token ID - # Default value of -1 indicates embeddings come from text encoder - # Note: Text encoder modality is not currently supported - self.modality_placeholder_token_id = kwargs.get( - "modality_placeholder_token_id", -1 - ) - - if self.modality_placeholder_token_id == -1: - raise NotImplementedError( - "Text encoder modality (modality_placeholder_token_id=-1) is not currently supported. " - "Please provide a valid modality_placeholder_token_id in kwargs." - ) + # Set the image token ID from keyword arguments. It defaults to None if not provided. + # If an ID is provided, it will be stored in the model's metadata. 
+        self.image_token_id = kwargs.get("image_token_id", None)
 
     def forward(
         self,
@@ -943,7 +934,8 @@ def get_example_inputs(self):
 
     def get_metadata(self):
         meta_data = super().get_metadata()
-        meta_data["modality_placeholder_token_id"] = self.modality_placeholder_token_id
+        if self.image_token_id is not None:
+            meta_data["image_token_id"] = self.image_token_id
         return meta_data
 
 
diff --git a/examples/qualcomm/oss_scripts/llama/model/vision_encoder.py b/examples/qualcomm/oss_scripts/llama/model/vision_encoder.py
index 431e28a20d6..5eb107d67b7 100644
--- a/examples/qualcomm/oss_scripts/llama/model/vision_encoder.py
+++ b/examples/qualcomm/oss_scripts/llama/model/vision_encoder.py
@@ -4,8 +4,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Tuple
-
 import torch
 
 from executorch.examples.qualcomm.utils import replace_module_with_custom_class
@@ -177,11 +175,6 @@ def __init__(
             extra_custom_kwargs={"config": config.vision_config},
         )
 
-    def preprocess(self, pixel_values: Tuple[torch.FloatTensor]) -> Tuple[torch.Tensor]:
-        # HTP Prepare failed when pixel_values has 5D dimension, so we squeeze the batch dimension here.
- pixel_values = pixel_values[0] - return (pixel_values.squeeze(0),) - def get_example_inputs(self): # pixel values - use config dimensions instead of hardcoded values return ( @@ -257,9 +250,6 @@ def __init__( self.img_resized_h = img_resized_h self.img_resized_w = img_resized_w - def preprocess(self, pixel_values: Tuple[torch.FloatTensor]) -> Tuple[torch.Tensor]: - return pixel_values - def get_example_inputs(self): # pixel values - use config dimensions instead of hardcoded values return ( diff --git a/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp index 7cadc0bb0dd..f6379d9243d 100644 --- a/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp @@ -15,21 +15,32 @@ */ #include +#include #include #include +#include #include #include +#include #include #include #include #include +using executorch::aten::ScalarType; +using executorch::extension::llm::Image; +using ::executorch::extension::llm::make_image_input; +using ::executorch::extension::llm::make_text_input; +using executorch::extension::llm::MultimodalInput; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; + // Model paths DEFINE_string( - embedding_path, - "embedding.pte", - "Path to embedding model serialized in flatbuffer format."); + tok_embedding_path, + "tok_embedding.pte", + "Path to tok_embedding model serialized in flatbuffer format."); DEFINE_string( encoder_path, "encoder.pte", @@ -119,128 +130,11 @@ std::vector CollectPrompts(int argc, char** argv) { return prompts; } -/** - * Special tokens structure for different models - */ -struct SpecialTokens { - std::string image_token; - std::string global_img; - std::string fake_wrap_start; - std::string fake_wrap_end; -}; - -/** - * Get special tokens based on decoder model version - */ -SpecialTokens get_special_tokens( - example::MultimodalDecoderModelVersion 
decoder_model_version) { - SpecialTokens tokens; - - switch (decoder_model_version) { - case example::MultimodalDecoderModelVersion:: - kSmolvlm: // smolvlm_500m_instruct - tokens.image_token = ""; - tokens.global_img = ""; - tokens.fake_wrap_start = ""; - tokens.fake_wrap_end = ""; - break; - case example::MultimodalDecoderModelVersion::kInternvl3: // internvl3_1b - tokens.image_token = ""; - tokens.global_img = ""; - tokens.fake_wrap_start = ""; - tokens.fake_wrap_end = ""; - break; - default: - break; - } - - return tokens; -} - -/** - * Prepare multimodal token IDs by expanding image tokens - * This implements the logic from prepare_multimodal_token_ids in Python - */ -std::string prepare_multimodal_prompt( - const std::string& prompt, - int image_seq_len, - const SpecialTokens& specials) { - // Create image prompt with repeated image tokens - std::string image_prompt = specials.fake_wrap_start; - image_prompt += specials.global_img; - for (int i = 0; i < image_seq_len; ++i) { - image_prompt += specials.image_token; - } - image_prompt += specials.fake_wrap_end; - - // Replace single image token with expanded version - size_t pos = 0; - std::string expanded = prompt; - while ((pos = expanded.find(specials.image_token, pos)) != - std::string::npos) { - expanded.replace(pos, specials.image_token.size(), image_prompt); - pos += image_prompt.size(); - } - ET_LOG(Info, "Prompt after expanding image token: %s", expanded.c_str()); - - return expanded; -} - -/** - * Format prompt based on model version with multimodal token expansion - */ -std::string get_formatted_prompt( - const std::string& prompt, - const std::string& system_prompt, - example::MultimodalDecoderModelVersion decoder_model_version, - int32_t img_seq_len = 0) { - std::string formatted_prompt; - - // Get special tokens for this model - SpecialTokens specials = get_special_tokens(decoder_model_version); - - switch (decoder_model_version) { - case example::MultimodalDecoderModelVersion::kSmolvlm: - if 
(!system_prompt.empty()) { - formatted_prompt.append( - "<|start_header_id|>system<|end_header_id|>\n\n"); - formatted_prompt.append(system_prompt); - formatted_prompt.append("<|eot_id|>"); - } - formatted_prompt.append("<|im_start|>User:"); - formatted_prompt.append(specials.image_token); - formatted_prompt.append(prompt); - formatted_prompt.append("\nAssistant:"); - break; - case example::MultimodalDecoderModelVersion::kInternvl3: - if (!system_prompt.empty()) { - formatted_prompt.append("<|im_start|>system<|im_end|>\n\n"); - formatted_prompt.append(system_prompt); - formatted_prompt.append("<|im_end|>"); - } - formatted_prompt.append("<|im_start|>user:\n"); - formatted_prompt.append(specials.image_token); - formatted_prompt.append("\n"); - formatted_prompt.append(prompt); - formatted_prompt.append("<|im_end|>assistant\n"); - break; - default: - ET_CHECK_MSG(false, "unsupported VLM version"); - break; - } - - // Expand image tokens - formatted_prompt = - prepare_multimodal_prompt(formatted_prompt, img_seq_len, specials); - - return formatted_prompt; -} - template void start_multimodal_runner( - std::unique_ptr encoder_runner, - std::unique_ptr module, - std::unique_ptr embedding, + std::unique_ptr encoder, + std::unique_ptr tok_embedding, + std::unique_ptr text_decoder, std::vector& prompts) { ET_LOG(Info, "Starting multimodal runner"); @@ -248,32 +142,12 @@ void start_multimodal_runner( gflags::GetCommandLineFlagInfoOrDie("tokenized_prompt").is_default ? 
false : true; - // Load image, run encoder forward pass, and set image hidden states if - // provided - bool has_image = !FLAGS_image_path.empty(); - - // Load encoder - if (encoder_runner->load() != executorch::runtime::Error::Ok) { - ET_LOG(Error, "Failed to load encoder"); - return; - } - - // Encode image from file - auto encode_result = - encoder_runner->encode_from_file(FLAGS_image_path.c_str()); - if (!encode_result.ok()) { - ET_LOG(Error, "Failed to encode image"); - return; - } - - auto image_hidden_states = encode_result.get(); - // Create multimodal runner - example::MultimodalRunner runner( - std::move(module), - std::move(embedding), + example::QNNMultimodalRunner runner( + std::move(encoder), + std::move(tok_embedding), + std::move(text_decoder), FLAGS_decoder_model_version.c_str(), - FLAGS_decoder_path.c_str(), FLAGS_tokenizer_path.c_str(), FLAGS_dump_logits_path.c_str(), FLAGS_performance_output_path.c_str(), @@ -282,23 +156,25 @@ void start_multimodal_runner( FLAGS_shared_buffer, FLAGS_ngram, FLAGS_window, - FLAGS_gcap, - std::make_unique(image_hidden_states)); + FLAGS_gcap); + + auto model_version = runner.get_model_version().get(); - auto decoder_model_version = runner.get_decoder_model_version(); + if (modality_of(model_version) == example::Modality::kVision) { + ET_CHECK_MSG( + !FLAGS_image_path.empty(), + "For VLM models, please specify image path."); + } // Prepare output buffer (similar to qnn_llama_runner.cpp) std::vector buf; buf.reserve(5 * FLAGS_seq_len); // assume each token is around 5 char std::ofstream fout(FLAGS_output_path.c_str()); - auto callback = [&](const std::string& piece) { for (const char c : piece) { buf.push_back(c); } }; - - // Configure generation executorch::extension::llm::GenerationConfig config{ true, false, @@ -309,25 +185,45 @@ void start_multimodal_runner( 0, 0}; - // Get image sequence length from encoder - int32_t img_seq_len = encoder_runner->get_image_seq_len(); - if (use_tokenized_prompt) { - 
runner.generate_from_prompt_or_file( - FLAGS_tokenizer_path.c_str(), use_tokenized_prompt, config, callback); - } else { - // generate tokens & store inference output - for (int i = 0; i < FLAGS_num_iters; i++) { - for (size_t j = 0; j < prompts.size(); ++j) { - const auto& prompt = prompts[j]; - std::string formatted_prompt; - formatted_prompt = get_formatted_prompt( - prompt, - FLAGS_system_prompt, - decoder_model_version.get(), - img_seq_len); - runner.generate_from_prompt_or_file( - formatted_prompt.c_str(), use_tokenized_prompt, config, callback); + // 1. [Multi-modality] Get raw files from input_list.txt + std::vector raw_files = + example::load_raw_files(FLAGS_image_path.c_str()); + + // 2. Prepare messages for multi-turn simulation + std::vector messages = prepare_messages(prompts, raw_files); + + // 3. Get expected input size/dtype for encoder + Result method_meta = runner.get_encoder_method_meta(); + auto input_meta_result = method_meta->input_tensor_meta(0); + std::vector expected_size( + input_meta_result->sizes().begin(), input_meta_result->sizes().end()); + ScalarType expected_dtype = input_meta_result->scalar_type(); + + // TODO: add use_tokenized_prompt for enable running static Llama models + // inside LlamaDemo Android + // 4. 
generate tokens & store inference output + for (int i = 0; i < FLAGS_num_iters; i++) { + for (size_t j = 0; j < messages.size(); ++j) { + const auto& prompt = messages[j].text; + const std::vector files_path = messages[j].files_path; + + // 4.1 prepare image input + std::vector inputs; + if (modality_of(model_version) == example::Modality::kVision) { + for (const std::string& file_path : files_path) { + Image image; + example::load_image(file_path, image, expected_size, expected_dtype); + inputs.emplace_back(make_image_input(image)); + } } + + // 4.2 prepare prompt input + std::string formatted_prompt = + apply_chat_template(prompt, FLAGS_system_prompt, model_version); + inputs.emplace_back(make_text_input(formatted_prompt)); + + // 4.3 generate text + runner.generate(inputs, config, callback); } } fout.write(buf.data(), buf.size()); @@ -346,22 +242,24 @@ int main(int argc, char** argv) { ET_CHECK_MSG( false, "Only TokenGenerator(kv) mode is supported to dump all logits."); } - ET_LOG(Info, "Embedding: %s", FLAGS_embedding_path.c_str()); - ET_LOG(Info, "Encoder: %s", FLAGS_encoder_path.c_str()); - ET_LOG(Info, "Decoder: %s", FLAGS_decoder_path.c_str()); - // Create encoder runner - std::unique_ptr encoder_runner = - std::make_unique(FLAGS_encoder_path.c_str()); + // Load encoder + ET_LOG(Info, "Load Encoder: %s", FLAGS_encoder_path.c_str()); + std::unique_ptr encoder = + std::make_unique( + FLAGS_encoder_path.c_str(), + executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors); - // load embedding - std::unique_ptr embedding = + // Load token embedding + ET_LOG(Info, "Load Token Embedding: %s", FLAGS_tok_embedding_path.c_str()); + std::unique_ptr tok_embedding = std::make_unique( - FLAGS_embedding_path.c_str(), + FLAGS_tok_embedding_path.c_str(), executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors); - // load decoder - std::unique_ptr module = + // Load text decoder + ET_LOG(Info, "Load Text Decoder: %s", FLAGS_decoder_path.c_str()); + 
std::unique_ptr text_decoder = std::make_unique( FLAGS_decoder_path.c_str(), executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors); @@ -369,22 +267,25 @@ int main(int argc, char** argv) { // Using 8bit as default since this meta is introduced with 16bit kv io // support and older models only have 8bit kv io. example::KvBitWidth kv_bitwidth = example::KvBitWidth::kWidth8; - if (module->method_names()->count("get_kv_io_bit_width") > 0) { + if (text_decoder->method_names()->count("get_kv_io_bit_width") > 0) { kv_bitwidth = static_cast( - module->get("get_kv_io_bit_width").get().toScalar().to()); + text_decoder->get("get_kv_io_bit_width") + .get() + .toScalar() + .to()); } // Start runner with appropriate KV bitwidth if (kv_bitwidth == example::KvBitWidth::kWidth8) { start_multimodal_runner( - std::move(encoder_runner), - std::move(module), - std::move(embedding), + std::move(encoder), + std::move(tok_embedding), + std::move(text_decoder), prompts); } else if (kv_bitwidth == example::KvBitWidth::kWidth16) { start_multimodal_runner( - std::move(encoder_runner), - std::move(module), - std::move(embedding), + std::move(encoder), + std::move(tok_embedding), + std::move(text_decoder), prompts); } else { ET_CHECK_MSG( diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/chat_template.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/chat_template.h new file mode 100644 index 00000000000..1b8fb57408d --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/chat_template.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+#pragma once
+
+#include <string>
+#include <variant>
+#include <vector>
+#include <executorch/runtime/platform/log.h>
+#include <executorch/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/utils.h>
+
+/**
+ * Message structure for multi-turn conversations
+ */
+struct Message {
+  size_t id;
+  std::string text;
+  std::vector<std::string> files_path;
+};
+
+/**
+ * Prepare messages for multi-turn simulation
+ * This function validates that the number of image tokens matches the number of
+ * images, and distributes images across messages based on image token
+ * positions.
+ */
+inline std::vector<Message> prepare_messages(
+    std::vector<std::string>& prompts,
+    const std::vector<std::string>& image_paths) {
+  size_t num_images = image_paths.size();
+  size_t total_image_tokens = 0;
+
+  // Count total image tokens across all prompts
+  for (const auto& prompt : prompts) {
+    size_t pos = 0;
+    while ((pos = prompt.find(IMG_TOKEN, pos)) != std::string::npos) {
+      total_image_tokens++;
+      pos += IMG_TOKEN.length();
+    }
+  }
+
+  // If no image tokens but images provided, prepend image tokens to prompt in
+  // first turn and check the number of image tokens given by user are equal to
+  // image num.
+  if (total_image_tokens == 0 && num_images > 0) {
+    std::string prefix;
+    for (size_t i = 0; i < num_images; ++i) {
+      prefix += IMG_TOKEN;
+    }
+    prompts[0] = prefix + prompts[0];
+    // Account for the tokens just inserted so the consistency check below
+    // passes on this intended fallback path instead of always aborting.
+    total_image_tokens = num_images;
+  }
+  ET_CHECK_MSG(
+      total_image_tokens == num_images,
+      "Number of %s tokens (%zu) does not match number of images (%zu).
Please check your prompts and image paths.", + IMG_TOKEN.c_str(), + total_image_tokens, + num_images); + + // Build messages and dispatch images + std::vector messages; + size_t img_idx = 0; + ET_LOG(Info, "Simulation multi-turn:"); + + for (size_t i = 0; i < prompts.size(); ++i) { + Message msg; + msg.id = i; + msg.text = prompts[i]; + + // Count image tokens in this prompt + size_t count = 0; + size_t pos = 0; + while ((pos = msg.text.find(IMG_TOKEN, pos)) != std::string::npos) { + count++; + pos += IMG_TOKEN.length(); + } + + // Assign corresponding images to this message + if (count > 0) { + for (size_t k = 0; k < count && img_idx < image_paths.size(); ++k) { + msg.files_path.emplace_back(image_paths[img_idx++]); + } + } + + // Log message info + std::string paths_str = "["; + for (size_t i = 0; i < msg.files_path.size(); ++i) { + paths_str += "'"; + paths_str += msg.files_path[i]; + paths_str += "'"; + if (i < msg.files_path.size() - 1) + paths_str += ", "; + } + paths_str += "]"; + ET_LOG( + Info, + "Turn-%zu: {id: %zu, text: \"%s\", files_path: %s}", + i, + i, + msg.text.c_str(), + paths_str.c_str()); + + messages.emplace_back(std::move(msg)); + } + + return messages; +} + +inline std::string apply_chat_template( + const std::string& prompt, + const std::string& system_prompt, + example::ModelVersion model_version) { + return std::visit( + [&](const auto& model) { + return apply_chat_template(system_prompt, prompt, model); + }, + model_version); +} diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.cpp index 91789f07a90..9304d2e4688 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.cpp @@ -7,6 +7,7 @@ */ #include +#include #include using executorch::aten::Tensor; @@ -18,12 +19,8 @@ using executorch::runtime::Result; namespace example { 
-EncoderRunner::EncoderRunner(const std::string& model_path) - : image_seq_len_(0) { - module_ = std::make_unique( - model_path, Module::LoadMode::MmapUseMlockIgnoreErrors); - ET_LOG(Info, "Creating encoder module: model_path=%s", model_path.c_str()); -} +EncoderRunner::EncoderRunner(executorch::extension::Module* module) + : module_(module) {} bool EncoderRunner::is_method_loaded() const { return module_->is_method_loaded(kEncoderForwardName); @@ -47,17 +44,9 @@ Error EncoderRunner::load() { return method_meta.error(); } - // vision embedding output shape: [1, seq_len, dim] - image_seq_len_ = method_meta->output_tensor_meta(0)->sizes()[1]; - ET_LOG(Info, "Encoder loaded successfully, image_seq_len=%d", image_seq_len_); - return Error::Ok; } -int32_t EncoderRunner::get_image_seq_len() const { - return image_seq_len_; -} - Result EncoderRunner::encode(TensorPtr& image_tensor) { ET_CHECK_MSG(is_method_loaded(), "Encoder method not loaded"); @@ -77,54 +66,4 @@ Result EncoderRunner::encode(TensorPtr& image_tensor) { return image_hidden_states; } -Result EncoderRunner::encode_from_file( - const std::string& image_file_path) { - ET_CHECK_MSG(is_method_loaded(), "Encoder method not loaded"); - - // Get input tensor metadata - Result method_meta = module_->method_meta(kEncoderForwardName); - auto sizes_span = method_meta->input_tensor_meta(0)->sizes(); - - // Calculate total number of elements - int64_t num_elem = 1; - for (const auto& size : sizes_span) { - num_elem *= size; - } - - // Read image data from file - ET_LOG( - Info, - "Reading image from file: %s, num_elements=%ld", - image_file_path.c_str(), - num_elem); - std::ifstream file(image_file_path, std::ios::binary | std::ios::ate); - ET_CHECK_MSG( - file.is_open(), "Failed to open image file: %s", image_file_path.c_str()); - - // To prevent users from passing images that have not been - // resized to match the encoder input size. 
- std::streamsize file_size = file.tellg(); - std::streamsize expected_size = num_elem * sizeof(float); - ET_CHECK_MSG( - file_size == expected_size, - "Image file size mismatch: expected %ld bytes but got %ld bytes (file: %s)", - expected_size, - file_size, - image_file_path.c_str()); - - file.seekg(0, std::ios::beg); - std::vector buffer(num_elem); - file.read(reinterpret_cast(buffer.data()), expected_size); - file.close(); - - // Create tensor from buffer - TensorPtr tensor = executorch::extension::from_blob( - buffer.data(), - std::vector(sizes_span.begin(), sizes_span.end()), - executorch::aten::ScalarType::Float); - - // Encode the tensor - return encode(tensor); -} - } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.h index 0e1becc05b6..e8c8a948877 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -28,7 +29,7 @@ class EncoderRunner { * @brief Constructor for EncoderRunner * @param model_path Path to the encoder model PTE file */ - explicit EncoderRunner(const std::string& model_path); + explicit EncoderRunner(executorch::extension::Module* module); /** * @brief Check if the encoder method is loaded @@ -56,18 +57,11 @@ class EncoderRunner { executorch::runtime::Result encode( executorch::extension::TensorPtr& image_tensor); - /** - * @brief Encode image from raw file - * @param image_file_path Path to raw image file - * @return Result containing the image hidden states tensor - */ - executorch::runtime::Result encode_from_file( - const std::string& image_file_path); - private: - std::unique_ptr module_; + executorch::extension::Module* module_; inline static const std::string kEncoderForwardName = "forward"; - int32_t image_seq_len_; + 
std::list> output_buffers_; + std::list output_tensors_; }; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_embedding_merger.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_embedding_merger.cpp new file mode 100644 index 00000000000..d45ce10a9af --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_embedding_merger.cpp @@ -0,0 +1,203 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace example { + +MultimodalEmbeddingMerger::MultimodalEmbeddingMerger(int32_t embedding_dim) + : embedding_dim_(embedding_dim) { + ET_CHECK_MSG(embedding_dim_ > 0, "Embedding dimension must be positive"); +} + +void MultimodalEmbeddingMerger::reset() { + text_embedding_buffers_.clear(); + text_embedding_token_counts_.clear(); + image_embedding_buffers_.clear(); + image_embedding_token_counts_.clear(); + total_tokens_ = 0; +} + +void MultimodalEmbeddingMerger::add_embeddings( + const executorch::aten::Tensor& embeddings, + const float* data, + EmbeddingType type) { + // shape: [1, num_tokens, embedding_dim] + ET_CHECK_MSG(embeddings.dim() == 3, "Embeddings must be a 3D tensor"); + + size_t batch_size = embeddings.sizes()[0]; + size_t num_tokens = embeddings.sizes()[1]; + size_t dim = embeddings.sizes()[2]; + + ET_CHECK_MSG(batch_size == 1, "Batch size must be 1"); + ET_CHECK_MSG( + dim == embedding_dim_, + "Embedding dimension mismatch: expected %zu, got %zu", + embedding_dim_, + dim); + + // Copy embedding data to prevent it from being overwritten + size_t num_elements = num_tokens * dim; + std::vector buffer(data, data + num_elements); + + std::string type_str = (type == EmbeddingType::kText) ? 
"text" : "image"; + if (type == EmbeddingType::kText) { + text_embedding_buffers_.emplace_back(std::move(buffer)); + text_embedding_token_counts_.push_back(num_tokens); + } else { + image_embedding_buffers_.emplace_back(std::move(buffer)); + image_embedding_token_counts_.push_back(num_tokens); + } + + ET_LOG( + Info, + "Added %s embeddings: num_tokens=%zu", + type_str.c_str(), + num_tokens); +} + +void MultimodalEmbeddingMerger::add_text_embeddings( + const TensorStruct& text_embeddings) { + ET_CHECK_MSG( + text_embeddings.tensor != nullptr, + "Text embeddings tensor cannot be null"); + ET_CHECK_MSG( + text_embeddings.data != nullptr, "Text embeddings data cannot be null"); + + executorch::aten::Tensor tensor_wrapper(text_embeddings.tensor.get()); + + add_embeddings(tensor_wrapper, text_embeddings.data, EmbeddingType::kText); +} + +void MultimodalEmbeddingMerger::add_image_embeddings( + const executorch::aten::Tensor& image_embeddings) { + add_embeddings( + image_embeddings, + image_embeddings.const_data_ptr(), + EmbeddingType::kImage); +} + +TensorStruct MultimodalEmbeddingMerger::merge( + const std::vector& input_ids, + uint64_t image_token_id) { + ET_CHECK_MSG(!input_ids.empty(), "input_ids cannot be empty"); + ET_CHECK_MSG( + !text_embedding_buffers_.empty(), + "No text embeddings added. 
Call add_text_embeddings() first."); + + // Final merged embeddings + std::vector merged_buffer; + std::vector sizes; + TensorStruct merged_embeddings; + + size_t num_placeholder_tokens = 0; + if (image_token_id != 0) { + for (uint64_t token_id : input_ids) { + if (token_id == image_token_id) { + num_placeholder_tokens++; + } + } + } + + ET_CHECK_MSG( + num_placeholder_tokens == image_embedding_buffers_.size(), + "Number of placeholder tokens (%zu) must match number of image embeddings (%zu)", + num_placeholder_tokens, + image_embedding_buffers_.size()); + + // Calculate total tokens: sum of all text tokens + all image tokens + for (int64_t count : text_embedding_token_counts_) { + total_tokens_ += count; + } + for (int64_t count : image_embedding_token_counts_) { + total_tokens_ += count; + } + total_tokens_ = total_tokens_ - num_placeholder_tokens; + + size_t total_elements = total_tokens_ * embedding_dim_; + merged_buffer.resize(total_elements); + + // Merge embeddings based on input_ids + size_t text_emb_idx = 0; // Which text embedding chunk in current turn + size_t text_token_idx = 0; // Token index within current text embedding chunk + size_t image_emb_idx = 0; // Which image embedding chunk in current turn + size_t output_offset = 0; // Output buffer offset + + for (int i = 0; i < input_ids.size(); i++) { + uint64_t token_id = input_ids[i]; + + if (image_token_id != 0 && token_id == image_token_id) { + // Insert entire image embedding + ET_CHECK_MSG( + image_emb_idx < image_embedding_buffers_.size(), + "Image index out of bounds"); + + const std::vector& image_buffer = + image_embedding_buffers_[image_emb_idx]; + int64_t num_image_tokens = image_embedding_token_counts_[image_emb_idx]; + + size_t num_elements = num_image_tokens * embedding_dim_; + std::memcpy( + merged_buffer.data() + output_offset, + image_buffer.data(), + num_elements * sizeof(float)); + + output_offset += num_elements; + image_emb_idx++; + text_token_idx++; // Skip this image placeholder 
token + } else { + // Insert one text token embedding + ET_CHECK_MSG( + text_emb_idx < text_embedding_buffers_.size(), + "Text embedding index out of bounds"); + + const std::vector& text_buffer = + text_embedding_buffers_[text_emb_idx]; + std::memcpy( + merged_buffer.data() + output_offset, + text_buffer.data() + text_token_idx * embedding_dim_, + embedding_dim_ * sizeof(float)); + + output_offset += embedding_dim_; + text_token_idx++; + } + } + + ET_CHECK_MSG( + image_emb_idx == image_embedding_buffers_.size(), + "Not all image embeddings were used: used %zu, expected %zu", + image_emb_idx, + image_embedding_buffers_.size()); + + // Setup tensor metadata + merged_embeddings.data = merged_buffer.data(); + merged_embeddings.size = total_elements * sizeof(float); + + // Setup sizes and dim_order: [1, total_tokens, embedding_dim] + sizes = {1, total_tokens_, embedding_dim_}; + + // Create TensorImpl + merged_embeddings.tensor = std::make_unique( + executorch::aten::ScalarType::Float, + sizes.size(), + sizes.data(), + merged_embeddings.data); + + ET_LOG( + Info, + "Merged embeddings: total_tokens=%d, text=%zu, images=%zu, embedding_dim=%d", + total_tokens_, + text_embedding_buffers_.size(), + image_embedding_buffers_.size(), + embedding_dim_); + + return merged_embeddings; +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_embedding_merger.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_embedding_merger.h new file mode 100644 index 00000000000..f545a80a354 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_embedding_merger.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace example { + +/** + * @class MultimodalEmbeddingMerger + * @brief Merges text and image embeddings based on token IDs + * + * This class collects text and image embeddings separately, then merges them + * based on input token IDs. When a placeholder token ID is encountered, + * it inserts the corresponding image embedding. Otherwise, it inserts the text + * embedding for that token position. + */ +enum class EmbeddingType { kText, kImage }; + +class MultimodalEmbeddingMerger { + public: + /** + * @brief Construct a new Multimodal Embedding Merger + * + * @param embedding_dim Expected embedding dimension for all inputs + */ + explicit MultimodalEmbeddingMerger(int32_t embedding_dim); + + /** + * @brief Reset the merger state for a new sequence + */ + void reset(); + + /** + * @brief Add text embeddings to the collection + * + * @param text_embeddings Text embedding tensor [1, num_tokens, embedding_dim] + */ + void add_text_embeddings(const TensorStruct& text_embeddings); + + /** + * @brief Add image embeddings to the collection + * + * @param image_embeddings Image embedding tensor [1, num_tokens, + * embedding_dim] + */ + void add_image_embeddings(const executorch::aten::Tensor& image_embeddings); + + /** + * @brief Merge collected embeddings based on input token IDs + * + * This method examines each token ID in input_ids. When it encounters + * placeholder_token_id, it inserts the next image embedding. Otherwise, + * it inserts the text embedding at the corresponding position. 
+ * + * @param input_ids Vector of token IDs (including placeholder tokens) + * @param image_token_id Token ID that represents image modality placeholder + * @return TensorStruct Merged embeddings [1, total_tokens, + * embedding_dim] + */ + TensorStruct merge( + const std::vector& input_ids, + uint64_t image_token_id); + + /** + * @brief Get the total number of tokens after merging + * @return int64_t Total token count + */ + inline size_t get_total_tokens() const { + return total_tokens_; + } + + private: + void add_embeddings( + const executorch::aten::Tensor& embeddings, + const float* data, + EmbeddingType type); + + // Expected embedding dimension + int32_t embedding_dim_; + + // Total tokens after merge + int32_t total_tokens_{0}; + + // Collected embeddings before merge + // Text embeddings are copied to prevent external modifications + std::vector> text_embedding_buffers_; + std::vector text_embedding_token_counts_; + + // Image embeddings are copied since they're temporary + std::vector> image_embedding_buffers_; + std::vector image_embedding_token_counts_; +}; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp index fc6ec382cef..14a93104e1a 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp @@ -30,9 +30,9 @@ void MultimodalLhdTokenGenerator::prepare_io( std::vector tokens_to_process( input_tokens.begin(), input_tokens.begin() + num_tokens_to_process); - embedding_runner_->prefill(tokens_to_process); + tok_embedding_runner_->prefill(tokens_to_process); const TensorStruct& text_embeddings = - embedding_runner_->get_prompt_embeddings(); + tok_embedding_runner_->get_prompt_embeddings(); int64_t embedding_dim = 
text_embeddings.tensor->size(2); // Copy embedding to input buffer from the left diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h index 2c9e54b49d2..83da9e7a6ba 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h @@ -35,7 +35,7 @@ class MultimodalLhdTokenGenerator }; MultimodalLhdTokenGenerator( tokenizers::Tokenizer* tokenizer, - EmbeddingProcessor* embedding_runner, + TokenEmbeddingProcessor* embedding_runner, DecoderRunner* decoder_runner, KVManager* kv_manager, const std::string& forward_name, @@ -60,7 +60,7 @@ class MultimodalLhdTokenGenerator metadata.cache_mode, metadata.embedding_dim}, stats), - embedding_runner_(embedding_runner), + tok_embedding_runner_(embedding_runner), metadata_(metadata), lhd_branch_(metadata.ngram - 1, std::vector(metadata.window)), lhd_branch_prev_(metadata.window), @@ -123,7 +123,7 @@ class MultimodalLhdTokenGenerator void update_ngrams_pool(); // Additional members specific to multimodal - EmbeddingProcessor* embedding_runner_; + TokenEmbeddingProcessor* tok_embedding_runner_; struct NgramData { bool active = false; diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp index 9ddfa5e78f6..2859e16a42a 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp @@ -266,15 +266,6 @@ Result MultimodalPromptProcessor::prefill( prepare_io(prompt_embedding, num_prompt_tokens, prompt_pos, pos); // Run inference - for (int layer = 0; layer < 
metadata_.num_layers; ++layer) { - std::vector> k_cache_ptrs = kv_manager_->get_k_cache_(); - T* k_cache_data = k_cache_ptrs[layer].buffer; - } - for (int layer = 0; layer < metadata_.num_layers; ++layer) { - std::vector> v_cache_ptrs = kv_manager_->get_v_cache_(); - T* v_cache_data = v_cache_ptrs[layer].buffer; - } - decoder_runner_->step(method_name_, inputs_); if (dump_logits) { prompt_all_logits_.insert( diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h index 51ed0b829ee..fcfc07c9590 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h @@ -7,7 +7,7 @@ */ #pragma once -#include +#include #include namespace example { diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp index a7ced9c138d..0ac38308bfe 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp @@ -7,10 +7,12 @@ */ // A llama 3.2 runner that includes preprocessing and post processing -// logic. The module takes in a string as input and emits a string as output. +// logic. The text_decoder takes in a string as input and emits a string as +// output. 
#include #include +#include #include #include #include @@ -31,6 +33,8 @@ using executorch::aten::Tensor; using executorch::extension::Module; using executorch::extension::llm::get_rss_bytes; +using executorch::extension::llm::Image; +using executorch::extension::llm::MultimodalInput; using executorch::extension::llm::print_report; using executorch::extension::llm::Stats; using executorch::extension::llm::time_in_ms; @@ -89,11 +93,11 @@ void save_logits( } // namespace template -MultimodalRunner::MultimodalRunner( - std::unique_ptr module, - std::unique_ptr embedding_module, - const std::string& decoder_model_version, - const std::string& model_path, +QNNMultimodalRunner::QNNMultimodalRunner( + std::unique_ptr encoder, + std::unique_ptr tok_embedding, + std::unique_ptr text_decoder, + const std::string& model_version, const std::string& tokenizer_path, const std::string& dump_logits_path, const std::string& performance_output_path, @@ -102,11 +106,22 @@ MultimodalRunner::MultimodalRunner( const bool shared_buffer, const int ngram, const int window, - const int gcap, - std::unique_ptr image_hidden_states) - : module_(std::move(module)), - embedding_module_(std::move(embedding_module)), - image_hidden_states_(std::move(image_hidden_states)), + const int gcap) + // TODO: The arguments for the MultimodalRunner base class are currently + // unused in this runner implementation. QNNMultimodalRunner will be + // refactored in the future to align with MultimodalRunner's usage. 
+ : MultimodalRunner( + {}, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + std::make_unique()), + encoder_(std::move(encoder)), + tok_embedding_(std::move(tok_embedding)), + text_decoder_(std::move(text_decoder)), ngram_(ngram), window_(window), gcap_(gcap), @@ -118,39 +133,38 @@ MultimodalRunner::MultimodalRunner( shared_buffer_(shared_buffer) { stats_.reset(); - if (decoder_model_version == "smolvlm") { - decoder_model_version_ = MultimodalDecoderModelVersion::kSmolvlm; - } else if (decoder_model_version == "internvl3") { - decoder_model_version_ = MultimodalDecoderModelVersion::kInternvl3; + if (model_version == "smolvlm") { + model_version_ = VisionLanguageModel::kSmolvlm; + } else if (model_version == "internvl3") { + model_version_ = VisionLanguageModel::kInternvl3; } else { ET_CHECK_MSG(false, "Unsupported Decoder Model"); } - ET_LOG(Info, "creating module: model_path=%s", model_path.c_str()); ET_LOG(Info, "creating runner: tokenizer_path=%s", tokenizer_path_.c_str()); ET_LOG(Info, "eval mode=%d", eval_mode_); } template -bool MultimodalRunner::is_loaded() const { - return module_->is_loaded() && embedding_module_->is_loaded() && tokenizer_ && +bool QNNMultimodalRunner::is_loaded() const { + return encoder_->is_loaded() && tok_embedding_->is_loaded() && + text_decoder_->is_loaded() && embedding_merger_ && tokenizer_ && decoder_runner_ && prompt_processor_ && token_generator_ && kv_manager_ && buffer_manager_; } template -Error MultimodalRunner::load() { +Error QNNMultimodalRunner::load() { if (is_loaded()) { return Error::Ok; } - - std::string prompt_embedding_method_name, token_embedding_method_name; + std::string prompt_embedding_method_name, tok_embedding_method_name; std::string token_generator_method_name, prompt_processor_method_name; std::vector method_names; switch (eval_mode_) { case EvalMode::kKVCached: prompt_embedding_method_name = "tok_embedding_kv_forward"; - token_embedding_method_name = "tok_embedding_kv_forward"; + 
tok_embedding_method_name = "tok_embedding_kv_forward"; prompt_processor_method_name = "kv_forward"; token_generator_method_name = "kv_forward"; method_names.emplace_back(prompt_processor_method_name); @@ -159,7 +173,7 @@ Error MultimodalRunner::load() { case EvalMode::kHybrid: case EvalMode::kLookaheadDecoding: prompt_embedding_method_name = "tok_embedding_prefill_forward"; - token_embedding_method_name = "tok_embedding_kv_forward"; + tok_embedding_method_name = "tok_embedding_kv_forward"; prompt_processor_method_name = "prefill_forward"; token_generator_method_name = "kv_forward"; method_names.emplace_back(prompt_processor_method_name); @@ -183,29 +197,29 @@ Error MultimodalRunner::load() { } eos_ids->insert(tokenizer_->eos_tok()); } - if (decoder_model_version_ == MultimodalDecoderModelVersion::kSmolvlm) { - eos_ids->insert(tokenizer_->encode("", 0, 0).get()[0]); - } else if ( - decoder_model_version_ == MultimodalDecoderModelVersion::kInternvl3) { - eos_ids->insert(tokenizer_->encode("<|im_end|>", 0, 0).get()[0]); + if (const auto* vlm = std::get_if(&model_version_)) { + if (*vlm == VisionLanguageModel::kSmolvlm) { + eos_ids->insert(tokenizer_->encode("", 0, 0).get()[0]); + } else if (*vlm == VisionLanguageModel::kInternvl3) { + eos_ids->insert(tokenizer_->encode("<|im_end|>", 0, 0).get()[0]); + } } - // Try avoid getMetadataHelper as it is time consuming. Result method_meta = - module_->method_meta(token_generator_method_name); + text_decoder_->method_meta(token_generator_method_name); // For some tokenizer.json, runtime vocab_size might be different, use output // shape to get vocab size. 
int32_t vocab_size = method_meta->output_tensor_meta(0)->sizes()[2]; - decoder_runner_ = - std::make_unique(module_.get(), vocab_size, temperature_); + decoder_runner_ = std::make_unique( + text_decoder_.get(), vocab_size, temperature_); ET_CHECK_OK_OR_RETURN_ERROR(decoder_runner_->load(method_names)); ET_LOG(Info, "Reading metadata from model"); // retrieve any method meta, can be either prefill or kv int64_t num_layers = - ET_UNWRAP(module_->get("get_n_layers")).toScalar().to(); + ET_UNWRAP(text_decoder_->get("get_n_layers")).toScalar().to(); ET_CHECK_MSG(num_layers != -1, "Could not retrieve num layers"); // k_cache: [1, n_heads, head_dim, seq_len] @@ -213,12 +227,12 @@ Error MultimodalRunner::load() { int64_t num_heads = k_cache_shape[1]; int64_t head_dim = k_cache_shape[2]; - // TODO: filter shape hidden_state: [1, ar_len, dim] - int64_t dim = embedding_module_->method_meta(token_embedding_method_name) + // hidden_state: [1, ar_len, dim] + int64_t dim = tok_embedding_->method_meta(tok_embedding_method_name) ->output_tensor_meta(0) ->sizes()[2]; bool use_int64_token = - embedding_module_->method_meta(token_embedding_method_name) + tok_embedding_->method_meta(tok_embedding_method_name) ->input_tensor_meta(0) ->scalar_type() == executorch::aten::ScalarType::Long; @@ -238,7 +252,7 @@ Error MultimodalRunner::load() { eval_mode_ == EvalMode::kHybrid || eval_mode_ == EvalMode::kLookaheadDecoding) { auto atten_mask_meta_prompt = - module_->method_meta(prompt_processor_method_name) + text_decoder_->method_meta(prompt_processor_method_name) ->input_tensor_meta(1); prompt_processor_ar_len = atten_mask_meta_prompt->sizes()[1]; } @@ -249,15 +263,19 @@ Error MultimodalRunner::load() { std::min(token_generator_ar_len, prompt_processor_ar_len); max_ar_len = std::max(token_generator_ar_len, prompt_processor_ar_len); - embedding_runner_ = - std::make_unique(embedding_module_.get()); - ET_CHECK_OK_OR_RETURN_ERROR(embedding_runner_->load( - {prompt_embedding_method_name, 
token_embedding_method_name})); - // Initialize EmbeddingProcessor - embedding_processor_ = std::make_unique( - embedding_runner_.get(), + // Initialize Encoder + encoder_runner_ = std::make_unique(encoder_.get()); + ET_CHECK_OK_OR_RETURN_ERROR(encoder_runner_->load()); + + // Initialize TokenEmbeddingProcessor + tok_embedding_runner_ = + std::make_unique(tok_embedding_.get()); + ET_CHECK_OK_OR_RETURN_ERROR(tok_embedding_runner_->load( + {prompt_embedding_method_name, tok_embedding_method_name})); + tok_embedding_processor_ = std::make_unique( + tok_embedding_runner_.get(), prompt_embedding_method_name, - EmbeddingProcessor::Metadata{ + TokenEmbeddingProcessor::Metadata{ context_len_, prompt_processor_ar_len, vocab_size, @@ -268,8 +286,9 @@ Error MultimodalRunner::load() { // This is used to configure the attention mask for models with window // attention int32_t sliding_window = context_len_; - if (module_->method_names()->count("get_sliding_window") > 0) { - sliding_window = ET_UNWRAP(module_->get("get_sliding_window")).toInt(); + if (text_decoder_->method_names()->count("get_sliding_window") > 0) { + sliding_window = + ET_UNWRAP(text_decoder_->get("get_sliding_window")).toInt(); } kv_manager_ = std::make_unique>(typename KVManager::Metadata{ context_len_, @@ -295,10 +314,10 @@ Error MultimodalRunner::load() { static_cast(dim)}); // Initialize EmbeddingGenerator - embedding_generator_ = std::make_unique( - embedding_runner_.get(), - token_embedding_method_name, - EmbeddingProcessor::Metadata{ + tok_embedding_generator_ = std::make_unique( + tok_embedding_runner_.get(), + tok_embedding_method_name, + TokenEmbeddingProcessor::Metadata{ context_len_, token_generator_ar_len, vocab_size, @@ -308,7 +327,7 @@ Error MultimodalRunner::load() { // Initialize TokenGenerator token_generator_ = std::make_unique>( tokenizer_.get(), - embedding_generator_.get(), + tok_embedding_generator_.get(), decoder_runner_.get(), kv_manager_.get(), token_generator_method_name, @@ -330,7 
+349,7 @@ Error MultimodalRunner::load() { } else { token_generator_ = std::make_unique>( tokenizer_.get(), - embedding_generator_.get(), + tok_embedding_generator_.get(), decoder_runner_.get(), kv_manager_.get(), token_generator_method_name, @@ -354,8 +373,8 @@ Error MultimodalRunner::load() { kv_manager_->total_cache_size_in_bytes(), prompt_processor_->total_prompt_processor_io_size_in_bytes(), token_generator_->total_token_generator_io_size_in_bytes(), - embedding_processor_->total_embedding_processor_io_size_in_bytes(), - embedding_generator_->total_embedding_processor_io_size_in_bytes()); + tok_embedding_processor_->total_embedding_processor_io_size_in_bytes(), + tok_embedding_generator_->total_embedding_processor_io_size_in_bytes()); } ET_LOG(Info, "creating io_memory"); @@ -363,44 +382,50 @@ Error MultimodalRunner::load() { kv_manager_->init_cache(buffer_manager_.get(), prompt_processor_ar_len); prompt_processor_->init_io( buffer_manager_.get(), - module_->method_meta(prompt_processor_method_name)); + text_decoder_->method_meta(prompt_processor_method_name)); token_generator_->init_io( - buffer_manager_.get(), module_->method_meta(token_generator_method_name)); + buffer_manager_.get(), + text_decoder_->method_meta(token_generator_method_name)); // Prepare io for embedding - embedding_processor_->init_io( + tok_embedding_processor_->init_io( buffer_manager_.get(), - embedding_module_->method_meta(prompt_embedding_method_name)); - embedding_generator_->init_io( + tok_embedding_->method_meta(prompt_embedding_method_name)); + tok_embedding_generator_->init_io( buffer_manager_.get(), - embedding_module_->method_meta(token_embedding_method_name)); - return Error::Ok; -} + tok_embedding_->method_meta(tok_embedding_method_name)); -template -Error MultimodalRunner::generate( - const std::string& prompt, - const llm::GenerationConfig& config, - std::function token_callback, - std::function stats_callback) { - return generate_from_prompt_or_file( - prompt, false, 
config, token_callback, stats_callback); + // Get image token ID from text_decoder + if (modality_of(model_version_) == Modality::kVision) { + ET_CHECK_MSG( + text_decoder_->method_names()->count("image_token_id") > 0, + "Vision model is missing the required 'image_token_id' in metadata."); + image_token_id_ = ET_UNWRAP(text_decoder_->get("image_token_id")).toInt(); + ET_LOG( + Info, + "Image placeholder token ID for vision modality loaded: %zu", + image_token_id_); + } + + // Initialize embedding merger + embedding_merger_ = + std::make_unique(static_cast(dim)); + + return Error::Ok; } template -Error MultimodalRunner::generate_from_prompt_or_file( - const std::string& prompt, - bool tokenized_prompt, +executorch::runtime::Error QNNMultimodalRunner::generate( + const std::vector& inputs, const llm::GenerationConfig& config, std::function token_callback, std::function stats_callback) { - ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null"); + ET_CHECK_MSG(!inputs.empty(), "inputs cannot be empty"); if (!is_loaded()) { stats_.model_load_start_ms = time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(load()); stats_.model_load_end_ms = time_in_ms(); } stats_.inference_start_ms = time_in_ms(); - int32_t seq_len = config.seq_len; if (seq_len > context_len_) { ET_LOG( @@ -421,62 +446,67 @@ Error MultimodalRunner::generate_from_prompt_or_file( // For multimodal, we will disable n_bos int32_t n_bos = 0; - // encode the (string) prompt into tokens sequence + std::string prompt; std::vector prompt_tokens; - if (tokenized_prompt) { - std::ifstream inFile(prompt, std::ios::binary); - if (inFile.is_open()) { - // Get file size - inFile.seekg(0, std::ios::end); - size_t fileSize = inFile.tellg(); - inFile.seekg(0, std::ios::beg); - - // Resize vector and read raw data - prompt_tokens.resize(fileSize / sizeof(uint64_t)); - - inFile.read(reinterpret_cast(prompt_tokens.data()), fileSize); - inFile.close(); + bool dump_logits = !dump_logits_path_.empty(); + + // Reset merger for new 
generation + embedding_merger_->reset(); + + // Process each input and add embeddings to merger + for (const MultimodalInput& input : inputs) { + if (input.is_text()) { + std::string text = input.get_text(); + prompt += text; + + tokenizers::Result> encode_res = + tokenizer_->encode(text, n_bos, 0); + ET_CHECK_TK_OK_OR_RETURN_ERROR( + encode_res.error(), "failed to encode prompt %s", text.c_str()); + + std::vector tokens = encode_res.get(); + tok_embedding_processor_->prefill(tokens); + const TensorStruct& text_embeddings = + tok_embedding_processor_->get_prompt_embeddings(); + + // Add text embeddings to merger + embedding_merger_->add_text_embeddings(text_embeddings); + + prompt_tokens.insert(prompt_tokens.end(), tokens.begin(), tokens.end()); + + } else if (input.is_image()) { + const Image& image = input.get_image(); + auto image_tensor_res = image.toTensor(/*with_batch*/ true); + executorch::extension::TensorPtr image_tensor_ptr = + image_tensor_res.get(); + + auto encode_res = encoder_runner_->encode(image_tensor_ptr); + executorch::aten::Tensor image_embeddings_tensor = encode_res.get(); + + // Add image embeddings to merger + embedding_merger_->add_image_embeddings(image_embeddings_tensor); + } else { - ET_CHECK_MSG( - false, - "Unable to read tokenized prompt from file: %s", - prompt.c_str()); + ET_CHECK_MSG(false, "Unsupported input data type"); } - } else { - tokenizers::Result> encode_res = - tokenizer_->encode(prompt, n_bos, 0); - ET_CHECK_TK_OK_OR_RETURN_ERROR( - encode_res.error(), "failed to encode prompt %s", prompt.c_str()); - prompt_tokens = encode_res.get(); } - int num_prompt_tokens = prompt_tokens.size(); + + // Fuse embeddings by placeholder_token_id from model + TensorStruct merged_embeddings = + embedding_merger_->merge(prompt_tokens, image_token_id_); + int num_prompt_tokens = embedding_merger_->get_total_tokens(); + ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token"); ET_CHECK_MSG( cur_pos_ + num_prompt_tokens < 
seq_len, "sequence length exceeded - please increase the seq_len value"); - // Prompt Processor first if (token_callback && config.echo) { token_callback(prompt); } - bool dump_logits = dump_logits_path_.empty() ? false : true; - embedding_processor_->prefill(prompt_tokens); - const TensorStruct& text_embeddings = - embedding_processor_->get_prompt_embeddings(); - int64_t embedding_dim = text_embeddings.tensor->size(2); - - uint64_t placeholder_token_id = 0; - if (module_->method_names()->count("modality_placeholder_token_id") > 0) { - placeholder_token_id = - module_->get("modality_placeholder_token_id")->toInt(); - } - - ET_LOG(Info, "Merging text embeddings with image hidden states"); - merge_multimodal_embeddings( - prompt_tokens, text_embeddings, placeholder_token_id); auto prefill_res = prompt_processor_->prefill( - merged_embeddings_, cur_pos_, dump_logits, nullptr); + merged_embeddings, cur_pos_, dump_logits, nullptr); ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); uint64_t cur_token = prefill_res.get(); cur_pos_ += num_prompt_tokens; @@ -526,85 +556,25 @@ Error MultimodalRunner::generate_from_prompt_or_file( } template -void MultimodalRunner::merge_multimodal_embeddings( - const std::vector& input_ids, - const TensorStruct& text_embeddings, - uint64_t placeholder_token_id) { - // This implements the modality_inputs_merger logic from decoder_utils.py - // Find positions where placeholder tokens appear - std::vector placeholder_positions; - for (size_t i = 0; i < input_ids.size(); ++i) { - if (input_ids[i] == placeholder_token_id) { - placeholder_positions.push_back(i); - } - } - - int64_t embedding_dim; - int64_t num_tokens = input_ids.size(); - if (text_embeddings.tensor) { - embedding_dim = text_embeddings.tensor->size(2); - num_tokens = text_embeddings.tensor->size(1); - } else { - ET_CHECK_MSG( - false, - "text_embeddings.tensor is null; cannot determine embedding dim during multimodal embedding merge"); - } - - // Allocate new buffer for merged 
embeddings - size_t total_elements = num_tokens * embedding_dim; - multimodal_embeddings_buffer_.resize(total_elements); - - // First, copy all text embeddings to the new buffer - std::memcpy( - multimodal_embeddings_buffer_.data(), - text_embeddings.data, - total_elements * sizeof(float)); - - // Then replace placeholder positions with image hidden states - auto* image_data = image_hidden_states_->const_data_ptr(); - auto* merged_data = multimodal_embeddings_buffer_.data(); - - int64_t image_seq_len = image_hidden_states_->size(1); - - // Copy image hidden states to placeholder positions - for (int32_t i = 0; i < placeholder_positions.size(); ++i) { - int32_t pos = placeholder_positions[i]; - std::memcpy( - merged_data + pos * embedding_dim, - image_data + i * embedding_dim, - embedding_dim * sizeof(float)); +Result QNNMultimodalRunner::get_model_version() { + if (!is_loaded()) { + stats_.model_load_start_ms = time_in_ms(); + ET_CHECK_OK_OR_RETURN_ERROR(load()); + stats_.model_load_end_ms = time_in_ms(); } - - merged_embeddings_.data = multimodal_embeddings_buffer_.data(); - merged_embeddings_.size = total_elements * sizeof(float); - - // Create TensorImpl with proper shape [1, num_tokens, embedding_dim] - multimodal_embeddings_sizes_ = { - 1, static_cast(num_tokens), static_cast(embedding_dim)}; - multimodal_embeddings_dim_order_ = {0, 1, 2}; - merged_embeddings_.tensor = std::make_unique( - executorch::aten::ScalarType::Float, - multimodal_embeddings_sizes_.size(), - multimodal_embeddings_sizes_.data(), - merged_embeddings_.data, - multimodal_embeddings_dim_order_.data()); - - ET_LOG(Info, "Multimodal embeddings merged successfully"); + return model_version_; } template -Result -MultimodalRunner::get_decoder_model_version() { +Result QNNMultimodalRunner::get_encoder_method_meta() { if (!is_loaded()) { - stats_.model_load_start_ms = time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(load()); - stats_.model_load_end_ms = time_in_ms(); } - return decoder_model_version_; + 
return encoder_->method_meta(kEncoderForwardName); } // Explicit instantiations -template class MultimodalRunner; -template class MultimodalRunner; +template class QNNMultimodalRunner; +template class QNNMultimodalRunner; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h index 0d56f52341f..4bf58c10339 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h @@ -14,41 +14,70 @@ #include #include #include +#include #include #include #include #include -#include -#include +#include +#include #include #include +#include +#include +#include #include +#include +#include #include #include #include namespace example { -// Extend DecoderModelVersion enum with multimodal models -enum MultimodalDecoderModelVersion { +enum class Modality { + kAudio = 0, + kVision, +}; + +enum class VisionLanguageModel { kSmolvlm = 0, kInternvl3, }; +// TODO: Add audio models when they are supported +enum class AudioLanguageModel {}; + +using ModelVersion = std::variant; + +constexpr Modality modality_of(const VisionLanguageModel& vlm) { + return Modality::kVision; +} + +constexpr Modality modality_of(const AudioLanguageModel& alm) { + return Modality::kAudio; +} + +inline Modality modality_of(const ModelVersion& model_version) { + return std::visit( + [](const auto& model) { return modality_of(model); }, model_version); +} + enum KvBitWidth { kWidth8 = 8, kWidth16 = 16, }; template -class MultimodalRunner : public executorch::extension::llm::IRunner { +class QNNMultimodalRunner + : public executorch::extension::llm::MultimodalRunner { public: - explicit MultimodalRunner( - std::unique_ptr module, - std::unique_ptr embedding_module, - const std::string& decoder_model, - const std::string& model_path, + explicit 
QNNMultimodalRunner( + std::unique_ptr encoder, + std::unique_ptr tok_embedding, + std::unique_ptr text_decoder, + const std::string& model_version, const std::string& tokenizer_path, const std::string& performance_output_path, const std::string& dump_logits_path, @@ -57,37 +86,21 @@ class MultimodalRunner : public executorch::extension::llm::IRunner { const bool shared_buffer = false, const int ngram = 0, const int window = 0, - const int gcap = 0, - std::unique_ptr image_hidden_states = nullptr); + const int gcap = 0); bool is_loaded() const override; executorch::runtime::Error load() override; - // Override generate to support multimodal inputs executorch::runtime::Error generate( - const std::string& prompt, + const std::vector& inputs, const executorch::extension::llm::GenerationConfig& config, std::function token_callback = {}, std::function stats_callback = {}) override; - // Multimodal-specific generation with image embeddings - executorch::runtime::Error generate_from_prompt_or_file( - const std::string& prompt, - bool tokenized_prompt, - const executorch::extension::llm::GenerationConfig& config, - std::function token_callback = {}, - std::function stats_callback = {}); - void stop() override {}; - void reset() override {}; - executorch::runtime::Result - get_decoder_model_version(); - - // Multimodal-specific method for merging embeddings - void merge_multimodal_embeddings( - const std::vector& input_ids, - const TensorStruct& text_embeddings, - uint64_t placeholder_token_id); + executorch::runtime::Result get_model_version(); + executorch::runtime::Result + get_encoder_method_meta(); private: enum EvalMode { @@ -98,8 +111,11 @@ class MultimodalRunner : public executorch::extension::llm::IRunner { }; // Modules - std::unique_ptr module_; - std::unique_ptr embedding_module_; + std::unique_ptr encoder_; + std::unique_ptr tok_embedding_; + std::unique_ptr text_decoder_; + + inline static const std::string kEncoderForwardName = "forward"; int32_t 
context_len_{0}; @@ -119,27 +135,23 @@ class MultimodalRunner : public executorch::extension::llm::IRunner { EvalMode eval_mode_; bool shared_buffer_; - MultimodalDecoderModelVersion decoder_model_version_; + ModelVersion model_version_; std::unique_ptr buffer_manager_; std::unique_ptr> kv_manager_; std::unique_ptr tokenizer_; std::unique_ptr decoder_runner_; std::unique_ptr> prompt_processor_; std::unique_ptr> token_generator_; - std::unique_ptr embedding_runner_; - std::unique_ptr embedding_processor_; - std::unique_ptr embedding_generator_; - - // Image hidden states storage - std::unique_ptr image_hidden_states_; - - // Multimodal embeddings storage - std::vector multimodal_embeddings_buffer_; - std::vector - multimodal_embeddings_sizes_; - std::vector - multimodal_embeddings_dim_order_; - TensorStruct merged_embeddings_; + std::unique_ptr encoder_runner_; + std::unique_ptr tok_embedding_runner_; + std::unique_ptr tok_embedding_processor_; + std::unique_ptr tok_embedding_generator_; + std::unique_ptr embedding_merger_; + + // Placeholder token ID for image inputs. This value will be set from the + // model's metadata. A default of 0 indicates that the vision modality is not + // supported. 
+ uint64_t image_token_id_{0}; // stats executorch::llm::Stats stats_; diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp index 89b8614d407..2ed8ae51f1d 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp @@ -18,7 +18,7 @@ namespace example { template MultimodalTokenGenerator::MultimodalTokenGenerator( tokenizers::Tokenizer* tokenizer, - EmbeddingProcessor* embedding_runner, + TokenEmbeddingProcessor* tok_embedding_runner, DecoderRunner* decoder_runner, KVManager* kv_manager, const std::string& method_name, @@ -40,7 +40,7 @@ MultimodalTokenGenerator::MultimodalTokenGenerator( metadata.sliding_window, metadata.cache_mode}, stats), - embedding_runner_(embedding_runner), + tok_embedding_runner_(tok_embedding_runner), metadata_(metadata) { // Set input_toks_.size to 0 since we use embeddings instead input_toks_.size = 0; @@ -195,9 +195,9 @@ void MultimodalTokenGenerator::prepare_io( uint64_t cur_token, int64_t start_pos) { // Generate embedding for current token using embedding runner - embedding_runner_->prefill({cur_token}); + tok_embedding_runner_->prefill({cur_token}); const TensorStruct& text_embeddings = - embedding_runner_->get_prompt_embeddings(); + tok_embedding_runner_->get_prompt_embeddings(); int64_t embedding_dim = text_embeddings.tensor->size(2); // Copy embedding to input buffer std::memcpy( diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h index b010bf3748e..9eb9c79aaa4 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h +++ 
b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h @@ -7,7 +7,7 @@ */ #pragma once -#include +#include #include namespace example { @@ -34,7 +34,7 @@ class MultimodalTokenGenerator : public example::TokenGenerator { // Constructor with embedding generator support MultimodalTokenGenerator( tokenizers::Tokenizer* tokenizer, - EmbeddingProcessor* embedding_runner, + TokenEmbeddingProcessor* tok_embedding_runner, DecoderRunner* decoder_runner, KVManager* kv_manager, const std::string& method_name, @@ -86,7 +86,7 @@ class MultimodalTokenGenerator : public example::TokenGenerator { using TokenGenerator::v_cache_out_; // Additional members specific to multimodal - EmbeddingProcessor* embedding_runner_; + TokenEmbeddingProcessor* tok_embedding_runner_; /** * @brief Fill in I/O buffers with prompt token and position. diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_processor.cpp similarity index 86% rename from examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.cpp rename to examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_processor.cpp index 1278a1df7d9..d0566941b06 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_processor.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include +#include #include #include @@ -19,11 +19,11 @@ using executorch::runtime::TensorInfo; namespace example { -EmbeddingProcessor::EmbeddingProcessor( - EmbeddingRunner* embedding_runner, +TokenEmbeddingProcessor::TokenEmbeddingProcessor( + TokenEmbeddingRunner* tok_embedding_runner, const std::string& method_name, Metadata metadata) - : embedding_runner_(embedding_runner), + : tok_embedding_runner_(tok_embedding_runner), method_name_(method_name), metadata_(metadata) { input_toks_.size = metadata_.ar_len * sizeof(int64_t); @@ -31,7 +31,7 @@ EmbeddingProcessor::EmbeddingProcessor( prompt_embeddings_.size = 0; // Will be set in prefill() } -void EmbeddingProcessor::init_io( +void TokenEmbeddingProcessor::init_io( IMemAlloc* buffer_manager, Result method_meta) { input_tensors_.reserve(method_meta->num_inputs()); @@ -73,7 +73,7 @@ void EmbeddingProcessor::init_io( } } -void EmbeddingProcessor::update_prompt_embedding( +void TokenEmbeddingProcessor::update_prompt_embedding( int32_t num_prompt_tokens, int64_t prompt_pos) { for (int i = 0; i < metadata_.ar_len; i++) { @@ -86,7 +86,8 @@ void EmbeddingProcessor::update_prompt_embedding( } } -void EmbeddingProcessor::prefill(const std::vector& prompt_tokens) { +void TokenEmbeddingProcessor::prefill( + const std::vector& prompt_tokens) { int64_t prompt_pos = 0; int32_t num_prompt_tokens = prompt_tokens.size(); prompt_embeddings_.size = @@ -99,19 +100,18 @@ void EmbeddingProcessor::prefill(const std::vector& prompt_tokens) { // Create TensorImpl for prompt_embeddings_ with shape [1, num_prompt_tokens, // dim] Store sizes and dim_order as member variables to keep them // alive - prompt_embeddings_sizes_ = {1, num_prompt_tokens, metadata_.embedding_dim}; - prompt_embeddings_dim_order_ = {0, 1, 2}; + std::vector sizes = { + 1, num_prompt_tokens, metadata_.embedding_dim}; prompt_embeddings_.tensor = std::make_unique( executorch::aten::ScalarType::Float, - prompt_embeddings_sizes_.size(), - 
prompt_embeddings_sizes_.data(), - prompt_embeddings_.data, - prompt_embeddings_dim_order_.data()); + sizes.size(), + sizes.data(), + prompt_embeddings_.data); int num_iters = 1 + ((num_prompt_tokens - 1) / metadata_.ar_len); ET_CHECK_MSG( - embedding_runner_->set_outputs(method_name_, output_tensors_) == + tok_embedding_runner_->set_outputs(method_name_, output_tensors_) == executorch::runtime::Error::Ok, "Failed to set output tensor for module %s", method_name_.c_str()); @@ -119,7 +119,7 @@ void EmbeddingProcessor::prefill(const std::vector& prompt_tokens) { for (int32_t i = 0; i < num_iters; ++i) { prepare_io(prompt_tokens, prompt_pos); - embedding_runner_->step(method_name_, inputs_); + tok_embedding_runner_->step(method_name_, inputs_); // Update prompt_embedding update_prompt_embedding(num_prompt_tokens, prompt_pos); @@ -128,7 +128,7 @@ void EmbeddingProcessor::prefill(const std::vector& prompt_tokens) { } } -void EmbeddingProcessor::prepare_io( +void TokenEmbeddingProcessor::prepare_io( const std::vector& prompt_tokens, int64_t prompt_pos) { for (int i = 0; i < metadata_.ar_len; i++) { diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_processor.h similarity index 87% rename from examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.h rename to examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_processor.h index 0ece8bf2d03..f5dee69bf3a 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_processor.h @@ -9,18 +9,18 @@ #pragma once #include #include -#include +#include #include #include #include namespace example { /** - * @class EmbeddingProcessor + * @class TokenEmbeddingProcessor * @brief Class for processing prompts to generate embeddings using embedding * runner. 
*/ -class EmbeddingProcessor { +class TokenEmbeddingProcessor { public: struct Metadata { int32_t context_len; @@ -30,8 +30,8 @@ class EmbeddingProcessor { int32_t embedding_dim; }; - EmbeddingProcessor( - EmbeddingRunner* embedding_runner, + TokenEmbeddingProcessor( + TokenEmbeddingRunner* tok_embedding_runner, const std::string& method_name, Metadata metadata); @@ -75,7 +75,7 @@ class EmbeddingProcessor { const std::vector& prompt_tokens, int64_t prompt_pos); - EmbeddingRunner* embedding_runner_; + TokenEmbeddingRunner* tok_embedding_runner_; std::string method_name_; // metadata @@ -86,9 +86,6 @@ class EmbeddingProcessor { TensorStruct embeddings_; TensorStruct prompt_embeddings_; std::vector prompt_embeddings_buffer_; - std::vector prompt_embeddings_sizes_; - std::vector - prompt_embeddings_dim_order_; std::vector inputs_; std::vector input_tensors_; diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_runner.cpp similarity index 82% rename from examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.cpp rename to examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_runner.cpp index bf1008e34b1..cd8a521062f 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_runner.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree.
*/ -#include +#include #include using executorch::aten::Tensor; @@ -17,9 +17,9 @@ using executorch::runtime::Result; namespace example { -EmbeddingRunner::EmbeddingRunner(Module* module) : module_(module) {} +TokenEmbeddingRunner::TokenEmbeddingRunner(Module* module) : module_(module) {} -Result EmbeddingRunner::step( +Result TokenEmbeddingRunner::step( const std::string& method_name, std::vector& inputs) { // Execute embedding module @@ -35,7 +35,7 @@ Result EmbeddingRunner::step( return outputs_res.get()[0].toTensor(); } -Error EmbeddingRunner::set_outputs( +Error TokenEmbeddingRunner::set_outputs( const std::string& method_name, std::vector output_values) { for (size_t i = 0; i < output_values.size(); ++i) { @@ -45,7 +45,7 @@ Error EmbeddingRunner::set_outputs( return Error::Ok; } -Error EmbeddingRunner::load(const std::vector& method_names) { +Error TokenEmbeddingRunner::load(const std::vector& method_names) { if (is_method_loaded(method_names)) { return Error::Ok; } @@ -55,7 +55,7 @@ Error EmbeddingRunner::load(const std::vector& method_names) { return Error::Ok; } -bool EmbeddingRunner::is_method_loaded( +bool TokenEmbeddingRunner::is_method_loaded( const std::vector& method_names) { bool method_loaded = true; for (const std::string& method_name : method_names) { @@ -64,7 +64,7 @@ bool EmbeddingRunner::is_method_loaded( return method_loaded; } -bool EmbeddingRunner::is_loaded() const { +bool TokenEmbeddingRunner::is_loaded() const { return module_ != nullptr && module_->is_loaded(); } diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_runner.h similarity index 93% rename from examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.h rename to examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_runner.h index d5155a45252..dc6951395bf 100644 --- 
a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_runner.h @@ -18,12 +18,12 @@ namespace example { /** - * @class EmbeddingRunner + * @class TokenEmbeddingRunner * @brief Class for running embedding module, similar to DecoderRunner */ -class EmbeddingRunner { +class TokenEmbeddingRunner { public: - EmbeddingRunner(executorch::extension::Module* module); + TokenEmbeddingRunner(executorch::extension::Module* module); /** * Run embedding module with inputs to generate embeddings. diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/utils.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/utils.h new file mode 100644 index 00000000000..4b16cf646cb --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/utils.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +using ::executorch::aten::ScalarType; +using ::executorch::extension::llm::Image; +using ::executorch::extension::llm::MultimodalInput; + +namespace example { + +inline std::vector load_raw_files( + const std::string& input_list_file_path) { + std::vector input_files; + + std::ifstream input_list(input_list_file_path); + ET_CHECK_MSG( + input_list.is_open(), + "Failed to open input list file: %s", + input_list_file_path.c_str()); + + auto split = [](std::string s, std::string delimiter) { + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + std::string token; + std::vector res; + + while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) { + token = s.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + res.push_back(token); + } + res.push_back(s.substr(pos_start)); + return res; + }; + + std::string file_path_line; + while (std::getline(input_list, file_path_line)) { + if (!file_path_line.empty() && file_path_line.back() == '\r') { + file_path_line.pop_back(); + } + if (file_path_line.empty()) { + continue; + } + + auto line_files = split(file_path_line, " "); + if (line_files.empty()) { + continue; + } + + input_files.insert(input_files.end(), line_files.begin(), line_files.end()); + } + return input_files; +} + +inline void load_image( + const std::string& image_path, + Image& image, + const std::vector& expected_size, + const ScalarType& expected_dtype) { + const size_t n = expected_size.size(); + ET_CHECK_MSG(n >= 3, "expected dim should at least be 3, but got %zu", n); + const int32_t channels = expected_size[n - 3]; + const int32_t height = expected_size[n - 2]; + const int32_t width = expected_size[n - 1]; + + size_t num_elems = std::accumulate( + expected_size.begin(), + expected_size.end(), + size_t{1}, + std::multiplies()); + + std::streamsize expected_length = num_elems *
sizeof(float); + + std::ifstream file(image_path, std::ios::binary | std::ios::ate); + ET_CHECK_MSG( + file.is_open(), "Failed to open input file: %s", image_path.c_str()); + + std::streamsize file_size = file.tellg(); + ET_CHECK_MSG( + file_size == expected_length, + "Input image size mismatch. file bytes: %ld, expected bytes: %zu (file: " + "%s)", + file_size, + expected_length, + image_path.c_str()); + file.seekg(0, std::ios::beg); + std::vector buffer(num_elems); + file.read(reinterpret_cast(buffer.data()), expected_length); + file.close(); + + image = Image(std::move(buffer), width, height, channels); + ET_LOG( + Info, + "image Channels: %" PRId32 ", Height: %" PRId32 ", Width: %" PRId32, + image.channels(), + image.height(), + image.width()); +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/vision_chat_template.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/vision_chat_template.h new file mode 100644 index 00000000000..283080f9935 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/vision_chat_template.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include +#include + +const std::string IMG_TOKEN = ""; + +/** + * Special tokens structure for vision modality + */ +struct SpecialTokens { + std::string image_token; + std::string global_img; + std::string fake_wrap_start; + std::string fake_wrap_end; +}; + +/** + * Get special tokens based on model version + */ +inline SpecialTokens get_special_tokens( + example::VisionLanguageModel model_version) { + SpecialTokens tokens; + + switch (model_version) { + case example::VisionLanguageModel::kSmolvlm: + tokens.image_token = ""; + tokens.global_img = ""; + tokens.fake_wrap_start = ""; + tokens.fake_wrap_end = ""; + break; + case example::VisionLanguageModel::kInternvl3: + tokens.image_token = ""; + tokens.global_img = ""; + tokens.fake_wrap_start = ""; + tokens.fake_wrap_end = ""; + break; + default: + break; + } + + return tokens; +} + +/** + * Expand image tokens in prompt with model-specific wrapping tokens + * Replaces each token with the full format including special wrapper + * tokens + */ +inline std::string expand_image_tokens( + const std::string& prompt, + const SpecialTokens& specials) { + // Create image prompt with repeated image tokens + std::string image_prompt = specials.fake_wrap_start; + image_prompt += specials.global_img; + image_prompt += specials.image_token; + image_prompt += specials.fake_wrap_end; + + // Replace single image token with expanded version + size_t pos = 0; + std::string expanded = prompt; + while ((pos = expanded.find(IMG_TOKEN, pos)) != std::string::npos) { + expanded.replace(pos, IMG_TOKEN.size(), image_prompt); + pos += image_prompt.size(); + } + ET_LOG(Info, "Prompt after expanding image token: %s", expanded.c_str()); + + return expanded; +} + +/** + * Format prompt based on model version with multimodal token expansion + */ +inline std::string apply_chat_template( + const std::string& system_prompt, + const std::string& prompt, + example::VisionLanguageModel model_version) { + 
std::string formatted_prompt; + SpecialTokens specials = get_special_tokens(model_version); + + switch (model_version) { + case example::VisionLanguageModel::kSmolvlm: { + if (!system_prompt.empty()) { + formatted_prompt.append( + "<|start_header_id|>system<|end_header_id|>\n\n"); + formatted_prompt.append(system_prompt); + formatted_prompt.append("<|eot_id|>"); + } + formatted_prompt.append("<|im_start|>User:"); + formatted_prompt.append(expand_image_tokens(prompt, specials)); + formatted_prompt.append("\nAssistant:"); + break; + } + case example::VisionLanguageModel::kInternvl3: { + if (!system_prompt.empty()) { + formatted_prompt.append("<|im_start|>system\n"); + formatted_prompt.append(system_prompt); + formatted_prompt.append("<|im_end|>\n"); + } + formatted_prompt.append("<|im_start|>user:\n"); + formatted_prompt.append(expand_image_tokens(prompt, specials)); + formatted_prompt.append("<|im_end|>assistant\n"); + break; + } + default: + ET_CHECK_MSG(false, "unsupported VLM version"); + break; + } + return formatted_prompt; +} diff --git a/examples/qualcomm/oss_scripts/llama/tokenizer.py b/examples/qualcomm/oss_scripts/llama/tokenizer.py index b55cd61d616..3befa71168b 100644 --- a/examples/qualcomm/oss_scripts/llama/tokenizer.py +++ b/examples/qualcomm/oss_scripts/llama/tokenizer.py @@ -7,7 +7,9 @@ import argparse import json import logging -from typing import Callable +import re +import warnings +from typing import Callable, List from executorch.examples.qualcomm.oss_scripts.llama import LLMModelConfig from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import ( @@ -18,16 +20,19 @@ from transformers import AutoTokenizer +IMG_TOKEN = "" +AUDIO_TOKEN = "