diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index dd164593134..82a2ebbf810 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -6703,7 +6703,7 @@ class MLLMSpecs: sm8650_token_rate: float sm8750_token_rate: float encoder_pte_size: float - text_embedding_pte_size: float + tok_embedding_pte_size: float decoder_pte_size: float @dataclass(frozen=True) @@ -6719,7 +6719,7 @@ def setUp(self): sm8650_token_rate=50, sm8750_token_rate=55, encoder_pte_size=110_000_000, # 110MB - text_embedding_pte_size=100_000_000, # 100MB + tok_embedding_pte_size=100_000_000, # 100MB decoder_pte_size=400_000_000, # 400MB image_path="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg", # New York Bay golden_image_feature="city", @@ -6729,7 +6729,7 @@ def setUp(self): sm8650_token_rate=11, sm8750_token_rate=13, encoder_pte_size=425_000_000, # 425MB - text_embedding_pte_size=300_000_000, # 300MB + tok_embedding_pte_size=300_000_000, # 300MB decoder_pte_size=550_000_000, # 550 MB image_path="http://images.cocodataset.org/val2017/000000039769.jpg", # Two cats lying on a blanket golden_image_feature="cats", @@ -6801,16 +6801,16 @@ def test_static_vlm(self): print(f"Answer: {model_out}") if not self.enable_x86_64: encoder_pte_size = msg["encoder_pte_size"] - text_embedding_pte_size = msg["text_embedding_pte_size"] + tok_embedding_pte_size = msg["tok_embedding_pte_size"] decoder_pte_size = msg["pte_size"] self.assertLessEqual(encoder_pte_size, vlm_specs.encoder_pte_size) self.assertLessEqual( - text_embedding_pte_size, vlm_specs.text_embedding_pte_size + tok_embedding_pte_size, vlm_specs.tok_embedding_pte_size ) self.assertLessEqual(decoder_pte_size, vlm_specs.decoder_pte_size) print(f"Encoder PTE Size: {encoder_pte_size} bytes") - print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes") - print(f"Decoder PTE Size: 
{decoder_pte_size} bytes") + print(f"Token Embedding PTE Size: {tok_embedding_pte_size} bytes") + print(f"Text Decoder PTE Size: {decoder_pte_size} bytes") attr_name = f"{self.model.lower()}_token_rate" if ( diff --git a/examples/qualcomm/oss_scripts/llama/CMakeLists.txt b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt index 78a8410f034..5c6939546a6 100644 --- a/examples/qualcomm/oss_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/oss_scripts/llama/CMakeLists.txt @@ -94,12 +94,15 @@ list( ${CMAKE_CURRENT_LIST_DIR}/qnn_multimodal_runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_embedding_merger.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_embedding_merger.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/utils.h ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.h - ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.cpp - ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.h - ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.cpp - ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_runner.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_runner.h + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_processor.cpp + ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/tok_embedding_processor.h ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.cpp ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.h ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.cpp diff --git a/examples/qualcomm/oss_scripts/llama/README.md 
b/examples/qualcomm/oss_scripts/llama/README.md index 0a4629ac132..fb926e9f613 100644 --- a/examples/qualcomm/oss_scripts/llama/README.md +++ b/examples/qualcomm/oss_scripts/llama/README.md @@ -308,6 +308,37 @@ If you have already compiled a VLM model, you can run inference with pre-generat python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 1024 --prompt "Can you describe this image?" --image_path "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} ``` +### Multi-Turn Conversation with VLM + +The framework supports multi-turn conversations with VLMs, allowing you to conduct dialogues that can involve multiple images. + +- **Multi-Turn Prompts**: To engage in a conversation, provide multiple prompts sequentially using the `--prompt` argument. Each string will be treated as a separate turn. +- **Multiple Images**: You can supply multiple images (from URLs or local paths) using the `--image_path` argument. +- **Flexible Image Placement**: Use the `<image>` token within your prompt to specify exactly where each image's embeddings should be placed. The images provided via `--image_path` will replace the `<image>` tokens in the order they appear. + +**Example**: + +In this example, the first turn compares two images, the second turn asks a follow-up question about the first image, and the third turn asks for a caption for a third image. + +```bash +# Define image URLs and prompts for a 3-turn conversation +IMAGE1_URL="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" +IMAGE2_URL="http://images.cocodataset.org/val2017/000000039769.jpg" +IMAGE3_URL="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg" + +PROMPT1="Compare these images above and list the differences."
+PROMPT2="Answer the question: What's the main object in first image?" +PROMPT3="Caption this image." + +# Execute the multi-turn conversation +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model smolvlm_500m_instruct --model_mode kv --max_seq_len 2048 --prompt "$PROMPT1" "$PROMPT2" "$PROMPT3" --image_path "$IMAGE1_URL" "$IMAGE2_URL" "$IMAGE3_URL" +``` + +**How it works:** +- **Turn 1**: The prompt `"Compare these images above and list the differences."` uses the first two images (`$IMAGE1_URL`, `$IMAGE2_URL`). +- **Turn 2**: The prompt `"Answer the question: What's the main object in first image?"` is a text-only follow-up. The conversation context is maintained from the previous turn. +- **Turn 3**: The prompt `"Caption this image."` uses the third image (`$IMAGE3_URL`). + ### VLM Processing Details The VLM inference pipeline consists of: diff --git a/examples/qualcomm/oss_scripts/llama/dataset.py b/examples/qualcomm/oss_scripts/llama/dataset.py index 72f9e5d766a..2994baaafaa 100644 --- a/examples/qualcomm/oss_scripts/llama/dataset.py +++ b/examples/qualcomm/oss_scripts/llama/dataset.py @@ -5,17 +5,15 @@ # LICENSE file in the root directory of this source tree. 
import argparse -import warnings -from typing import Callable, List, Optional +from typing import Callable, Dict, List, Optional from executorch.examples.qualcomm.oss_scripts.llama import LLMModelConfig from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import ( AUDIO_ENCODER, TEXT_DECODER, - TEXT_EMBEDDING, TEXT_ENCODER, + TOK_EMBEDDING, VISION_ENCODER, - VISION_ENCODER_INPUT_FILENAME, ) from executorch.examples.qualcomm.oss_scripts.llama.encoder.encoder_config import ( @@ -23,7 +21,6 @@ VisionModalityConfig, ) from executorch.examples.qualcomm.oss_scripts.llama.tokenizer import TokenizerWrapper - from transformers import AutoProcessor from transformers.image_utils import load_image @@ -43,35 +40,30 @@ def __init__( self.artifact = control_args.artifact self.repo_id = config.repo_id - def _build_vision_dataset(self, config: VisionModalityConfig, prompt: str): + def _build_vision_dataset( + self, config: VisionModalityConfig, prompt: str, files_path: List[str] + ): """ This will processes images using the HuggingFace processor and saves the processed pixel values for runtime evaluation. Args: config (VisionModalityConfig): containing image URL and resize parameters - prompt (str): Text prompt to be processed alongside the image + prompt (str): Text prompt + files_path (List[str]): List of file paths for images. Each path can be either a URL or a local file path. Returns: tuple of pixel values tensors """ - # Load image from user-specified path (URL or local file) - # fall back to the default image URL if no image is provided. - image_path = self.control_args.image_path or config.img_url - if not self.control_args.image_path: - warnings.warn( - f"No image path/URL provided, using default image URL: {config.img_url}", - UserWarning, - stacklevel=1, - ) - image = load_image(image_path) + + images = [load_image(image_path) for image_path in files_path] # Process image with text prompt using HuggingFace processor # Some HF processors (e.g. 
InternVL3) need to pass text arg or it will cause error and process failed processor = AutoProcessor.from_pretrained(self.repo_id) pixel_values = processor( text=prompt, - images=[image], + images=images, return_tensors="pt", crop_to_patches=False, size={ @@ -80,19 +72,26 @@ def _build_vision_dataset(self, config: VisionModalityConfig, prompt: str): }, ).pixel_values - # save image file for runtime evaluation - pixel_values.detach().numpy().tofile( - f"{self.artifact}/{VISION_ENCODER_INPUT_FILENAME}.raw" + assert pixel_values.dim() in (4, 5), ( + f"Unsupported pixel_values dim={pixel_values.dim()}; " + f"expected 5D (1,N,C,H,W) or 4D (N,C,H,W)." ) - return (pixel_values,) + + # HTP Prepare failed when pixel_values has 5D dimension, so we squeeze the batch dimension here. + if pixel_values.dim() == 5: + pixel_values = pixel_values.squeeze(0) # (N, C, H, W) + + # save image file for runtime evaluation + return [(pixel_values[i][None, ...],) for i in range(len(pixel_values))] def _build_dataset_for_encoder( self, config: MultiModalityConfig, prompt: str, + files_path: List[str], ) -> Optional[tuple]: if issubclass(config, VisionModalityConfig): - return self._build_vision_dataset(config, prompt) + return self._build_vision_dataset(config, prompt, files_path) else: # Audio and text encoder dataset building are not yet implemented # TODO: Add support for AudioModalityConfig and TextModalityConfig @@ -106,22 +105,33 @@ def prepare_calibration_dataset( prompts: List[str], chat_template: Callable, ): - calibration_data = { - AUDIO_ENCODER: [], - TEXT_ENCODER: [], - VISION_ENCODER: [], - TEXT_EMBEDDING: [], - TEXT_DECODER: [], + # 1. Initialize data + # Shape convention: (num_samples, num_turns). + # Currently, user prompt calibration is one-shot per prompt (num_samples = 1). + calibration_data: Dict[str, List[List]] = { + # Encoders / embeddings: initialize an empty turn list for each prompt.
+ AUDIO_ENCODER: [[] for _ in range(len(prompts))], + TEXT_ENCODER: [[] for _ in range(len(prompts))], + VISION_ENCODER: [[] for _ in range(len(prompts))], + TOK_EMBEDDING: [[] for _ in range(len(prompts))], + # Decoder targets: one string per prompt. + TEXT_DECODER: ["" for _ in range(len(prompts))], } + # 2. Prepare messages for multi-turn conversation + messages = self.tokenizer_wrapper.prepare_messages(prompts) + + # 3. build dataset by modality is_multimodal = any( [ hasattr(self.config, AUDIO_ENCODER), hasattr(self.config, VISION_ENCODER), ] ) - for prompt in prompts: - # Apply chat template formatting if available (for instruction-tuned/reasoning models) + for turn_idx, message in enumerate(messages): + prompt = message["text"] + + # 3.1. Apply chat template formatting if available (for instruction-tuned/reasoning models) prompt = ( self.tokenizer_wrapper.apply_prompt_template( chat_template, prompt, self.control_args.system_prompt @@ -130,16 +140,19 @@ def prepare_calibration_dataset( else prompt ) - # Build calibration datasets for each available encoder modality + # 3.2 Build calibration datasets for each available encoder modality for modality in [AUDIO_ENCODER, TEXT_ENCODER, VISION_ENCODER]: - if hasattr(self.config, modality): - data = self._build_dataset_for_encoder( - getattr(self.config, modality), - prompt, - ) - calibration_data[modality].append(data) - - # Expand multimodal tokens in prompt for decoder + if not hasattr(self.config, modality) or not message["files_path"]: + continue + + data = self._build_dataset_for_encoder( + getattr(self.config, modality), + prompt, + message["files_path"], + ) + calibration_data[modality][turn_idx] = data + + # 3.3. 
Expand multimodal tokens in prompt for decoder prompt = ( self.tokenizer_wrapper.prepare_multimodal_prompt(prompt) if is_multimodal @@ -147,6 +160,6 @@ def prepare_calibration_dataset( ) # Add prompt to decoder calibration data - calibration_data[TEXT_DECODER].append(prompt) + calibration_data[TEXT_DECODER][turn_idx] = prompt return calibration_data diff --git a/examples/qualcomm/oss_scripts/llama/decoder_constants.py b/examples/qualcomm/oss_scripts/llama/decoder_constants.py index 74e3959a86e..7a4c3e20be6 100644 --- a/examples/qualcomm/oss_scripts/llama/decoder_constants.py +++ b/examples/qualcomm/oss_scripts/llama/decoder_constants.py @@ -12,20 +12,21 @@ TASKS_EVAL = "tasks_eval" SQNR_EVAL = "sqnr_eval" -# filenames for vision model -VISION_ENCODER_INPUT_FILENAME = "vision_encoder_input" - - # Component identifiers AUDIO_ENCODER = "audio_encoder" VISION_ENCODER = "vision_encoder" TEXT_ENCODER = "text_encoder" -TEXT_EMBEDDING = "text_embedding" +TOK_EMBEDDING = "tok_embedding" TEXT_DECODER = "text_decoder" ATTENTION_SINK_EVICTOR = "attention_sink_evictor" +# Mapping of input flags for the runner +MODALITY_INPUT_FLAG_MAP = { + VISION_ENCODER: "image_path", +} + # Text embedding graph names -TEXT_EMBEDDING_GRAPH_NAMES = [ +TOK_EMBEDDING_GRAPH_NAMES = [ "tok_embedding_kv_forward", "tok_embedding_prefill_forward", ] diff --git a/examples/qualcomm/oss_scripts/llama/decoder_runtime_evaluator.py b/examples/qualcomm/oss_scripts/llama/decoder_runtime_evaluator.py index ff173b1f753..55d7409a1e6 100644 --- a/examples/qualcomm/oss_scripts/llama/decoder_runtime_evaluator.py +++ b/examples/qualcomm/oss_scripts/llama/decoder_runtime_evaluator.py @@ -20,16 +20,20 @@ ATTENTION_SINK_EVICTOR, DECODER_MODEL_VERSION, EVAL_MODE, + MODALITY_INPUT_FLAG_MAP, TEXT_DECODER, - TEXT_EMBEDDING, + TOK_EMBEDDING, VISION_ENCODER, - VISION_ENCODER_INPUT_FILENAME, ) from executorch.examples.qualcomm.oss_scripts.llama.decoder_utils import ( INFERENCE_REGISTRY, retrieve_info_from_pte, ) -from 
executorch.examples.qualcomm.utils import make_output_dir, SimpleADB +from executorch.examples.qualcomm.utils import ( + generate_inputs, + make_output_dir, + SimpleADB, +) from pytorch_tokenizers.hf_tokenizer import HuggingFaceTokenizer from pytorch_tokenizers.llama2c import Llama2cTokenizer as SentencePieceTokenizer from pytorch_tokenizers.tiktoken import TiktokenTokenizer @@ -82,19 +86,20 @@ def __init__( args: argparse.Namespace, pte_paths: Dict, runtime_tokenizer_path: str, - is_modality: bool, + is_multimodal: bool, + modality_inputs=None, ): self.args = args self.pte_paths = pte_paths self.runtime_tokenizer_path = runtime_tokenizer_path self.qnn_sdk = os.getenv("QNN_SDK_ROOT") - self.is_modality = is_modality + self.is_multimodal = is_multimodal self.device_workspace = ( f"/data/local/tmp/{getpass.getuser()}/executorch/static_llm" ) self.runner = ( - "qnn_multimodal_runner" if self.is_modality else "qnn_llama_runner" + "qnn_multimodal_runner" if self.is_multimodal else "qnn_llama_runner" ) device_output_path = self._get_adb().output_folder if args.enable_x86_64: @@ -130,14 +135,13 @@ def _init_runner_base_cmd(self): f"--performance_output_path {self.device_performance_path}", ] ) - if self.is_modality: + if self.is_multimodal: base_cmd = " ".join( [ base_cmd, f"--decoder_path {self.pte_paths[TEXT_DECODER]}", f"--encoder_path {self.pte_paths[VISION_ENCODER]}", - f"--embedding_path {self.pte_paths[TEXT_EMBEDDING]}", - f"--image_path {args.artifact}/{VISION_ENCODER_INPUT_FILENAME}.raw", + f"--tok_embedding_path {self.pte_paths[TOK_EMBEDDING]}", ] ) else: @@ -165,14 +169,13 @@ def _init_runner_base_cmd(self): ] ) - if self.is_modality: + if self.is_multimodal: base_cmd = " ".join( [ base_cmd, f"--decoder_path {os.path.basename(self.pte_paths[TEXT_DECODER])}", f"--encoder_path {os.path.basename(self.pte_paths[VISION_ENCODER])}", - f"--embedding_path {os.path.basename(self.pte_paths[TEXT_EMBEDDING])}", - f"--image_path {VISION_ENCODER_INPUT_FILENAME}.raw", + 
f"--tok_embedding_path {os.path.basename(self.pte_paths[TOK_EMBEDDING])}", ] ) else: @@ -218,11 +221,47 @@ def run(self) -> Any: class DefaultEval(EvalBase): - def __init__(self, args, pte_paths, runtime_tokenizer_path, is_modality): - super().__init__(args, pte_paths, runtime_tokenizer_path, is_modality) + def __init__( + self, + args, + pte_paths, + runtime_tokenizer_path, + is_multimodal, + modality_inputs=None, + ): + super().__init__( + args, pte_paths, runtime_tokenizer_path, is_multimodal, modality_inputs + ) self.adb = self._get_adb() self.inference_speed = 0 + modality_input_cmd = [] + self.modality_input_files = [] + for modality, data in (modality_inputs or {}).items(): + if ( + not modality_inputs[modality] + or modality not in MODALITY_INPUT_FLAG_MAP + or modality == TEXT_DECODER + ): + continue + + # Specify the input list filename by its modality. + # This helps distinguish inputs coming from different encoders, + # especially in models like OMI where vision and audio encoders coexist.
+ input_list_filename = f"{modality}_input_list.txt" + input_list_file, input_files = generate_inputs( + self.args.artifact, + input_list_filename=input_list_filename, + inputs=data, + prefix_input_filename=modality, + ) + self.modality_input_files.append(input_list_file) + self.modality_input_files.extend(input_files) + modality_input_cmd.append( + f"--{MODALITY_INPUT_FLAG_MAP[modality]} {input_list_filename}" + ) + self.modality_input_cmd = " ".join(modality_input_cmd) + lookahead_args = " ".join( [ f"--window {args.window}", @@ -282,10 +321,9 @@ def run(self, prompt): ) extra_files = [self.runtime_tokenizer_path] - if self.is_modality: - extra_files = extra_files + [ - f"{self.args.artifact}/{VISION_ENCODER_INPUT_FILENAME}.raw" - ] + if self.is_multimodal: + extra_files.extend(self.modality_input_files) + runner_cmd = " ".join([runner_cmd, self.modality_input_cmd]) self.adb.push(inputs=[], files=extra_files) self.adb.execute(custom_runner_cmd=runner_cmd) self.adb.pull( @@ -324,9 +362,9 @@ def __init__( pte_paths, tokenizer, runtime_tokenizer_path, - is_modality, + is_multimodal, ): - super().__init__(args, pte_paths, runtime_tokenizer_path, is_modality) + super().__init__(args, pte_paths, runtime_tokenizer_path, is_multimodal) self.inference_speed = 0 self.source_model = source_model self.get_example_inputs = get_example_inputs @@ -613,12 +651,14 @@ def _model_call(self, inps): self.inference_speed = output_performance_holder[0] return output_logits_holder[0][:, :seq_len, :] - def __init__(self, args, pte_paths, tokenizer, runtime_tokenizer_path, is_modality): + def __init__( + self, args, pte_paths, tokenizer, runtime_tokenizer_path, is_multimodal + ): super().__init__( args=args, pte_paths=pte_paths, runtime_tokenizer_path=runtime_tokenizer_path, - is_modality=is_modality, + is_multimodal=is_multimodal, ) self.inference_speed = None self.tasks = args.tasks diff --git a/examples/qualcomm/oss_scripts/llama/decoder_utils.py 
b/examples/qualcomm/oss_scripts/llama/decoder_utils.py index 20a7ab99c8d..d3261e1bb68 100644 --- a/examples/qualcomm/oss_scripts/llama/decoder_utils.py +++ b/examples/qualcomm/oss_scripts/llama/decoder_utils.py @@ -41,7 +41,7 @@ def _modality_inputs_merger( input_ids: torch.LongTensor, inputs_embeds: torch.Tensor, image_hidden_states: torch.Tensor, - modality_placeholder_token_id, + image_token_id, ): """ This method aims at merging the token embeddings with the image hidden states into one single sequence of vectors that are fed to the transformer LM. @@ -53,7 +53,7 @@ def _modality_inputs_merger( - To fit the format of that sequence, `input_ids`, `input_embeds`, `attention_mask` are all 3 adapted to insert the image hidden states. """ - special_image_mask = input_ids == modality_placeholder_token_id + special_image_mask = input_ids == image_token_id special_image_mask = ( special_image_mask.unsqueeze(-1) .expand_as(inputs_embeds) @@ -501,7 +501,7 @@ def _generate( pos, module: torch.fx.GraphModule, tokenizer, - text_embedding, + tok_embedding, ar_len: int, max_seq_len: int, k_caches, @@ -527,7 +527,7 @@ def _generate( if inputs.input_ids is None: # Get text_embedding - embedding = text_embedding(tmp_token_list) + embedding = tok_embedding(tmp_token_list) # Prepare tmp_pos (padded with zeros). 
tmp_pos = torch.zeros((1, ar_len), dtype=torch.int32) @@ -606,7 +606,7 @@ def _generate( ) else: logits, new_k_caches, new_v_caches = module( - text_embedding( + tok_embedding( torch.tensor( input_tokens, dtype=inputs.input_ids_dtype ).unsqueeze(0) @@ -667,8 +667,8 @@ def kv_inference( # noqa: C901 module: torch.fx.GraphModule, tokenizer, tok_embedding=None, - hidden_states=None, - modality_placeholder_token_id=None, + hidden_states: Tuple = (), + image_token_id=None, ar_len=1, max_seq_len=512, use_i64_token=False, @@ -679,8 +679,7 @@ def kv_inference( # noqa: C901 is_multimodal = all( [ tok_embedding is not None, - hidden_states is not None, - modality_placeholder_token_id is not None, + image_token_id is not None, ] ) @@ -706,8 +705,9 @@ def kv_inference( # noqa: C901 # pyre-ignore prompt_token_list = prompt.flatten().tolist() - # 2. forward text embedding + # 2. process embedding if is_multimodal: + # 2.1 forward text embedding input_ids = torch.tensor([prompt_token_list]) input_ids = ( input_ids.to(torch.int64) if use_i64_token else input_ids.to(torch.int32) @@ -716,11 +716,12 @@ def kv_inference( # noqa: C901 padded_seq_len = max(input_ids_len, ar_len) padded_seq_len = ((padded_seq_len + ar_len - 1) // ar_len) * ar_len + embedding_dim = [p for _, p in tok_embedding.named_parameters()][0].shape[-1] text_embeddings = torch.zeros( ( 1, padded_seq_len, - hidden_states[0].shape[-1], + embedding_dim, ), dtype=torch.float32, ) @@ -745,12 +746,18 @@ def kv_inference( # noqa: C901 :, chunk_start_idx : chunk_start_idx + actual_chunk_len, : ] = embedding - multimodal_embedding = _modality_inputs_merger( - input_ids, - text_embeddings[:, :input_ids_len, :], # Only use actual prompt length - torch.cat(hidden_states, dim=1), - modality_placeholder_token_id, - ) + # 2.2 merge text and multimodality embedding + if hidden_states: + multimodal_embedding = _modality_inputs_merger( + input_ids, + text_embeddings[ + :, :input_ids_len, : + ], # Only use actual prompt length + 
torch.cat(hidden_states, dim=1), + image_token_id, + ) + else: + multimodal_embedding = text_embeddings[:, :input_ids_len, :] # record total input tokens and generated tokens total_token_list = prompt_token_list @@ -809,7 +816,7 @@ def prefill_inference( tokenizer, tok_embedding=None, hidden_states=None, - modality_placeholder_token_id=None, + image_token_id=None, max_seq_len=512, use_i64_token=False, collect_logits=False, @@ -818,7 +825,7 @@ def prefill_inference( [ tok_embedding is not None, hidden_states is not None, - modality_placeholder_token_id is not None, + image_token_id is not None, ] ) @@ -863,7 +870,7 @@ def prefill_inference( tmp_token_list, text_embeddings, torch.cat(hidden_states, dim=1), - modality_placeholder_token_id, + image_token_id, ) results = module(multimodal_embedding, *atten_mask) else: @@ -891,8 +898,8 @@ def graph_module_inference( max_seq_len=512, prompt=None, tok_embedding=None, - hidden_states=None, - modality_placeholder_token_id=None, + hidden_states: Tuple = (), + image_token_id=None, tasks=None, tasks_limit=1, num_fewshot=None, @@ -923,7 +930,7 @@ def graph_module_inference( tokenizer, tok_embedding=tok_embedding, hidden_states=hidden_states, - modality_placeholder_token_id=modality_placeholder_token_id, + image_token_id=image_token_id, max_seq_len=max_seq_len, use_i64_token=use_i64_token, collect_logits=False, @@ -941,7 +948,6 @@ def graph_module_inference( use_i64_token=use_i64_token, seq_mse_candidates=seq_mse_candidates, ) - # Evaluate the model with torch.no_grad(): eval_results = simple_evaluate( model=calibration_wrapper, diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index 2e7ae6d57d4..3200cdcd728 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -36,9 +36,9 @@ SQNR_EVAL, TASKS_EVAL, TEXT_DECODER, - TEXT_EMBEDDING, - TEXT_EMBEDDING_GRAPH_NAMES, TEXT_ENCODER, + TOK_EMBEDDING, + TOK_EMBEDDING_GRAPH_NAMES, 
VISION_ENCODER, ) from executorch.examples.qualcomm.oss_scripts.llama.decoder_runtime_evaluator import ( @@ -101,68 +101,80 @@ def compile( os.makedirs(args.artifact, exist_ok=True) multi_modal_mgr = MultiModalManager(control_args=args, config=decoder_model_config) - # perform ptq - multi_modal_mgr.quantize( - calibration_data=calibration_data, - tokenizer=tokenizer, - backend=get_backend_type(args.backend), - soc_model=args.model, + skip_quantize = {} + is_multimodal = any( + [ + hasattr(decoder_model_config, VISION_ENCODER), + hasattr(decoder_model_config, AUDIO_ENCODER), + ] ) - # Prepare dataset + # Prepare ptq option and compile spec compile_specs = { AUDIO_ENCODER: None, TEXT_ENCODER: None, VISION_ENCODER: None, - TEXT_EMBEDDING: None, + TOK_EMBEDDING: None, TEXT_DECODER: None, } - is_modality = False - # compile spec for multimodality encoder for modality in compile_specs: - if not hasattr(decoder_model_config, modality): - continue - - backend_options = generate_htp_compiler_spec( - use_fp16=False, - ) - encoder_compile_specs = generate_qnn_executorch_compiler_spec( - soc_model=get_soc_to_chipset_map()[args.model], - backend_options=backend_options, - ) - compile_specs[modality] = encoder_compile_specs - is_modality = True - - # text embedding compilation spec: default we use quantization version, since embedding is huge - if is_modality: - backend_options = generate_htp_compiler_spec( - use_fp16=False, - # x86 emulator does not support weight sharing - use_weight_sharing=not args.enable_x86_64, - ) - compile_specs[TEXT_EMBEDDING] = [ - generate_qnn_executorch_compiler_spec( + if is_multimodal and modality in {AUDIO_ENCODER, TEXT_ENCODER, VISION_ENCODER}: + # Encoder quantization is enabled only when the input contains a single image in each conversation. 
+ # In multi‑image scenarios, we skip encoder quantization by default to preserve modality feature quality, + # because the encoder is quite sensitive and quantization can make it harder for the model to distinguish + # between images within the same conversation. + to_skip = len(args.image_path) > 1 + backend_options = generate_htp_compiler_spec( + use_fp16=to_skip, + ) + encoder_compile_specs = generate_qnn_executorch_compiler_spec( soc_model=get_soc_to_chipset_map()[args.model], backend_options=backend_options, - shared_buffer=not args.enable_x86_64, # x86 emulator does not support shared buffer + # x86 emulator does not support shared buffer + shared_buffer=not args.enable_x86_64, ) - ] * len(TEXT_EMBEDDING_GRAPH_NAMES) - - # compile spec for text decoder - backend_options = generate_htp_compiler_spec( - use_fp16=False, - use_multi_contexts=decoder_model_config.num_sharding > 1, - # x86 emulator does not support weight sharing - use_weight_sharing=not args.enable_x86_64, - ) - compile_specs[TEXT_DECODER] = [ - generate_qnn_executorch_compiler_spec( - soc_model=get_soc_to_chipset_map()[args.model], - backend_options=backend_options, - shared_buffer=not args.enable_x86_64, - use_mha2sha=True, - ) - ] * len(DECODER_GRAPH_NAMES) + skip_quantize[modality] = to_skip + compile_specs[modality] = encoder_compile_specs + elif is_multimodal and modality == TOK_EMBEDDING: + backend_options = generate_htp_compiler_spec( + use_fp16=False, + # x86 emulator does not support weight sharing + use_weight_sharing=not args.enable_x86_64, + ) + compile_specs[modality] = [ + generate_qnn_executorch_compiler_spec( + soc_model=get_soc_to_chipset_map()[args.model], + backend_options=backend_options, + # x86 emulator does not support shared buffer + shared_buffer=not args.enable_x86_64, + ) + ] * len(TOK_EMBEDDING_GRAPH_NAMES) + elif modality == TEXT_DECODER: + # compile spec for text decoder + backend_options = generate_htp_compiler_spec( + use_fp16=False, + 
use_multi_contexts=decoder_model_config.num_sharding > 1, + # x86 emulator does not support weight sharing + use_weight_sharing=not args.enable_x86_64, + ) + compile_specs[modality] = [ + generate_qnn_executorch_compiler_spec( + soc_model=get_soc_to_chipset_map()[args.model], + backend_options=backend_options, + # x86 emulator does not support shared buffer + shared_buffer=not args.enable_x86_64, + use_mha2sha=True, + ) + ] * len(DECODER_GRAPH_NAMES) + + # perform ptq + multi_modal_mgr.quantize( + calibration_data=calibration_data, + skip_quantize=skip_quantize, + tokenizer=tokenizer, + backend=get_backend_type(args.backend), + soc_model=args.model, + ) # perform compilation multi_modal_mgr.compile(compile_specs=compile_specs, pte_filenames=pte_filenames) @@ -176,14 +188,18 @@ def inference( chat_template, text_decoder_pte_path: str, encoder_pte_path: str, - text_embedding_pte_path: str, + tok_embedding_pte_path: str, attention_sink_evictor_pte_path: str, + calibration_data, ): assert args.model_mode in EVAL_MODE, f"Unknown model_mode: {args.model_mode}." 
- is_modality = hasattr(decoder_model_config, VISION_ENCODER) or hasattr( - decoder_model_config, AUDIO_ENCODER + is_multimodal = any( + [ + hasattr(decoder_model_config, VISION_ENCODER), + hasattr(decoder_model_config, AUDIO_ENCODER), + ] ) pte_paths = {TEXT_DECODER: text_decoder_pte_path} eval_results = { @@ -200,17 +216,17 @@ def inference( } ) - if is_modality: + if is_multimodal: eval_results.update( { "encoder_pte_size": os.path.getsize(encoder_pte_path), - "text_embedding_pte_size": os.path.getsize(text_embedding_pte_path), + "tok_embedding_pte_size": os.path.getsize(tok_embedding_pte_path), } ) pte_paths.update( { VISION_ENCODER: encoder_pte_path, - TEXT_EMBEDDING: text_embedding_pte_path, + TOK_EMBEDDING: tok_embedding_pte_path, } ) @@ -219,7 +235,8 @@ def inference( args=args, pte_paths=pte_paths, runtime_tokenizer_path=runtime_tokenizer_path, - is_modality=is_modality, + is_multimodal=is_multimodal, + modality_inputs=calibration_data, ) output_prompt = prompt_evaluator.run(prompt=args.prompt) eval_results.update( @@ -232,7 +249,7 @@ def inference( logging.info(f"Device Inference Results[{idx}]:\n{output}") if SQNR_EVAL in args.eval_methods: - assert not is_modality, "Modality Model does not support SQNR_EVAL." + assert not is_multimodal, "Multimodal does not support SQNR_EVAL."
tokenizer_wrapper = TokenizerWrapper( args, decoder_model_config, @@ -255,7 +272,7 @@ def inference( pte_paths=pte_paths, tokenizer=tokenizer, runtime_tokenizer_path=runtime_tokenizer_path, - is_modality=is_modality, + is_multimodal=is_multimodal, ) sqnr, golden_logits, _ = sqnr_evaluator.run(prompt=prompt) logging.info(f"SQNR Eval Score between FP32 nn.Module and QNN: {sqnr}") @@ -280,7 +297,7 @@ def inference( pte_paths=pte_paths, tokenizer=tokenizer, runtime_tokenizer_path=runtime_tokenizer_path, - is_modality=is_modality, + is_multimodal=is_multimodal, ) qdq_sqnr, cpu_qdq_logits, _ = qdq_sqnr_evaluator.run(prompt=prompt) eval_results["qdq_sqnr"] = qdq_sqnr @@ -294,14 +311,14 @@ def inference( ) if TASKS_EVAL in args.eval_methods: - assert not is_modality, "Modality Model does not support TASKS_EVAL." + assert not is_multimodal, "Multimodal does not support TASKS_EVAL." # Generate the eval wrapper ppl_evaluator = TaskEval( args=args, pte_paths=pte_paths, tokenizer=tokenizer, runtime_tokenizer_path=runtime_tokenizer_path, - is_modality=is_modality, + is_multimodal=is_multimodal, ) ppl_eval_result = ppl_evaluator.run() eval_results["inference_speed"] = ppl_evaluator.inference_speed @@ -470,8 +487,9 @@ def _build_parser(): parser.add_argument( "--image_path", help="Path to the image file for multimodal language models (MLLM). If not specified, the default image from encoder/encoder_config.py will be used. 
The image should be preprocessed and saved in raw binary format.", - default=None, + default=[], type=str, + nargs="+", ) parser.add_argument( @@ -563,7 +581,7 @@ def export_llama(args) -> None: AUDIO_ENCODER: f"{AUDIO_ENCODER}_qnn", TEXT_ENCODER: f"{TEXT_ENCODER}_qnn", VISION_ENCODER: f"{VISION_ENCODER}_qnn", - TEXT_EMBEDDING: f"{TEXT_EMBEDDING}_qnn", + TOK_EMBEDDING: f"{TOK_EMBEDDING}_qnn", } # Prepare tokenizer tokenizer_wrapper = TokenizerWrapper( @@ -584,7 +602,7 @@ def export_llama(args) -> None: text_decoder_pte_path = f"{args.artifact}/{pte_filenames[TEXT_DECODER]}.pte" attention_sink_evictor_pte_path = f"{args.artifact}/{ATTENTION_SINK_EVICTOR}.pte" encoder_pte_path = f"{args.artifact}/{pte_filenames[VISION_ENCODER]}.pte" - text_embedding_pte_path = f"{args.artifact}/{pte_filenames[TEXT_EMBEDDING]}.pte" + tok_embedding_pte_path = f"{args.artifact}/{pte_filenames[TOK_EMBEDDING]}.pte" # TODO: Implement attention sink support for multimodal models (vision/audio). assert ( @@ -596,25 +614,14 @@ def export_llama(args) -> None: "Multimodal models currently do not support attention sink feature." ) - # TODO: Implement multi-turn conversation support for multimodal models (vision/audio). - assert ( - not ( - hasattr(decoder_model_config, VISION_ENCODER) - or hasattr(decoder_model_config, AUDIO_ENCODER) - ) - ) or (len(args.prompt) <= 1), ( - "Multimodal models currently do not support multi-turn. " - "Please set `--prompt` to 1 or switch to a unimodal (text-only) decoder." 
- ) - if args.pre_gen_pte: text_decoder_pte_path = f"{args.pre_gen_pte}/{pte_filenames[TEXT_DECODER]}.pte" attention_sink_evictor_pte_path = ( f"{args.pre_gen_pte}/{ATTENTION_SINK_EVICTOR}.pte" ) encoder_pte_path = f"{args.pre_gen_pte}/{pte_filenames[VISION_ENCODER]}.pte" - text_embedding_pte_path = ( - f"{args.pre_gen_pte}/{pte_filenames[TEXT_EMBEDDING]}.pte" + tok_embedding_pte_path = ( + f"{args.pre_gen_pte}/{pte_filenames[TOK_EMBEDDING]}.pte" ) if args.use_attention_sink: @@ -632,8 +639,9 @@ def export_llama(args) -> None: chat_template, text_decoder_pte_path, encoder_pte_path, - text_embedding_pte_path, + tok_embedding_pte_path, attention_sink_evictor_pte_path, + calibration_data, ) print(f"Finish the running pre_gen_pte from {args.pre_gen_pte}") return @@ -655,16 +663,36 @@ def export_llama(args) -> None: if args.compile_only: if args.ip and args.port != -1: - pte_path = f"{args.artifact}/{pte_filename}.pte" - pte_size = os.path.getsize(pte_path) - with Client((args.ip, args.port)) as conn: - conn.send( - json.dumps( - { - "pte_size": pte_size, - } - ) + # Prepare validation results for CI system + text_decoder_pte_path = f"{args.artifact}/{pte_filename}.pte" + text_decoder_pte_path = os.path.getsize(text_decoder_pte_path) + validation_results = { + "pte_size": text_decoder_pte_path, + } + if any( + [ + hasattr(decoder_model_config, VISION_ENCODER), + hasattr(decoder_model_config, AUDIO_ENCODER), + ] + ): + encoder_pte_path = ( + f"{args.artifact}/{pte_filenames[VISION_ENCODER]}.pte" + ) + tok_embedding_pte_path = ( + f"{args.artifact}/{pte_filenames[TOK_EMBEDDING]}.pte" ) + validation_results.update( + { + "encoder_pte_size": os.path.getsize(encoder_pte_path), + "tok_embedding_pte_size": os.path.getsize( + tok_embedding_pte_path + ), + } + ) + + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps(validation_results)) + print(f"Finish compile_only and save to {args.artifact}") return @@ -676,8 +704,9 @@ def export_llama(args) -> None: 
chat_template, text_decoder_pte_path, encoder_pte_path, - text_embedding_pte_path, + tok_embedding_pte_path, attention_sink_evictor_pte_path, + calibration_data, ) diff --git a/examples/qualcomm/oss_scripts/llama/model/embedding.py b/examples/qualcomm/oss_scripts/llama/model/embedding.py index 4956012baf0..8d1c92913c1 100644 --- a/examples/qualcomm/oss_scripts/llama/model/embedding.py +++ b/examples/qualcomm/oss_scripts/llama/model/embedding.py @@ -8,7 +8,7 @@ from torch import nn -class TextEmbedding(nn.Module): +class TokenEmbedding(nn.Module): def __init__( self, input_embedding_module, diff --git a/examples/qualcomm/oss_scripts/llama/model/static_llama.py b/examples/qualcomm/oss_scripts/llama/model/static_llama.py index e90e69e4114..3ccee2d7749 100755 --- a/examples/qualcomm/oss_scripts/llama/model/static_llama.py +++ b/examples/qualcomm/oss_scripts/llama/model/static_llama.py @@ -842,18 +842,9 @@ def __init__( use_i64_token=use_i64_token, ) - # Initialize modality placeholder token ID - # Default value of -1 indicates embeddings come from text encoder - # Note: Text encoder modality is not currently supported - self.modality_placeholder_token_id = kwargs.get( - "modality_placeholder_token_id", -1 - ) - - if self.modality_placeholder_token_id == -1: - raise NotImplementedError( - "Text encoder modality (modality_placeholder_token_id=-1) is not currently supported. " - "Please provide a valid modality_placeholder_token_id in kwargs." - ) + # Set the image token ID from keyword arguments. It defaults to None if not provided. + # If an ID is provided, it will be stored in the model's metadata. 
+        self.image_token_id = kwargs.get("image_token_id", None)
 
     def forward(
         self,
@@ -943,7 +934,8 @@ def get_example_inputs(self):
 
     def get_metadata(self):
         meta_data = super().get_metadata()
-        meta_data["modality_placeholder_token_id"] = self.modality_placeholder_token_id
+        if self.image_token_id is not None:
+            meta_data["image_token_id"] = self.image_token_id
         return meta_data
 
 
diff --git a/examples/qualcomm/oss_scripts/llama/model/vision_encoder.py b/examples/qualcomm/oss_scripts/llama/model/vision_encoder.py
index 431e28a20d6..5eb107d67b7 100644
--- a/examples/qualcomm/oss_scripts/llama/model/vision_encoder.py
+++ b/examples/qualcomm/oss_scripts/llama/model/vision_encoder.py
@@ -4,8 +4,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Tuple
-
 import torch
 
 from executorch.examples.qualcomm.utils import replace_module_with_custom_class
@@ -177,11 +175,6 @@ def __init__(
             extra_custom_kwargs={"config": config.vision_config},
         )
 
-    def preprocess(self, pixel_values: Tuple[torch.FloatTensor]) -> Tuple[torch.Tensor]:
-        # HTP Prepare failed when pixel_values has 5D dimension, so we squeeze the batch dimension here.
- pixel_values = pixel_values[0] - return (pixel_values.squeeze(0),) - def get_example_inputs(self): # pixel values - use config dimensions instead of hardcoded values return ( @@ -257,9 +250,6 @@ def __init__( self.img_resized_h = img_resized_h self.img_resized_w = img_resized_w - def preprocess(self, pixel_values: Tuple[torch.FloatTensor]) -> Tuple[torch.Tensor]: - return pixel_values - def get_example_inputs(self): # pixel values - use config dimensions instead of hardcoded values return ( diff --git a/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp index 7cadc0bb0dd..f6379d9243d 100644 --- a/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp @@ -15,21 +15,32 @@ */ #include +#include #include #include +#include #include #include +#include #include #include #include #include +using executorch::aten::ScalarType; +using executorch::extension::llm::Image; +using ::executorch::extension::llm::make_image_input; +using ::executorch::extension::llm::make_text_input; +using executorch::extension::llm::MultimodalInput; +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; + // Model paths DEFINE_string( - embedding_path, - "embedding.pte", - "Path to embedding model serialized in flatbuffer format."); + tok_embedding_path, + "tok_embedding.pte", + "Path to tok_embedding model serialized in flatbuffer format."); DEFINE_string( encoder_path, "encoder.pte", @@ -119,128 +130,11 @@ std::vector CollectPrompts(int argc, char** argv) { return prompts; } -/** - * Special tokens structure for different models - */ -struct SpecialTokens { - std::string image_token; - std::string global_img; - std::string fake_wrap_start; - std::string fake_wrap_end; -}; - -/** - * Get special tokens based on decoder model version - */ -SpecialTokens get_special_tokens( - example::MultimodalDecoderModelVersion 
decoder_model_version) { - SpecialTokens tokens; - - switch (decoder_model_version) { - case example::MultimodalDecoderModelVersion:: - kSmolvlm: // smolvlm_500m_instruct - tokens.image_token = ""; - tokens.global_img = ""; - tokens.fake_wrap_start = ""; - tokens.fake_wrap_end = ""; - break; - case example::MultimodalDecoderModelVersion::kInternvl3: // internvl3_1b - tokens.image_token = ""; - tokens.global_img = ""; - tokens.fake_wrap_start = ""; - tokens.fake_wrap_end = ""; - break; - default: - break; - } - - return tokens; -} - -/** - * Prepare multimodal token IDs by expanding image tokens - * This implements the logic from prepare_multimodal_token_ids in Python - */ -std::string prepare_multimodal_prompt( - const std::string& prompt, - int image_seq_len, - const SpecialTokens& specials) { - // Create image prompt with repeated image tokens - std::string image_prompt = specials.fake_wrap_start; - image_prompt += specials.global_img; - for (int i = 0; i < image_seq_len; ++i) { - image_prompt += specials.image_token; - } - image_prompt += specials.fake_wrap_end; - - // Replace single image token with expanded version - size_t pos = 0; - std::string expanded = prompt; - while ((pos = expanded.find(specials.image_token, pos)) != - std::string::npos) { - expanded.replace(pos, specials.image_token.size(), image_prompt); - pos += image_prompt.size(); - } - ET_LOG(Info, "Prompt after expanding image token: %s", expanded.c_str()); - - return expanded; -} - -/** - * Format prompt based on model version with multimodal token expansion - */ -std::string get_formatted_prompt( - const std::string& prompt, - const std::string& system_prompt, - example::MultimodalDecoderModelVersion decoder_model_version, - int32_t img_seq_len = 0) { - std::string formatted_prompt; - - // Get special tokens for this model - SpecialTokens specials = get_special_tokens(decoder_model_version); - - switch (decoder_model_version) { - case example::MultimodalDecoderModelVersion::kSmolvlm: - if 
(!system_prompt.empty()) { - formatted_prompt.append( - "<|start_header_id|>system<|end_header_id|>\n\n"); - formatted_prompt.append(system_prompt); - formatted_prompt.append("<|eot_id|>"); - } - formatted_prompt.append("<|im_start|>User:"); - formatted_prompt.append(specials.image_token); - formatted_prompt.append(prompt); - formatted_prompt.append("\nAssistant:"); - break; - case example::MultimodalDecoderModelVersion::kInternvl3: - if (!system_prompt.empty()) { - formatted_prompt.append("<|im_start|>system<|im_end|>\n\n"); - formatted_prompt.append(system_prompt); - formatted_prompt.append("<|im_end|>"); - } - formatted_prompt.append("<|im_start|>user:\n"); - formatted_prompt.append(specials.image_token); - formatted_prompt.append("\n"); - formatted_prompt.append(prompt); - formatted_prompt.append("<|im_end|>assistant\n"); - break; - default: - ET_CHECK_MSG(false, "unsupported VLM version"); - break; - } - - // Expand image tokens - formatted_prompt = - prepare_multimodal_prompt(formatted_prompt, img_seq_len, specials); - - return formatted_prompt; -} - template void start_multimodal_runner( - std::unique_ptr encoder_runner, - std::unique_ptr module, - std::unique_ptr embedding, + std::unique_ptr encoder, + std::unique_ptr tok_embedding, + std::unique_ptr text_decoder, std::vector& prompts) { ET_LOG(Info, "Starting multimodal runner"); @@ -248,32 +142,12 @@ void start_multimodal_runner( gflags::GetCommandLineFlagInfoOrDie("tokenized_prompt").is_default ? 
false : true; - // Load image, run encoder forward pass, and set image hidden states if - // provided - bool has_image = !FLAGS_image_path.empty(); - - // Load encoder - if (encoder_runner->load() != executorch::runtime::Error::Ok) { - ET_LOG(Error, "Failed to load encoder"); - return; - } - - // Encode image from file - auto encode_result = - encoder_runner->encode_from_file(FLAGS_image_path.c_str()); - if (!encode_result.ok()) { - ET_LOG(Error, "Failed to encode image"); - return; - } - - auto image_hidden_states = encode_result.get(); - // Create multimodal runner - example::MultimodalRunner runner( - std::move(module), - std::move(embedding), + example::QNNMultimodalRunner runner( + std::move(encoder), + std::move(tok_embedding), + std::move(text_decoder), FLAGS_decoder_model_version.c_str(), - FLAGS_decoder_path.c_str(), FLAGS_tokenizer_path.c_str(), FLAGS_dump_logits_path.c_str(), FLAGS_performance_output_path.c_str(), @@ -282,23 +156,25 @@ void start_multimodal_runner( FLAGS_shared_buffer, FLAGS_ngram, FLAGS_window, - FLAGS_gcap, - std::make_unique(image_hidden_states)); + FLAGS_gcap); + + auto model_version = runner.get_model_version().get(); - auto decoder_model_version = runner.get_decoder_model_version(); + if (modality_of(model_version) == example::Modality::kVision) { + ET_CHECK_MSG( + !FLAGS_image_path.empty(), + "For VLM models, please specify image path."); + } // Prepare output buffer (similar to qnn_llama_runner.cpp) std::vector buf; buf.reserve(5 * FLAGS_seq_len); // assume each token is around 5 char std::ofstream fout(FLAGS_output_path.c_str()); - auto callback = [&](const std::string& piece) { for (const char c : piece) { buf.push_back(c); } }; - - // Configure generation executorch::extension::llm::GenerationConfig config{ true, false, @@ -309,25 +185,45 @@ void start_multimodal_runner( 0, 0}; - // Get image sequence length from encoder - int32_t img_seq_len = encoder_runner->get_image_seq_len(); - if (use_tokenized_prompt) { - 
runner.generate_from_prompt_or_file( - FLAGS_tokenizer_path.c_str(), use_tokenized_prompt, config, callback); - } else { - // generate tokens & store inference output - for (int i = 0; i < FLAGS_num_iters; i++) { - for (size_t j = 0; j < prompts.size(); ++j) { - const auto& prompt = prompts[j]; - std::string formatted_prompt; - formatted_prompt = get_formatted_prompt( - prompt, - FLAGS_system_prompt, - decoder_model_version.get(), - img_seq_len); - runner.generate_from_prompt_or_file( - formatted_prompt.c_str(), use_tokenized_prompt, config, callback); + // 1. [Multi-modality] Get raw files from input_list.txt + std::vector raw_files = + example::load_raw_files(FLAGS_image_path.c_str()); + + // 2. Prepare messages for multi-turn simulation + std::vector messages = prepare_messages(prompts, raw_files); + + // 3. Get expected input size/dtype for encoder + Result method_meta = runner.get_encoder_method_meta(); + auto input_meta_result = method_meta->input_tensor_meta(0); + std::vector expected_size( + input_meta_result->sizes().begin(), input_meta_result->sizes().end()); + ScalarType expected_dtype = input_meta_result->scalar_type(); + + // TODO: add use_tokenized_prompt for enable running static Llama models + // inside LlamaDemo Android + // 4. 
generate tokens & store inference output + for (int i = 0; i < FLAGS_num_iters; i++) { + for (size_t j = 0; j < messages.size(); ++j) { + const auto& prompt = messages[j].text; + const std::vector files_path = messages[j].files_path; + + // 4.1 prepare image input + std::vector inputs; + if (modality_of(model_version) == example::Modality::kVision) { + for (const std::string& file_path : files_path) { + Image image; + example::load_image(file_path, image, expected_size, expected_dtype); + inputs.emplace_back(make_image_input(image)); + } } + + // 4.2 prepare prompt input + std::string formatted_prompt = + apply_chat_template(prompt, FLAGS_system_prompt, model_version); + inputs.emplace_back(make_text_input(formatted_prompt)); + + // 4.3 generate text + runner.generate(inputs, config, callback); } } fout.write(buf.data(), buf.size()); @@ -346,22 +242,24 @@ int main(int argc, char** argv) { ET_CHECK_MSG( false, "Only TokenGenerator(kv) mode is supported to dump all logits."); } - ET_LOG(Info, "Embedding: %s", FLAGS_embedding_path.c_str()); - ET_LOG(Info, "Encoder: %s", FLAGS_encoder_path.c_str()); - ET_LOG(Info, "Decoder: %s", FLAGS_decoder_path.c_str()); - // Create encoder runner - std::unique_ptr encoder_runner = - std::make_unique(FLAGS_encoder_path.c_str()); + // Load encoder + ET_LOG(Info, "Load Encoder: %s", FLAGS_encoder_path.c_str()); + std::unique_ptr encoder = + std::make_unique( + FLAGS_encoder_path.c_str(), + executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors); - // load embedding - std::unique_ptr embedding = + // Load token embedding + ET_LOG(Info, "Load Token Embedding: %s", FLAGS_tok_embedding_path.c_str()); + std::unique_ptr tok_embedding = std::make_unique( - FLAGS_embedding_path.c_str(), + FLAGS_tok_embedding_path.c_str(), executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors); - // load decoder - std::unique_ptr module = + // Load text decoder + ET_LOG(Info, "Load Text Decoder: %s", FLAGS_decoder_path.c_str()); + 
std::unique_ptr text_decoder = std::make_unique( FLAGS_decoder_path.c_str(), executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors); @@ -369,22 +267,25 @@ int main(int argc, char** argv) { // Using 8bit as default since this meta is introduced with 16bit kv io // support and older models only have 8bit kv io. example::KvBitWidth kv_bitwidth = example::KvBitWidth::kWidth8; - if (module->method_names()->count("get_kv_io_bit_width") > 0) { + if (text_decoder->method_names()->count("get_kv_io_bit_width") > 0) { kv_bitwidth = static_cast( - module->get("get_kv_io_bit_width").get().toScalar().to()); + text_decoder->get("get_kv_io_bit_width") + .get() + .toScalar() + .to()); } // Start runner with appropriate KV bitwidth if (kv_bitwidth == example::KvBitWidth::kWidth8) { start_multimodal_runner( - std::move(encoder_runner), - std::move(module), - std::move(embedding), + std::move(encoder), + std::move(tok_embedding), + std::move(text_decoder), prompts); } else if (kv_bitwidth == example::KvBitWidth::kWidth16) { start_multimodal_runner( - std::move(encoder_runner), - std::move(module), - std::move(embedding), + std::move(encoder), + std::move(tok_embedding), + std::move(text_decoder), prompts); } else { ET_CHECK_MSG( diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/chat_template.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/chat_template.h new file mode 100644 index 00000000000..1b8fb57408d --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/chat_template.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+#pragma once
+
+#include <string>
+#include <variant>
+#include <vector>
+#include <executorch/runtime/platform/log.h>
+#include <executorch/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/utils.h>
+
+/**
+ * Message structure for multi-turn conversations
+ */
+struct Message {
+  size_t id;
+  std::string text;
+  std::vector<std::string> files_path;
+};
+
+/**
+ * Prepare messages for multi-turn simulation
+ * This function validates that the number of image tokens matches the number of
+ * images, and distributes images across messages based on image token
+ * positions.
+ */
+inline std::vector<Message> prepare_messages(
+    std::vector<std::string>& prompts,
+    const std::vector<std::string>& image_paths) {
+  size_t num_images = image_paths.size();
+  size_t total_image_tokens = 0;
+
+  // Count total image tokens across all prompts
+  for (const auto& prompt : prompts) {
+    size_t pos = 0;
+    while ((pos = prompt.find(IMG_TOKEN, pos)) != std::string::npos) {
+      total_image_tokens++;
+      pos += IMG_TOKEN.length();
+    }
+  }
+
+  // If no image tokens but images provided, prepend image tokens to prompt in
+  // first turn and check the number of image tokens given by user are equal to
+  // image num.
+  if (total_image_tokens == 0 && num_images > 0) {
+    std::string prefix;
+    for (size_t i = 0; i < num_images; ++i) {
+      prefix += IMG_TOKEN;
+    }
+    prompts[0] = prefix + prompts[0];
+    // Account for the tokens just inserted so the consistency check below
+    // passes on this intended fallback path instead of always aborting.
+    total_image_tokens = num_images;
+  }
+  ET_CHECK_MSG(
+      total_image_tokens == num_images,
+      "Number of %s tokens (%zu) does not match number of images (%zu).
Please check your prompts and image paths.", + IMG_TOKEN.c_str(), + total_image_tokens, + num_images); + + // Build messages and dispatch images + std::vector messages; + size_t img_idx = 0; + ET_LOG(Info, "Simulation multi-turn:"); + + for (size_t i = 0; i < prompts.size(); ++i) { + Message msg; + msg.id = i; + msg.text = prompts[i]; + + // Count image tokens in this prompt + size_t count = 0; + size_t pos = 0; + while ((pos = msg.text.find(IMG_TOKEN, pos)) != std::string::npos) { + count++; + pos += IMG_TOKEN.length(); + } + + // Assign corresponding images to this message + if (count > 0) { + for (size_t k = 0; k < count && img_idx < image_paths.size(); ++k) { + msg.files_path.emplace_back(image_paths[img_idx++]); + } + } + + // Log message info + std::string paths_str = "["; + for (size_t i = 0; i < msg.files_path.size(); ++i) { + paths_str += "'"; + paths_str += msg.files_path[i]; + paths_str += "'"; + if (i < msg.files_path.size() - 1) + paths_str += ", "; + } + paths_str += "]"; + ET_LOG( + Info, + "Turn-%zu: {id: %zu, text: \"%s\", files_path: %s}", + i, + i, + msg.text.c_str(), + paths_str.c_str()); + + messages.emplace_back(std::move(msg)); + } + + return messages; +} + +inline std::string apply_chat_template( + const std::string& prompt, + const std::string& system_prompt, + example::ModelVersion model_version) { + return std::visit( + [&](const auto& model) { + return apply_chat_template(system_prompt, prompt, model); + }, + model_version); +} diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.cpp index 91789f07a90..9304d2e4688 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.cpp @@ -7,6 +7,7 @@ */ #include +#include #include using executorch::aten::Tensor; @@ -18,12 +19,8 @@ using executorch::runtime::Result; namespace example { 
-EncoderRunner::EncoderRunner(const std::string& model_path) - : image_seq_len_(0) { - module_ = std::make_unique( - model_path, Module::LoadMode::MmapUseMlockIgnoreErrors); - ET_LOG(Info, "Creating encoder module: model_path=%s", model_path.c_str()); -} +EncoderRunner::EncoderRunner(executorch::extension::Module* module) + : module_(module) {} bool EncoderRunner::is_method_loaded() const { return module_->is_method_loaded(kEncoderForwardName); @@ -47,17 +44,9 @@ Error EncoderRunner::load() { return method_meta.error(); } - // vision embedding output shape: [1, seq_len, dim] - image_seq_len_ = method_meta->output_tensor_meta(0)->sizes()[1]; - ET_LOG(Info, "Encoder loaded successfully, image_seq_len=%d", image_seq_len_); - return Error::Ok; } -int32_t EncoderRunner::get_image_seq_len() const { - return image_seq_len_; -} - Result EncoderRunner::encode(TensorPtr& image_tensor) { ET_CHECK_MSG(is_method_loaded(), "Encoder method not loaded"); @@ -77,54 +66,4 @@ Result EncoderRunner::encode(TensorPtr& image_tensor) { return image_hidden_states; } -Result EncoderRunner::encode_from_file( - const std::string& image_file_path) { - ET_CHECK_MSG(is_method_loaded(), "Encoder method not loaded"); - - // Get input tensor metadata - Result method_meta = module_->method_meta(kEncoderForwardName); - auto sizes_span = method_meta->input_tensor_meta(0)->sizes(); - - // Calculate total number of elements - int64_t num_elem = 1; - for (const auto& size : sizes_span) { - num_elem *= size; - } - - // Read image data from file - ET_LOG( - Info, - "Reading image from file: %s, num_elements=%ld", - image_file_path.c_str(), - num_elem); - std::ifstream file(image_file_path, std::ios::binary | std::ios::ate); - ET_CHECK_MSG( - file.is_open(), "Failed to open image file: %s", image_file_path.c_str()); - - // To prevent users from passing images that have not been - // resized to match the encoder input size. 
- std::streamsize file_size = file.tellg(); - std::streamsize expected_size = num_elem * sizeof(float); - ET_CHECK_MSG( - file_size == expected_size, - "Image file size mismatch: expected %ld bytes but got %ld bytes (file: %s)", - expected_size, - file_size, - image_file_path.c_str()); - - file.seekg(0, std::ios::beg); - std::vector buffer(num_elem); - file.read(reinterpret_cast(buffer.data()), expected_size); - file.close(); - - // Create tensor from buffer - TensorPtr tensor = executorch::extension::from_blob( - buffer.data(), - std::vector(sizes_span.begin(), sizes_span.end()), - executorch::aten::ScalarType::Float); - - // Encode the tensor - return encode(tensor); -} - } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.h index 0e1becc05b6..e8c8a948877 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/encoder.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -28,7 +29,7 @@ class EncoderRunner { * @brief Constructor for EncoderRunner * @param model_path Path to the encoder model PTE file */ - explicit EncoderRunner(const std::string& model_path); + explicit EncoderRunner(executorch::extension::Module* module); /** * @brief Check if the encoder method is loaded @@ -56,18 +57,11 @@ class EncoderRunner { executorch::runtime::Result encode( executorch::extension::TensorPtr& image_tensor); - /** - * @brief Encode image from raw file - * @param image_file_path Path to raw image file - * @return Result containing the image hidden states tensor - */ - executorch::runtime::Result encode_from_file( - const std::string& image_file_path); - private: - std::unique_ptr module_; + executorch::extension::Module* module_; inline static const std::string kEncoderForwardName = "forward"; - int32_t image_seq_len_; + 
std::list> output_buffers_; + std::list output_tensors_; }; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_embedding_merger.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_embedding_merger.cpp new file mode 100644 index 00000000000..d45ce10a9af --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_embedding_merger.cpp @@ -0,0 +1,203 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace example { + +MultimodalEmbeddingMerger::MultimodalEmbeddingMerger(int32_t embedding_dim) + : embedding_dim_(embedding_dim) { + ET_CHECK_MSG(embedding_dim_ > 0, "Embedding dimension must be positive"); +} + +void MultimodalEmbeddingMerger::reset() { + text_embedding_buffers_.clear(); + text_embedding_token_counts_.clear(); + image_embedding_buffers_.clear(); + image_embedding_token_counts_.clear(); + total_tokens_ = 0; +} + +void MultimodalEmbeddingMerger::add_embeddings( + const executorch::aten::Tensor& embeddings, + const float* data, + EmbeddingType type) { + // shape: [1, num_tokens, embedding_dim] + ET_CHECK_MSG(embeddings.dim() == 3, "Embeddings must be a 3D tensor"); + + size_t batch_size = embeddings.sizes()[0]; + size_t num_tokens = embeddings.sizes()[1]; + size_t dim = embeddings.sizes()[2]; + + ET_CHECK_MSG(batch_size == 1, "Batch size must be 1"); + ET_CHECK_MSG( + dim == embedding_dim_, + "Embedding dimension mismatch: expected %zu, got %zu", + embedding_dim_, + dim); + + // Copy embedding data to prevent it from being overwritten + size_t num_elements = num_tokens * dim; + std::vector buffer(data, data + num_elements); + + std::string type_str = (type == EmbeddingType::kText) ? 
"text" : "image"; + if (type == EmbeddingType::kText) { + text_embedding_buffers_.emplace_back(std::move(buffer)); + text_embedding_token_counts_.push_back(num_tokens); + } else { + image_embedding_buffers_.emplace_back(std::move(buffer)); + image_embedding_token_counts_.push_back(num_tokens); + } + + ET_LOG( + Info, + "Added %s embeddings: num_tokens=%zu", + type_str.c_str(), + num_tokens); +} + +void MultimodalEmbeddingMerger::add_text_embeddings( + const TensorStruct& text_embeddings) { + ET_CHECK_MSG( + text_embeddings.tensor != nullptr, + "Text embeddings tensor cannot be null"); + ET_CHECK_MSG( + text_embeddings.data != nullptr, "Text embeddings data cannot be null"); + + executorch::aten::Tensor tensor_wrapper(text_embeddings.tensor.get()); + + add_embeddings(tensor_wrapper, text_embeddings.data, EmbeddingType::kText); +} + +void MultimodalEmbeddingMerger::add_image_embeddings( + const executorch::aten::Tensor& image_embeddings) { + add_embeddings( + image_embeddings, + image_embeddings.const_data_ptr(), + EmbeddingType::kImage); +} + +TensorStruct MultimodalEmbeddingMerger::merge( + const std::vector& input_ids, + uint64_t image_token_id) { + ET_CHECK_MSG(!input_ids.empty(), "input_ids cannot be empty"); + ET_CHECK_MSG( + !text_embedding_buffers_.empty(), + "No text embeddings added. 
Call add_text_embeddings() first."); + + // Final merged embeddings + std::vector merged_buffer; + std::vector sizes; + TensorStruct merged_embeddings; + + size_t num_placeholder_tokens = 0; + if (image_token_id != 0) { + for (uint64_t token_id : input_ids) { + if (token_id == image_token_id) { + num_placeholder_tokens++; + } + } + } + + ET_CHECK_MSG( + num_placeholder_tokens == image_embedding_buffers_.size(), + "Number of placeholder tokens (%zu) must match number of image embeddings (%zu)", + num_placeholder_tokens, + image_embedding_buffers_.size()); + + // Calculate total tokens: sum of all text tokens + all image tokens + for (int64_t count : text_embedding_token_counts_) { + total_tokens_ += count; + } + for (int64_t count : image_embedding_token_counts_) { + total_tokens_ += count; + } + total_tokens_ = total_tokens_ - num_placeholder_tokens; + + size_t total_elements = total_tokens_ * embedding_dim_; + merged_buffer.resize(total_elements); + + // Merge embeddings based on input_ids + size_t text_emb_idx = 0; // Which text embedding chunk in current turn + size_t text_token_idx = 0; // Token index within current text embedding chunk + size_t image_emb_idx = 0; // Which image embedding chunk in current turn + size_t output_offset = 0; // Output buffer offset + + for (int i = 0; i < input_ids.size(); i++) { + uint64_t token_id = input_ids[i]; + + if (image_token_id != 0 && token_id == image_token_id) { + // Insert entire image embedding + ET_CHECK_MSG( + image_emb_idx < image_embedding_buffers_.size(), + "Image index out of bounds"); + + const std::vector& image_buffer = + image_embedding_buffers_[image_emb_idx]; + int64_t num_image_tokens = image_embedding_token_counts_[image_emb_idx]; + + size_t num_elements = num_image_tokens * embedding_dim_; + std::memcpy( + merged_buffer.data() + output_offset, + image_buffer.data(), + num_elements * sizeof(float)); + + output_offset += num_elements; + image_emb_idx++; + text_token_idx++; // Skip this image placeholder 
token + } else { + // Insert one text token embedding + ET_CHECK_MSG( + text_emb_idx < text_embedding_buffers_.size(), + "Text embedding index out of bounds"); + + const std::vector& text_buffer = + text_embedding_buffers_[text_emb_idx]; + std::memcpy( + merged_buffer.data() + output_offset, + text_buffer.data() + text_token_idx * embedding_dim_, + embedding_dim_ * sizeof(float)); + + output_offset += embedding_dim_; + text_token_idx++; + } + } + + ET_CHECK_MSG( + image_emb_idx == image_embedding_buffers_.size(), + "Not all image embeddings were used: used %zu, expected %zu", + image_emb_idx, + image_embedding_buffers_.size()); + + // Setup tensor metadata + merged_embeddings.data = merged_buffer.data(); + merged_embeddings.size = total_elements * sizeof(float); + + // Setup sizes and dim_order: [1, total_tokens, embedding_dim] + sizes = {1, total_tokens_, embedding_dim_}; + + // Create TensorImpl + merged_embeddings.tensor = std::make_unique( + executorch::aten::ScalarType::Float, + sizes.size(), + sizes.data(), + merged_embeddings.data); + + ET_LOG( + Info, + "Merged embeddings: total_tokens=%d, text=%zu, images=%zu, embedding_dim=%d", + total_tokens_, + text_embedding_buffers_.size(), + image_embedding_buffers_.size(), + embedding_dim_); + + return merged_embeddings; +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_embedding_merger.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_embedding_merger.h new file mode 100644 index 00000000000..f545a80a354 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_embedding_merger.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace example { + +/** + * @class MultimodalEmbeddingMerger + * @brief Merges text and image embeddings based on token IDs + * + * This class collects text and image embeddings separately, then merges them + * based on input token IDs. When a placeholder token ID is encountered, + * it inserts the corresponding image embedding. Otherwise, it inserts the text + * embedding for that token position. + */ +enum class EmbeddingType { kText, kImage }; + +class MultimodalEmbeddingMerger { + public: + /** + * @brief Construct a new Multimodal Embedding Merger + * + * @param embedding_dim Expected embedding dimension for all inputs + */ + explicit MultimodalEmbeddingMerger(int32_t embedding_dim); + + /** + * @brief Reset the merger state for a new sequence + */ + void reset(); + + /** + * @brief Add text embeddings to the collection + * + * @param text_embeddings Text embedding tensor [1, num_tokens, embedding_dim] + */ + void add_text_embeddings(const TensorStruct& text_embeddings); + + /** + * @brief Add image embeddings to the collection + * + * @param image_embeddings Image embedding tensor [1, num_tokens, + * embedding_dim] + */ + void add_image_embeddings(const executorch::aten::Tensor& image_embeddings); + + /** + * @brief Merge collected embeddings based on input token IDs + * + * This method examines each token ID in input_ids. When it encounters + * placeholder_token_id, it inserts the next image embedding. Otherwise, + * it inserts the text embedding at the corresponding position. 
+ * + * @param input_ids Vector of token IDs (including placeholder tokens) + * @param image_token_id Token ID that represents image modality placeholder + * @return TensorStruct Merged embeddings [1, total_tokens, + * embedding_dim] + */ + TensorStruct merge( + const std::vector& input_ids, + uint64_t image_token_id); + + /** + * @brief Get the total number of tokens after merging + * @return int64_t Total token count + */ + inline size_t get_total_tokens() const { + return total_tokens_; + } + + private: + void add_embeddings( + const executorch::aten::Tensor& embeddings, + const float* data, + EmbeddingType type); + + // Expected embedding dimension + int32_t embedding_dim_; + + // Total tokens after merge + int32_t total_tokens_{0}; + + // Collected embeddings before merge + // Text embeddings are copied to prevent external modifications + std::vector> text_embedding_buffers_; + std::vector text_embedding_token_counts_; + + // Image embeddings are copied since they're temporary + std::vector> image_embedding_buffers_; + std::vector image_embedding_token_counts_; +}; + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp index fc6ec382cef..14a93104e1a 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp @@ -30,9 +30,9 @@ void MultimodalLhdTokenGenerator::prepare_io( std::vector tokens_to_process( input_tokens.begin(), input_tokens.begin() + num_tokens_to_process); - embedding_runner_->prefill(tokens_to_process); + tok_embedding_runner_->prefill(tokens_to_process); const TensorStruct& text_embeddings = - embedding_runner_->get_prompt_embeddings(); + tok_embedding_runner_->get_prompt_embeddings(); int64_t embedding_dim = 
text_embeddings.tensor->size(2); // Copy embedding to input buffer from the left diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h index 2c9e54b49d2..83da9e7a6ba 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h @@ -35,7 +35,7 @@ class MultimodalLhdTokenGenerator }; MultimodalLhdTokenGenerator( tokenizers::Tokenizer* tokenizer, - EmbeddingProcessor* embedding_runner, + TokenEmbeddingProcessor* embedding_runner, DecoderRunner* decoder_runner, KVManager* kv_manager, const std::string& forward_name, @@ -60,7 +60,7 @@ class MultimodalLhdTokenGenerator metadata.cache_mode, metadata.embedding_dim}, stats), - embedding_runner_(embedding_runner), + tok_embedding_runner_(embedding_runner), metadata_(metadata), lhd_branch_(metadata.ngram - 1, std::vector(metadata.window)), lhd_branch_prev_(metadata.window), @@ -123,7 +123,7 @@ class MultimodalLhdTokenGenerator void update_ngrams_pool(); // Additional members specific to multimodal - EmbeddingProcessor* embedding_runner_; + TokenEmbeddingProcessor* tok_embedding_runner_; struct NgramData { bool active = false; diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp index 9ddfa5e78f6..2859e16a42a 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp @@ -266,15 +266,6 @@ Result MultimodalPromptProcessor::prefill( prepare_io(prompt_embedding, num_prompt_tokens, prompt_pos, pos); // Run inference - for (int layer = 0; layer < 
metadata_.num_layers; ++layer) { - std::vector> k_cache_ptrs = kv_manager_->get_k_cache_(); - T* k_cache_data = k_cache_ptrs[layer].buffer; - } - for (int layer = 0; layer < metadata_.num_layers; ++layer) { - std::vector> v_cache_ptrs = kv_manager_->get_v_cache_(); - T* v_cache_data = v_cache_ptrs[layer].buffer; - } - decoder_runner_->step(method_name_, inputs_); if (dump_logits) { prompt_all_logits_.insert( diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h index 51ed0b829ee..fcfc07c9590 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h @@ -7,7 +7,7 @@ */ #pragma once -#include +#include #include namespace example { diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp index a7ced9c138d..0ac38308bfe 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp @@ -7,10 +7,12 @@ */ // A llama 3.2 runner that includes preprocessing and post processing -// logic. The module takes in a string as input and emits a string as output. +// logic. The text_decoder takes in a string as input and emits a string as +// output. 
#include #include +#include #include #include #include @@ -31,6 +33,8 @@ using executorch::aten::Tensor; using executorch::extension::Module; using executorch::extension::llm::get_rss_bytes; +using executorch::extension::llm::Image; +using executorch::extension::llm::MultimodalInput; using executorch::extension::llm::print_report; using executorch::extension::llm::Stats; using executorch::extension::llm::time_in_ms; @@ -89,11 +93,11 @@ void save_logits( } // namespace template -MultimodalRunner::MultimodalRunner( - std::unique_ptr module, - std::unique_ptr embedding_module, - const std::string& decoder_model_version, - const std::string& model_path, +QNNMultimodalRunner::QNNMultimodalRunner( + std::unique_ptr encoder, + std::unique_ptr tok_embedding, + std::unique_ptr text_decoder, + const std::string& model_version, const std::string& tokenizer_path, const std::string& dump_logits_path, const std::string& performance_output_path, @@ -102,11 +106,22 @@ MultimodalRunner::MultimodalRunner( const bool shared_buffer, const int ngram, const int window, - const int gcap, - std::unique_ptr image_hidden_states) - : module_(std::move(module)), - embedding_module_(std::move(embedding_module)), - image_hidden_states_(std::move(image_hidden_states)), + const int gcap) + // TODO: The arguments for the MultimodalRunner base class are currently + // unused in this runner implementation. QNNMultimodalRunner will be + // refactored in the future to align with MultimodalRunner's usage. 
+ : MultimodalRunner( + {}, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + std::make_unique()), + encoder_(std::move(encoder)), + tok_embedding_(std::move(tok_embedding)), + text_decoder_(std::move(text_decoder)), ngram_(ngram), window_(window), gcap_(gcap), @@ -118,39 +133,38 @@ MultimodalRunner::MultimodalRunner( shared_buffer_(shared_buffer) { stats_.reset(); - if (decoder_model_version == "smolvlm") { - decoder_model_version_ = MultimodalDecoderModelVersion::kSmolvlm; - } else if (decoder_model_version == "internvl3") { - decoder_model_version_ = MultimodalDecoderModelVersion::kInternvl3; + if (model_version == "smolvlm") { + model_version_ = VisionLanguageModel::kSmolvlm; + } else if (model_version == "internvl3") { + model_version_ = VisionLanguageModel::kInternvl3; } else { ET_CHECK_MSG(false, "Unsupported Decoder Model"); } - ET_LOG(Info, "creating module: model_path=%s", model_path.c_str()); ET_LOG(Info, "creating runner: tokenizer_path=%s", tokenizer_path_.c_str()); ET_LOG(Info, "eval mode=%d", eval_mode_); } template -bool MultimodalRunner::is_loaded() const { - return module_->is_loaded() && embedding_module_->is_loaded() && tokenizer_ && +bool QNNMultimodalRunner::is_loaded() const { + return encoder_->is_loaded() && tok_embedding_->is_loaded() && + text_decoder_->is_loaded() && embedding_merger_ && tokenizer_ && decoder_runner_ && prompt_processor_ && token_generator_ && kv_manager_ && buffer_manager_; } template -Error MultimodalRunner::load() { +Error QNNMultimodalRunner::load() { if (is_loaded()) { return Error::Ok; } - - std::string prompt_embedding_method_name, token_embedding_method_name; + std::string prompt_embedding_method_name, tok_embedding_method_name; std::string token_generator_method_name, prompt_processor_method_name; std::vector method_names; switch (eval_mode_) { case EvalMode::kKVCached: prompt_embedding_method_name = "tok_embedding_kv_forward"; - token_embedding_method_name = "tok_embedding_kv_forward"; + 
tok_embedding_method_name = "tok_embedding_kv_forward"; prompt_processor_method_name = "kv_forward"; token_generator_method_name = "kv_forward"; method_names.emplace_back(prompt_processor_method_name); @@ -159,7 +173,7 @@ Error MultimodalRunner::load() { case EvalMode::kHybrid: case EvalMode::kLookaheadDecoding: prompt_embedding_method_name = "tok_embedding_prefill_forward"; - token_embedding_method_name = "tok_embedding_kv_forward"; + tok_embedding_method_name = "tok_embedding_kv_forward"; prompt_processor_method_name = "prefill_forward"; token_generator_method_name = "kv_forward"; method_names.emplace_back(prompt_processor_method_name); @@ -183,29 +197,29 @@ Error MultimodalRunner::load() { } eos_ids->insert(tokenizer_->eos_tok()); } - if (decoder_model_version_ == MultimodalDecoderModelVersion::kSmolvlm) { - eos_ids->insert(tokenizer_->encode("", 0, 0).get()[0]); - } else if ( - decoder_model_version_ == MultimodalDecoderModelVersion::kInternvl3) { - eos_ids->insert(tokenizer_->encode("<|im_end|>", 0, 0).get()[0]); + if (const auto* vlm = std::get_if(&model_version_)) { + if (*vlm == VisionLanguageModel::kSmolvlm) { + eos_ids->insert(tokenizer_->encode("", 0, 0).get()[0]); + } else if (*vlm == VisionLanguageModel::kInternvl3) { + eos_ids->insert(tokenizer_->encode("<|im_end|>", 0, 0).get()[0]); + } } - // Try avoid getMetadataHelper as it is time consuming. Result method_meta = - module_->method_meta(token_generator_method_name); + text_decoder_->method_meta(token_generator_method_name); // For some tokenizer.json, runtime vocab_size might be different, use output // shape to get vocab size. 
int32_t vocab_size = method_meta->output_tensor_meta(0)->sizes()[2]; - decoder_runner_ = - std::make_unique(module_.get(), vocab_size, temperature_); + decoder_runner_ = std::make_unique( + text_decoder_.get(), vocab_size, temperature_); ET_CHECK_OK_OR_RETURN_ERROR(decoder_runner_->load(method_names)); ET_LOG(Info, "Reading metadata from model"); // retrieve any method meta, can be either prefill or kv int64_t num_layers = - ET_UNWRAP(module_->get("get_n_layers")).toScalar().to(); + ET_UNWRAP(text_decoder_->get("get_n_layers")).toScalar().to(); ET_CHECK_MSG(num_layers != -1, "Could not retrieve num layers"); // k_cache: [1, n_heads, head_dim, seq_len] @@ -213,12 +227,12 @@ Error MultimodalRunner::load() { int64_t num_heads = k_cache_shape[1]; int64_t head_dim = k_cache_shape[2]; - // TODO: filter shape hidden_state: [1, ar_len, dim] - int64_t dim = embedding_module_->method_meta(token_embedding_method_name) + // hidden_state: [1, ar_len, dim] + int64_t dim = tok_embedding_->method_meta(tok_embedding_method_name) ->output_tensor_meta(0) ->sizes()[2]; bool use_int64_token = - embedding_module_->method_meta(token_embedding_method_name) + tok_embedding_->method_meta(tok_embedding_method_name) ->input_tensor_meta(0) ->scalar_type() == executorch::aten::ScalarType::Long; @@ -238,7 +252,7 @@ Error MultimodalRunner::load() { eval_mode_ == EvalMode::kHybrid || eval_mode_ == EvalMode::kLookaheadDecoding) { auto atten_mask_meta_prompt = - module_->method_meta(prompt_processor_method_name) + text_decoder_->method_meta(prompt_processor_method_name) ->input_tensor_meta(1); prompt_processor_ar_len = atten_mask_meta_prompt->sizes()[1]; } @@ -249,15 +263,19 @@ Error MultimodalRunner::load() { std::min(token_generator_ar_len, prompt_processor_ar_len); max_ar_len = std::max(token_generator_ar_len, prompt_processor_ar_len); - embedding_runner_ = - std::make_unique(embedding_module_.get()); - ET_CHECK_OK_OR_RETURN_ERROR(embedding_runner_->load( - {prompt_embedding_method_name, 
token_embedding_method_name})); - // Initialize EmbeddingProcessor - embedding_processor_ = std::make_unique( - embedding_runner_.get(), + // Initialize Encoder + encoder_runner_ = std::make_unique(encoder_.get()); + ET_CHECK_OK_OR_RETURN_ERROR(encoder_runner_->load()); + + // Initialize TokenEmbeddingProcessor + tok_embedding_runner_ = + std::make_unique(tok_embedding_.get()); + ET_CHECK_OK_OR_RETURN_ERROR(tok_embedding_runner_->load( + {prompt_embedding_method_name, tok_embedding_method_name})); + tok_embedding_processor_ = std::make_unique( + tok_embedding_runner_.get(), prompt_embedding_method_name, - EmbeddingProcessor::Metadata{ + TokenEmbeddingProcessor::Metadata{ context_len_, prompt_processor_ar_len, vocab_size, @@ -268,8 +286,9 @@ Error MultimodalRunner::load() { // This is used to configure the attention mask for models with window // attention int32_t sliding_window = context_len_; - if (module_->method_names()->count("get_sliding_window") > 0) { - sliding_window = ET_UNWRAP(module_->get("get_sliding_window")).toInt(); + if (text_decoder_->method_names()->count("get_sliding_window") > 0) { + sliding_window = + ET_UNWRAP(text_decoder_->get("get_sliding_window")).toInt(); } kv_manager_ = std::make_unique>(typename KVManager::Metadata{ context_len_, @@ -295,10 +314,10 @@ Error MultimodalRunner::load() { static_cast(dim)}); // Initialize EmbeddingGenerator - embedding_generator_ = std::make_unique( - embedding_runner_.get(), - token_embedding_method_name, - EmbeddingProcessor::Metadata{ + tok_embedding_generator_ = std::make_unique( + tok_embedding_runner_.get(), + tok_embedding_method_name, + TokenEmbeddingProcessor::Metadata{ context_len_, token_generator_ar_len, vocab_size, @@ -308,7 +327,7 @@ Error MultimodalRunner::load() { // Initialize TokenGenerator token_generator_ = std::make_unique>( tokenizer_.get(), - embedding_generator_.get(), + tok_embedding_generator_.get(), decoder_runner_.get(), kv_manager_.get(), token_generator_method_name, @@ -330,7 
+349,7 @@ Error MultimodalRunner::load() { } else { token_generator_ = std::make_unique>( tokenizer_.get(), - embedding_generator_.get(), + tok_embedding_generator_.get(), decoder_runner_.get(), kv_manager_.get(), token_generator_method_name, @@ -354,8 +373,8 @@ Error MultimodalRunner::load() { kv_manager_->total_cache_size_in_bytes(), prompt_processor_->total_prompt_processor_io_size_in_bytes(), token_generator_->total_token_generator_io_size_in_bytes(), - embedding_processor_->total_embedding_processor_io_size_in_bytes(), - embedding_generator_->total_embedding_processor_io_size_in_bytes()); + tok_embedding_processor_->total_embedding_processor_io_size_in_bytes(), + tok_embedding_generator_->total_embedding_processor_io_size_in_bytes()); } ET_LOG(Info, "creating io_memory"); @@ -363,44 +382,50 @@ Error MultimodalRunner::load() { kv_manager_->init_cache(buffer_manager_.get(), prompt_processor_ar_len); prompt_processor_->init_io( buffer_manager_.get(), - module_->method_meta(prompt_processor_method_name)); + text_decoder_->method_meta(prompt_processor_method_name)); token_generator_->init_io( - buffer_manager_.get(), module_->method_meta(token_generator_method_name)); + buffer_manager_.get(), + text_decoder_->method_meta(token_generator_method_name)); // Prepare io for embedding - embedding_processor_->init_io( + tok_embedding_processor_->init_io( buffer_manager_.get(), - embedding_module_->method_meta(prompt_embedding_method_name)); - embedding_generator_->init_io( + tok_embedding_->method_meta(prompt_embedding_method_name)); + tok_embedding_generator_->init_io( buffer_manager_.get(), - embedding_module_->method_meta(token_embedding_method_name)); - return Error::Ok; -} + tok_embedding_->method_meta(tok_embedding_method_name)); -template -Error MultimodalRunner::generate( - const std::string& prompt, - const llm::GenerationConfig& config, - std::function token_callback, - std::function stats_callback) { - return generate_from_prompt_or_file( - prompt, false, 
config, token_callback, stats_callback); + // Get image token ID from text_decoder + if (modality_of(model_version_) == Modality::kVision) { + ET_CHECK_MSG( + text_decoder_->method_names()->count("image_token_id") > 0, + "Vision model is missing the required 'image_token_id' in metadata."); + image_token_id_ = ET_UNWRAP(text_decoder_->get("image_token_id")).toInt(); + ET_LOG( + Info, + "Image placeholder token ID for vision modality loaded: %zu", + image_token_id_); + } + + // Initialize embedding merger + embedding_merger_ = + std::make_unique(static_cast(dim)); + + return Error::Ok; } template -Error MultimodalRunner::generate_from_prompt_or_file( - const std::string& prompt, - bool tokenized_prompt, +executorch::runtime::Error QNNMultimodalRunner::generate( + const std::vector& inputs, const llm::GenerationConfig& config, std::function token_callback, std::function stats_callback) { - ET_CHECK_MSG(!prompt.empty(), "prompt cannot be null"); + ET_CHECK_MSG(!inputs.empty(), "inputs cannot be empty"); if (!is_loaded()) { stats_.model_load_start_ms = time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(load()); stats_.model_load_end_ms = time_in_ms(); } stats_.inference_start_ms = time_in_ms(); - int32_t seq_len = config.seq_len; if (seq_len > context_len_) { ET_LOG( @@ -421,62 +446,67 @@ Error MultimodalRunner::generate_from_prompt_or_file( // For multimodal, we will disable n_bos int32_t n_bos = 0; - // encode the (string) prompt into tokens sequence + std::string prompt; std::vector prompt_tokens; - if (tokenized_prompt) { - std::ifstream inFile(prompt, std::ios::binary); - if (inFile.is_open()) { - // Get file size - inFile.seekg(0, std::ios::end); - size_t fileSize = inFile.tellg(); - inFile.seekg(0, std::ios::beg); - - // Resize vector and read raw data - prompt_tokens.resize(fileSize / sizeof(uint64_t)); - - inFile.read(reinterpret_cast(prompt_tokens.data()), fileSize); - inFile.close(); + bool dump_logits = !dump_logits_path_.empty(); + + // Reset merger for new 
generation + embedding_merger_->reset(); + + // Process each input and add embeddings to merger + for (const MultimodalInput& input : inputs) { + if (input.is_text()) { + std::string text = input.get_text(); + prompt += text; + + tokenizers::Result> encode_res = + tokenizer_->encode(text, n_bos, 0); + ET_CHECK_TK_OK_OR_RETURN_ERROR( + encode_res.error(), "failed to encode prompt %s", text.c_str()); + + std::vector tokens = encode_res.get(); + tok_embedding_processor_->prefill(tokens); + const TensorStruct& text_embeddings = + tok_embedding_processor_->get_prompt_embeddings(); + + // Add text embeddings to merger + embedding_merger_->add_text_embeddings(text_embeddings); + + prompt_tokens.insert(prompt_tokens.end(), tokens.begin(), tokens.end()); + + } else if (input.is_image()) { + const Image& image = input.get_image(); + auto image_tensor_res = image.toTensor(/*with_batch*/ true); + executorch::extension::TensorPtr image_tensor_ptr = + image_tensor_res.get(); + + auto encode_res = encoder_runner_->encode(image_tensor_ptr); + executorch::aten::Tensor image_embeddings_tensor = encode_res.get(); + + // Add image embeddings to merger + embedding_merger_->add_image_embeddings(image_embeddings_tensor); + } else { - ET_CHECK_MSG( - false, - "Unable to read tokenized prompt from file: %s", - prompt.c_str()); + ET_CHECK_MSG(false, "Unsupported input data type"); } - } else { - tokenizers::Result> encode_res = - tokenizer_->encode(prompt, n_bos, 0); - ET_CHECK_TK_OK_OR_RETURN_ERROR( - encode_res.error(), "failed to encode prompt %s", prompt.c_str()); - prompt_tokens = encode_res.get(); } - int num_prompt_tokens = prompt_tokens.size(); + + // Fuse embeddings by placeholder_token_id from model + TensorStruct merged_embeddings = + embedding_merger_->merge(prompt_tokens, image_token_id_); + int num_prompt_tokens = embedding_merger_->get_total_tokens(); + ET_CHECK_MSG(num_prompt_tokens >= 1, "Expected at least 1 prompt token"); ET_CHECK_MSG( cur_pos_ + num_prompt_tokens < 
seq_len, "sequence length exceeded - please increase the seq_len value"); - // Prompt Processor first if (token_callback && config.echo) { token_callback(prompt); } - bool dump_logits = dump_logits_path_.empty() ? false : true; - embedding_processor_->prefill(prompt_tokens); - const TensorStruct& text_embeddings = - embedding_processor_->get_prompt_embeddings(); - int64_t embedding_dim = text_embeddings.tensor->size(2); - - uint64_t placeholder_token_id = 0; - if (module_->method_names()->count("modality_placeholder_token_id") > 0) { - placeholder_token_id = - module_->get("modality_placeholder_token_id")->toInt(); - } - - ET_LOG(Info, "Merging text embeddings with image hidden states"); - merge_multimodal_embeddings( - prompt_tokens, text_embeddings, placeholder_token_id); auto prefill_res = prompt_processor_->prefill( - merged_embeddings_, cur_pos_, dump_logits, nullptr); + merged_embeddings, cur_pos_, dump_logits, nullptr); ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); uint64_t cur_token = prefill_res.get(); cur_pos_ += num_prompt_tokens; @@ -526,85 +556,25 @@ Error MultimodalRunner::generate_from_prompt_or_file( } template -void MultimodalRunner::merge_multimodal_embeddings( - const std::vector& input_ids, - const TensorStruct& text_embeddings, - uint64_t placeholder_token_id) { - // This implements the modality_inputs_merger logic from decoder_utils.py - // Find positions where placeholder tokens appear - std::vector placeholder_positions; - for (size_t i = 0; i < input_ids.size(); ++i) { - if (input_ids[i] == placeholder_token_id) { - placeholder_positions.push_back(i); - } - } - - int64_t embedding_dim; - int64_t num_tokens = input_ids.size(); - if (text_embeddings.tensor) { - embedding_dim = text_embeddings.tensor->size(2); - num_tokens = text_embeddings.tensor->size(1); - } else { - ET_CHECK_MSG( - false, - "text_embeddings.tensor is null; cannot determine embedding dim during multimodal embedding merge"); - } - - // Allocate new buffer for merged 
embeddings - size_t total_elements = num_tokens * embedding_dim; - multimodal_embeddings_buffer_.resize(total_elements); - - // First, copy all text embeddings to the new buffer - std::memcpy( - multimodal_embeddings_buffer_.data(), - text_embeddings.data, - total_elements * sizeof(float)); - - // Then replace placeholder positions with image hidden states - auto* image_data = image_hidden_states_->const_data_ptr(); - auto* merged_data = multimodal_embeddings_buffer_.data(); - - int64_t image_seq_len = image_hidden_states_->size(1); - - // Copy image hidden states to placeholder positions - for (int32_t i = 0; i < placeholder_positions.size(); ++i) { - int32_t pos = placeholder_positions[i]; - std::memcpy( - merged_data + pos * embedding_dim, - image_data + i * embedding_dim, - embedding_dim * sizeof(float)); +Result QNNMultimodalRunner::get_model_version() { + if (!is_loaded()) { + stats_.model_load_start_ms = time_in_ms(); + ET_CHECK_OK_OR_RETURN_ERROR(load()); + stats_.model_load_end_ms = time_in_ms(); } - - merged_embeddings_.data = multimodal_embeddings_buffer_.data(); - merged_embeddings_.size = total_elements * sizeof(float); - - // Create TensorImpl with proper shape [1, num_tokens, embedding_dim] - multimodal_embeddings_sizes_ = { - 1, static_cast(num_tokens), static_cast(embedding_dim)}; - multimodal_embeddings_dim_order_ = {0, 1, 2}; - merged_embeddings_.tensor = std::make_unique( - executorch::aten::ScalarType::Float, - multimodal_embeddings_sizes_.size(), - multimodal_embeddings_sizes_.data(), - merged_embeddings_.data, - multimodal_embeddings_dim_order_.data()); - - ET_LOG(Info, "Multimodal embeddings merged successfully"); + return model_version_; } template -Result -MultimodalRunner::get_decoder_model_version() { +Result QNNMultimodalRunner::get_encoder_method_meta() { if (!is_loaded()) { - stats_.model_load_start_ms = time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(load()); - stats_.model_load_end_ms = time_in_ms(); } - return decoder_model_version_; + 
return encoder_->method_meta(kEncoderForwardName); } // Explicit instantiations -template class MultimodalRunner; -template class MultimodalRunner; +template class QNNMultimodalRunner; +template class QNNMultimodalRunner; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h index 0d56f52341f..4bf58c10339 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h @@ -14,41 +14,70 @@ #include #include #include +#include #include #include #include #include -#include -#include +#include +#include #include #include +#include +#include +#include #include +#include +#include #include #include #include namespace example { -// Extend DecoderModelVersion enum with multimodal models -enum MultimodalDecoderModelVersion { +enum class Modality { + kAudio = 0, + kVision, +}; + +enum class VisionLanguageModel { kSmolvlm = 0, kInternvl3, }; +// TODO: Add audio models when they are supported +enum class AudioLanguageModel {}; + +using ModelVersion = std::variant; + +constexpr Modality modality_of(const VisionLanguageModel& vlm) { + return Modality::kVision; +} + +constexpr Modality modality_of(const AudioLanguageModel& alm) { + return Modality::kAudio; +} + +inline Modality modality_of(const ModelVersion& model_version) { + return std::visit( + [](const auto& model) { return modality_of(model); }, model_version); +} + enum KvBitWidth { kWidth8 = 8, kWidth16 = 16, }; template -class MultimodalRunner : public executorch::extension::llm::IRunner { +class QNNMultimodalRunner + : public executorch::extension::llm::MultimodalRunner { public: - explicit MultimodalRunner( - std::unique_ptr module, - std::unique_ptr embedding_module, - const std::string& decoder_model, - const std::string& model_path, + explicit 
QNNMultimodalRunner( + std::unique_ptr encoder, + std::unique_ptr tok_embedding, + std::unique_ptr text_decoder, + const std::string& model_version, const std::string& tokenizer_path, const std::string& performance_output_path, const std::string& dump_logits_path, @@ -57,37 +86,21 @@ class MultimodalRunner : public executorch::extension::llm::IRunner { const bool shared_buffer = false, const int ngram = 0, const int window = 0, - const int gcap = 0, - std::unique_ptr image_hidden_states = nullptr); + const int gcap = 0); bool is_loaded() const override; executorch::runtime::Error load() override; - // Override generate to support multimodal inputs executorch::runtime::Error generate( - const std::string& prompt, + const std::vector& inputs, const executorch::extension::llm::GenerationConfig& config, std::function token_callback = {}, std::function stats_callback = {}) override; - // Multimodal-specific generation with image embeddings - executorch::runtime::Error generate_from_prompt_or_file( - const std::string& prompt, - bool tokenized_prompt, - const executorch::extension::llm::GenerationConfig& config, - std::function token_callback = {}, - std::function stats_callback = {}); - void stop() override {}; - void reset() override {}; - executorch::runtime::Result - get_decoder_model_version(); - - // Multimodal-specific method for merging embeddings - void merge_multimodal_embeddings( - const std::vector& input_ids, - const TensorStruct& text_embeddings, - uint64_t placeholder_token_id); + executorch::runtime::Result get_model_version(); + executorch::runtime::Result + get_encoder_method_meta(); private: enum EvalMode { @@ -98,8 +111,11 @@ class MultimodalRunner : public executorch::extension::llm::IRunner { }; // Modules - std::unique_ptr module_; - std::unique_ptr embedding_module_; + std::unique_ptr encoder_; + std::unique_ptr tok_embedding_; + std::unique_ptr text_decoder_; + + inline static const std::string kEncoderForwardName = "forward"; int32_t 
context_len_{0}; @@ -119,27 +135,23 @@ class MultimodalRunner : public executorch::extension::llm::IRunner { EvalMode eval_mode_; bool shared_buffer_; - MultimodalDecoderModelVersion decoder_model_version_; + ModelVersion model_version_; std::unique_ptr buffer_manager_; std::unique_ptr> kv_manager_; std::unique_ptr tokenizer_; std::unique_ptr decoder_runner_; std::unique_ptr> prompt_processor_; std::unique_ptr> token_generator_; - std::unique_ptr embedding_runner_; - std::unique_ptr embedding_processor_; - std::unique_ptr embedding_generator_; - - // Image hidden states storage - std::unique_ptr image_hidden_states_; - - // Multimodal embeddings storage - std::vector multimodal_embeddings_buffer_; - std::vector - multimodal_embeddings_sizes_; - std::vector - multimodal_embeddings_dim_order_; - TensorStruct merged_embeddings_; + std::unique_ptr encoder_runner_; + std::unique_ptr tok_embedding_runner_; + std::unique_ptr tok_embedding_processor_; + std::unique_ptr tok_embedding_generator_; + std::unique_ptr embedding_merger_; + + // Placeholder token ID for image inputs. This value will be set from the + // model's metadata. A default of 0 indicates that the vision modality is not + // supported. 
+ uint64_t image_token_id_{0}; // stats executorch::llm::Stats stats_; diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp index 89b8614d407..2ed8ae51f1d 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp @@ -18,7 +18,7 @@ namespace example { template MultimodalTokenGenerator::MultimodalTokenGenerator( tokenizers::Tokenizer* tokenizer, - EmbeddingProcessor* embedding_runner, + TokenEmbeddingProcessor* tok_embedding_runner, DecoderRunner* decoder_runner, KVManager* kv_manager, const std::string& method_name, @@ -40,7 +40,7 @@ MultimodalTokenGenerator::MultimodalTokenGenerator( metadata.sliding_window, metadata.cache_mode}, stats), - embedding_runner_(embedding_runner), + tok_embedding_runner_(tok_embedding_runner), metadata_(metadata) { // Set input_toks_.size to 0 since we use embeddings instead input_toks_.size = 0; @@ -195,9 +195,9 @@ void MultimodalTokenGenerator::prepare_io( uint64_t cur_token, int64_t start_pos) { // Generate embedding for current token using embedding runner - embedding_runner_->prefill({cur_token}); + tok_embedding_runner_->prefill({cur_token}); const TensorStruct& text_embeddings = - embedding_runner_->get_prompt_embeddings(); + tok_embedding_runner_->get_prompt_embeddings(); int64_t embedding_dim = text_embeddings.tensor->size(2); // Copy embedding to input buffer std::memcpy( diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h index b010bf3748e..9eb9c79aaa4 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h +++ 
b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h @@ -7,7 +7,7 @@ */ #pragma once -#include +#include #include namespace example { @@ -34,7 +34,7 @@ class MultimodalTokenGenerator : public example::TokenGenerator { // Constructor with embedding generator support MultimodalTokenGenerator( tokenizers::Tokenizer* tokenizer, - EmbeddingProcessor* embedding_runner, + TokenEmbeddingProcessor* tok_embedding_runner, DecoderRunner* decoder_runner, KVManager* kv_manager, const std::string& method_name, @@ -86,7 +86,7 @@ class MultimodalTokenGenerator : public example::TokenGenerator { using TokenGenerator::v_cache_out_; // Additional members specific to multimodal - EmbeddingProcessor* embedding_runner_; + TokenEmbeddingProcessor* tok_embedding_runner_; /** * @brief Fill in I/O buffers with prompt token and position. diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_processor.cpp similarity index 86% rename from examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.cpp rename to examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_processor.cpp index 1278a1df7d9..d0566941b06 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_processor.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include +#include #include #include @@ -19,11 +19,11 @@ using executorch::runtime::TensorInfo; namespace example { -EmbeddingProcessor::EmbeddingProcessor( - EmbeddingRunner* embedding_runner, +TokenEmbeddingProcessor::TokenEmbeddingProcessor( + TokenEmbeddingRunner* tok_embedding_runner, const std::string& method_name, Metadata metadata) - : embedding_runner_(embedding_runner), + : tok_embedding_runner_(tok_embedding_runner), method_name_(method_name), metadata_(metadata) { input_toks_.size = metadata_.ar_len * sizeof(int64_t); @@ -31,7 +31,7 @@ EmbeddingProcessor::EmbeddingProcessor( prompt_embeddings_.size = 0; // Will be set in prefill() } -void EmbeddingProcessor::init_io( +void TokenEmbeddingProcessor::init_io( IMemAlloc* buffer_manager, Result method_meta) { input_tensors_.reserve(method_meta->num_inputs()); @@ -73,7 +73,7 @@ void EmbeddingProcessor::init_io( } } -void EmbeddingProcessor::update_prompt_embedding( +void TokenEmbeddingProcessor::update_prompt_embedding( int32_t num_prompt_tokens, int64_t prompt_pos) { for (int i = 0; i < metadata_.ar_len; i++) { @@ -86,7 +86,8 @@ void EmbeddingProcessor::update_prompt_embedding( } } -void EmbeddingProcessor::prefill(const std::vector& prompt_tokens) { +void TokenEmbeddingProcessor::prefill( + const std::vector& prompt_tokens) { int64_t prompt_pos = 0; int32_t num_prompt_tokens = prompt_tokens.size(); prompt_embeddings_.size = @@ -99,19 +100,18 @@ void EmbeddingProcessor::prefill(const std::vector& prompt_tokens) { // Create TensorImpl for prompt_embeddings_ with shape [1, num_prompt_tokens, // dim] Store sizes and dim_order as member variables to keep them // alive - prompt_embeddings_sizes_ = {1, num_prompt_tokens, metadata_.embedding_dim}; - prompt_embeddings_dim_order_ = {0, 1, 2}; + std::vector sizes = { + 1, num_prompt_tokens, metadata_.embedding_dim}; prompt_embeddings_.tensor = std::make_unique( executorch::aten::ScalarType::Float, - prompt_embeddings_sizes_.size(), - 
prompt_embeddings_sizes_.data(), - prompt_embeddings_.data, - prompt_embeddings_dim_order_.data()); + sizes.size(), + sizes.data(), + prompt_embeddings_.data); int num_iters = 1 + ((num_prompt_tokens - 1) / metadata_.ar_len); ET_CHECK_MSG( - embedding_runner_->set_outputs(method_name_, output_tensors_) == + tok_embedding_runner_->set_outputs(method_name_, output_tensors_) == executorch::runtime::Error::Ok, "Failed to set output tensor for module %s", method_name_.c_str()); @@ -119,7 +119,7 @@ void EmbeddingProcessor::prefill(const std::vector& prompt_tokens) { for (int32_t i = 0; i < num_iters; ++i) { prepare_io(prompt_tokens, prompt_pos); - embedding_runner_->step(method_name_, inputs_); + tok_embedding_runner_->step(method_name_, inputs_); // Update prompt_embedding update_prompt_embedding(num_prompt_tokens, prompt_pos); @@ -128,7 +128,7 @@ void EmbeddingProcessor::prefill(const std::vector& prompt_tokens) { } } -void EmbeddingProcessor::prepare_io( +void TokenEmbeddingProcessor::prepare_io( const std::vector& prompt_tokens, int64_t prompt_pos) { for (int i = 0; i < metadata_.ar_len; i++) { diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_processor.h similarity index 87% rename from examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.h rename to examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_processor.h index 0ece8bf2d03..f5dee69bf3a 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_processor.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_processor.h @@ -9,18 +9,18 @@ #pragma once #include #include -#include +#include #include #include #include namespace example { /** - * @class EmbeddingProcessor + * @class TokenEmbeddingProcessor * @brief Class for processing prompts to generate embeddings using embedding * runner. 
*/ -class EmbeddingProcessor { +class TokenEmbeddingProcessor { public: struct Metadata { int32_t context_len; @@ -30,8 +30,8 @@ class EmbeddingProcessor { int32_t embedding_dim; }; - EmbeddingProcessor( - EmbeddingRunner* embedding_runner, + TokenEmbeddingProcessor( + TokenEmbeddingRunner* tok_embedding_runner, const std::string& method_name, Metadata metadata); @@ -75,7 +75,7 @@ class EmbeddingProcessor { const std::vector& prompt_tokens, int64_t prompt_pos); - EmbeddingRunner* embedding_runner_; + TokenEmbeddingRunner* tok_embedding_runner_; std::string method_name_; // metadata @@ -86,9 +86,6 @@ class EmbeddingProcessor { TensorStruct embeddings_; TensorStruct prompt_embeddings_; std::vector prompt_embeddings_buffer_; - std::vector prompt_embeddings_sizes_; - std::vector - prompt_embeddings_dim_order_; std::vector inputs_; std::vector input_tensors_; diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_runner.cpp similarity index 82% rename from examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.cpp rename to examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_runner.cpp index bf1008e34b1..cd8a521062f 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_runner.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree.
*/ -#include +#include #include using executorch::aten::Tensor; @@ -17,9 +17,9 @@ using executorch::runtime::Result; namespace example { -EmbeddingRunner::EmbeddingRunner(Module* module) : module_(module) {} +TokenEmbeddingRunner::TokenEmbeddingRunner(Module* module) : module_(module) {} -Result EmbeddingRunner::step( +Result TokenEmbeddingRunner::step( const std::string& method_name, std::vector& inputs) { // Execute embedding module @@ -35,7 +35,7 @@ Result EmbeddingRunner::step( return outputs_res.get()[0].toTensor(); } -Error EmbeddingRunner::set_outputs( +Error TokenEmbeddingRunner::set_outputs( const std::string& method_name, std::vector output_values) { for (size_t i = 0; i < output_values.size(); ++i) { @@ -45,7 +45,7 @@ Error EmbeddingRunner::set_outputs( return Error::Ok; } -Error EmbeddingRunner::load(const std::vector& method_names) { +Error TokenEmbeddingRunner::load(const std::vector& method_names) { if (is_method_loaded(method_names)) { return Error::Ok; } @@ -55,7 +55,7 @@ Error EmbeddingRunner::load(const std::vector& method_names) { return Error::Ok; } -bool EmbeddingRunner::is_method_loaded( +bool TokenEmbeddingRunner::is_method_loaded( const std::vector& method_names) { bool method_loaded = true; for (const std::string& method_name : method_names) { @@ -64,7 +64,7 @@ bool EmbeddingRunner::is_method_loaded( return method_loaded; } -bool EmbeddingRunner::is_loaded() const { +bool TokenEmbeddingRunner::is_loaded() const { return module_ != nullptr && module_->is_loaded(); } diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_runner.h similarity index 93% rename from examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.h rename to examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_runner.h index d5155a45252..dc6951395bf 100644 --- 
a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/embedding_runner.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/tok_embedding_runner.h @@ -18,12 +18,12 @@ namespace example { /** - * @class EmbeddingRunner + * @class TokenEmbeddingRunner * @brief Class for running embedding module, similar to DecoderRunner */ -class EmbeddingRunner { +class TokenEmbeddingRunner { public: - EmbeddingRunner(executorch::extension::Module* module); + TokenEmbeddingRunner(executorch::extension::Module* module); /** * Run embedding module with inputs to generate embeddings. diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/utils.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/utils.h new file mode 100644 index 00000000000..4b16cf646cb --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/utils.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +using ::executorch::aten::ScalarType; +using ::executorch::extension::llm::Image; +using ::executorch::extension::llm::MultimodalInput; + +namespace example { + +inline std::vector load_raw_files( + const std::string& input_list_file_path) { + std::vector input_files; + + std::ifstream input_list(input_list_file_path); + ET_CHECK_MSG( + input_list.is_open(), + "Failed to open input list file: %s", + input_list_file_path.c_str()); + + auto split = [](std::string s, std::string delimiter) { + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + std::string token; + std::vector res; + + while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) { + token = s.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + res.push_back(token); + } + res.push_back(s.substr(pos_start)); + return res; + }; + + std::string file_path_line; + while (std::getline(input_list, file_path_line)) { + if (!file_path_line.empty() && file_path_line.back() == '\r') { + file_path_line.pop_back(); + } + if (file_path_line.empty()) { + continue; + } + + auto line_files = split(file_path_line, " "); + if (line_files.empty()) { + continue; + } + + input_files.insert(input_files.end(), line_files.begin(), line_files.end()); + } + return input_files; +} + +inline void load_image( + const std::string& image_path, + Image& image, + const std::vector& expected_size, + const ScalarType& expected_dtype) { + const size_t n = expected_size.size(); + ET_CHECK_MSG(n >= 3, "expected dim should at least be 3, but got %zu", n); + const int32_t channels = expected_size[n - 3]; + const int32_t height = expected_size[n - 2]; + const int32_t width = expected_size[n - 1]; + + size_t num_elems = std::accumulate( + expected_size.begin(), + expected_size.end(), + size_t{1}, + std::multiplies()); + + std::streamsize expected_length = num_elems *
sizeof(float); + + std::ifstream file(image_path, std::ios::binary | std::ios::ate); + ET_CHECK_MSG( + file.is_open(), "Failed to open input file: %s", image_path.c_str()); + + std::streamsize file_size = file.tellg(); + ET_CHECK_MSG( + file_size == expected_length, + "Input image size mismatch. file bytes: %ld, expected bytes: %zu (file: " + "%s)", + file_size, + expected_length, + image_path.c_str()); + file.seekg(0, std::ios::beg); + std::vector buffer(num_elems); + file.read(reinterpret_cast(buffer.data()), expected_length); + file.close(); + + image = Image(std::move(buffer), width, height, channels); + ET_LOG( + Info, + "image Channels: %" PRId32 ", Height: %" PRId32 ", Width: %" PRId32, + image.channels(), + image.height(), + image.width()); +} + +} // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/vision_chat_template.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/vision_chat_template.h new file mode 100644 index 00000000000..283080f9935 --- /dev/null +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/vision_chat_template.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) Qualcomm Innovation Center, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include +#include + +const std::string IMG_TOKEN = ""; + +/** + * Special tokens structure for vision modality + */ +struct SpecialTokens { + std::string image_token; + std::string global_img; + std::string fake_wrap_start; + std::string fake_wrap_end; +}; + +/** + * Get special tokens based on model version + */ +inline SpecialTokens get_special_tokens( + example::VisionLanguageModel model_version) { + SpecialTokens tokens; + + switch (model_version) { + case example::VisionLanguageModel::kSmolvlm: + tokens.image_token = ""; + tokens.global_img = ""; + tokens.fake_wrap_start = ""; + tokens.fake_wrap_end = ""; + break; + case example::VisionLanguageModel::kInternvl3: + tokens.image_token = ""; + tokens.global_img = ""; + tokens.fake_wrap_start = ""; + tokens.fake_wrap_end = ""; + break; + default: + break; + } + + return tokens; +} + +/** + * Expand image tokens in prompt with model-specific wrapping tokens + * Replaces each token with the full format including special wrapper + * tokens + */ +inline std::string expand_image_tokens( + const std::string& prompt, + const SpecialTokens& specials) { + // Create image prompt with repeated image tokens + std::string image_prompt = specials.fake_wrap_start; + image_prompt += specials.global_img; + image_prompt += specials.image_token; + image_prompt += specials.fake_wrap_end; + + // Replace single image token with expanded version + size_t pos = 0; + std::string expanded = prompt; + while ((pos = expanded.find(IMG_TOKEN, pos)) != std::string::npos) { + expanded.replace(pos, IMG_TOKEN.size(), image_prompt); + pos += image_prompt.size(); + } + ET_LOG(Info, "Prompt after expanding image token: %s", expanded.c_str()); + + return expanded; +} + +/** + * Format prompt based on model version with multimodal token expansion + */ +inline std::string apply_chat_template( + const std::string& system_prompt, + const std::string& prompt, + example::VisionLanguageModel model_version) { + 
std::string formatted_prompt; + SpecialTokens specials = get_special_tokens(model_version); + + switch (model_version) { + case example::VisionLanguageModel::kSmolvlm: { + if (!system_prompt.empty()) { + formatted_prompt.append( + "<|start_header_id|>system<|end_header_id|>\n\n"); + formatted_prompt.append(system_prompt); + formatted_prompt.append("<|eot_id|>"); + } + formatted_prompt.append("<|im_start|>User:"); + formatted_prompt.append(expand_image_tokens(prompt, specials)); + formatted_prompt.append("\nAssistant:"); + break; + } + case example::VisionLanguageModel::kInternvl3: { + if (!system_prompt.empty()) { + formatted_prompt.append("<|im_start|>system\n"); + formatted_prompt.append(system_prompt); + formatted_prompt.append("<|im_end|>\n"); + } + formatted_prompt.append("<|im_start|>user:\n"); + formatted_prompt.append(expand_image_tokens(prompt, specials)); + formatted_prompt.append("<|im_end|>assistant\n"); + break; + } + default: + ET_CHECK_MSG(false, "unsupported VLM version"); + break; + } + return formatted_prompt; +} diff --git a/examples/qualcomm/oss_scripts/llama/tokenizer.py b/examples/qualcomm/oss_scripts/llama/tokenizer.py index b55cd61d616..3befa71168b 100644 --- a/examples/qualcomm/oss_scripts/llama/tokenizer.py +++ b/examples/qualcomm/oss_scripts/llama/tokenizer.py @@ -7,7 +7,9 @@ import argparse import json import logging -from typing import Callable +import re +import warnings +from typing import Callable, List from executorch.examples.qualcomm.oss_scripts.llama import LLMModelConfig from executorch.examples.qualcomm.oss_scripts.llama.decoder_constants import ( @@ -18,16 +20,19 @@ from transformers import AutoTokenizer +IMG_TOKEN = "" +AUDIO_TOKEN = "