From f4466bae65f0890e7556102b7729e3aaff1c15bf Mon Sep 17 00:00:00 2001
From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>
Date: Tue, 3 Dec 2024 05:33:10 -0500
Subject: [PATCH] [V1] VLM - preprocessor hashing

Signed-off-by: Roger Wang
Co-authored-by: Michael Goin
Co-authored-by: Roger Wang
Signed-off-by: Alexander Matveev
---
 examples/offline_inference_vision_language.py | 126 ++++++++++++--
 vllm/config.py                                |  10 +-
 vllm/engine/arg_utils.py                      |   8 +
 vllm/v1/engine/__init__.py                    |   3 +-
 vllm/v1/engine/core.py                        |  13 +-
 vllm/v1/engine/mm_input_mapper.py             | 158 ++++++++++++++++--
 vllm/v1/engine/processor.py                   |  35 ++--
 vllm/v1/utils.py                              |  21 +++
 8 files changed, 326 insertions(+), 48 deletions(-)

diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index c6a274ee5894b..5e210126dc8fe 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -5,6 +5,8 @@
 For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
+import random
+
 from transformers import AutoTokenizer

 from vllm import LLM, SamplingParams
@@ -23,7 +25,9 @@ def run_llava(question: str, modality: str):

     prompt = f"USER: <image>\n{question}\nASSISTANT:"

-    llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096)
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf",
+              max_model_len=4096,
+              mm_cache_preprocessor=args.mm_cache_preprocessor)
     stop_token_ids = None
     return llm, prompt, stop_token_ids

@@ -33,7 +37,9 @@ def run_llava_next(question: str, modality: str):
     assert modality == "image"

     prompt = f"[INST] <image>\n{question} [/INST]"
-    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
+    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
+              max_model_len=8192,
+              mm_cache_preprocessor=args.mm_cache_preprocessor)
     stop_token_ids = None
     return llm, prompt, stop_token_ids

@@ -44,7 +50,9 @@ def run_llava_next_video(question: str, modality: str):
     assert modality == "video"

     prompt = f"USER:
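Note: the excerpt above only shows the example script gaining an opt-in mm_cache_preprocessor flag; per the diffstat, the core of the change lives in vllm/v1/engine/mm_input_mapper.py, which is not included in this excerpt. The following is a minimal sketch of the general idea behind preprocessor hashing: hash the raw multimodal input and reuse the cached preprocessor output when the same input is seen again. It is written under stated assumptions and is not vLLM's implementation; HashedPreprocessorCache, fake_preprocess, and the pickle-based hashing are invented for illustration.

# Illustrative sketch only, not vLLM's MMInputMapper. It shows content-hash
# keyed caching of expensive multimodal preprocessing, the technique an
# opt-in flag like `mm_cache_preprocessor=True` would enable.
import hashlib
import pickle
from collections import OrderedDict
from typing import Any, Callable


class HashedPreprocessorCache:
    """LRU cache of preprocessor outputs keyed by a hash of the raw input."""

    def __init__(self, preprocess: Callable[[Any], Any], capacity: int = 256):
        self.preprocess = preprocess
        self.capacity = capacity
        self._cache: "OrderedDict[str, Any]" = OrderedDict()

    @staticmethod
    def _hash_input(raw: Any) -> str:
        # Hash a stable serialization of the raw input (e.g. image bytes).
        # pickle is used here only for brevity.
        return hashlib.sha256(pickle.dumps(raw)).hexdigest()

    def __call__(self, raw: Any) -> Any:
        key = self._hash_input(raw)
        if key in self._cache:
            self._cache.move_to_end(key)  # mark as most recently used
            return self._cache[key]
        out = self.preprocess(raw)  # the expensive preprocessing call
        self._cache[key] = out
        if len(self._cache) > self.capacity:
            self._cache.popitem(last=False)  # evict least recently used
        return out


# Usage sketch: identical inputs are preprocessed only once.
if __name__ == "__main__":
    calls = {"n": 0}

    def fake_preprocess(image_bytes: bytes) -> dict:
        calls["n"] += 1
        return {"num_bytes": len(image_bytes)}

    cached = HashedPreprocessorCache(fake_preprocess)
    img = b"\x89PNG fake image bytes"
    cached(img)
    cached(img)  # second call hits the cache
    assert calls["n"] == 1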