diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index 55e000fc88..2ac134f440 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -5,9 +5,8 @@ from PIL import Image from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline -from lmdeploy.vl import load_image +from lmdeploy.vl import encode_image_base64, load_image from lmdeploy.vl.constants import IMAGE_TOKEN -from lmdeploy.vl.utils import encode_image_base64 gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10) diff --git a/docs/en/multi_modal/internvl.md b/docs/en/multi_modal/internvl.md index aeab4fe67b..a592ba224e 100644 --- a/docs/en/multi_modal/internvl.md +++ b/docs/en/multi_modal/internvl.md @@ -116,7 +116,7 @@ import numpy as np from lmdeploy import pipeline, GenerationConfig from decord import VideoReader, cpu from lmdeploy.vl.constants import IMAGE_TOKEN -from lmdeploy.vl.utils import encode_image_base64 +from lmdeploy.vl import encode_image_base64 from PIL import Image pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO') diff --git a/docs/en/multi_modal/minicpmv.md b/docs/en/multi_modal/minicpmv.md index 15774de7e7..0f2bf176b9 100644 --- a/docs/en/multi_modal/minicpmv.md +++ b/docs/en/multi_modal/minicpmv.md @@ -97,7 +97,7 @@ print(out.text) ```python from lmdeploy import pipeline, GenerationConfig -from lmdeploy.vl.utils import encode_image_base64 +from lmdeploy.vl import encode_image_base64 import torch from PIL import Image from transformers import AutoModel, AutoTokenizer diff --git a/docs/en/multi_modal/qwen2_5_vl.md b/docs/en/multi_modal/qwen2_5_vl.md index 75c98c5a68..ac2ffa2ce6 100644 --- a/docs/en/multi_modal/qwen2_5_vl.md +++ b/docs/en/multi_modal/qwen2_5_vl.md @@ -99,7 +99,7 @@ import numpy as np from lmdeploy import pipeline, GenerationConfig from decord import VideoReader, cpu from lmdeploy.vl.constants import IMAGE_TOKEN -from lmdeploy.vl.utils import 
encode_image_base64 +from lmdeploy.vl import encode_image_base64 from PIL import Image pipe = pipeline('Qwen/Qwen2.5-VL-7B-Instruct', log_level='INFO') diff --git a/docs/zh_cn/multi_modal/internvl.md b/docs/zh_cn/multi_modal/internvl.md index e6a37c0ac0..3207550d82 100644 --- a/docs/zh_cn/multi_modal/internvl.md +++ b/docs/zh_cn/multi_modal/internvl.md @@ -116,7 +116,7 @@ import numpy as np from lmdeploy import pipeline, GenerationConfig from decord import VideoReader, cpu from lmdeploy.vl.constants import IMAGE_TOKEN -from lmdeploy.vl.utils import encode_image_base64 +from lmdeploy.vl import encode_image_base64 from PIL import Image pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO') diff --git a/docs/zh_cn/multi_modal/minicpmv.md b/docs/zh_cn/multi_modal/minicpmv.md index b605bc1fcc..eb2a168cdb 100644 --- a/docs/zh_cn/multi_modal/minicpmv.md +++ b/docs/zh_cn/multi_modal/minicpmv.md @@ -97,7 +97,7 @@ print(out.text) ```python from lmdeploy import pipeline, GenerationConfig -from lmdeploy.vl.utils import encode_image_base64 +from lmdeploy.vl import encode_image_base64 import torch from PIL import Image from transformers import AutoModel, AutoTokenizer diff --git a/docs/zh_cn/multi_modal/qwen2_5_vl.md b/docs/zh_cn/multi_modal/qwen2_5_vl.md index f258a783a0..2b1d81c0a4 100644 --- a/docs/zh_cn/multi_modal/qwen2_5_vl.md +++ b/docs/zh_cn/multi_modal/qwen2_5_vl.md @@ -99,7 +99,7 @@ import numpy as np from lmdeploy import pipeline, GenerationConfig from decord import VideoReader, cpu from lmdeploy.vl.constants import IMAGE_TOKEN -from lmdeploy.vl.utils import encode_image_base64 +from lmdeploy.vl import encode_image_base64 from PIL import Image pipe = pipeline('Qwen/Qwen2.5-VL-7B-Instruct', log_level='INFO') diff --git a/lmdeploy/pytorch/model_inputs.py b/lmdeploy/pytorch/model_inputs.py index 538b9c6f3a..2bf93bb530 100644 --- a/lmdeploy/pytorch/model_inputs.py +++ b/lmdeploy/pytorch/model_inputs.py @@ -11,7 +11,7 @@ import lmdeploy.pytorch.distributed as dist 
from lmdeploy.pytorch.backends import get_backend from lmdeploy.pytorch.config import CacheConfig, DLLMConfig, ModelConfig, QuantizationConfig -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.utils import CtxMgrBase, singleton if TYPE_CHECKING: @@ -66,7 +66,7 @@ class VisionModelInputs: input_embeddings: List[List[torch.Tensor]] = None input_embedding_ranges: List[torch.LongTensor] = None input_embedding_indexing: torch.BoolTensor = None - input_multimodals: List[MultiModalTensor] = None + input_multimodals: List[MultiModalData] = None def to_device(self, device: str, non_blocking: bool = False): """To device.""" @@ -255,7 +255,7 @@ class StepContext: local_adapter_ids: torch.LongTensor = None input_embeddings: torch.Tensor = None input_embedding_indexing: torch.Tensor = None - input_multimodals: List[MultiModalTensor] = None + input_multimodals: List[MultiModalData] = None vision_inputs: VisionModelInputs = None attn_metadata: Any = None kv_quant_policy: Literal[0, 4, 8] = 0 diff --git a/lmdeploy/pytorch/models/chatglm2.py b/lmdeploy/pytorch/models/chatglm2.py index 56e3169bb7..b690217564 100644 --- a/lmdeploy/pytorch/models/chatglm2.py +++ b/lmdeploy/pytorch/models/chatglm2.py @@ -9,7 +9,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding, build_rotary_params) from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_down_linear, build_gateup_linear, build_o_proj, @@ -866,10 +866,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data 
= MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/cogvlm.py b/lmdeploy/pytorch/models/cogvlm.py index ad8adc9739..9b8d0b5472 100644 --- a/lmdeploy/pytorch/models/cogvlm.py +++ b/lmdeploy/pytorch/models/cogvlm.py @@ -12,7 +12,7 @@ from lmdeploy.pytorch.distributed import get_tp_world_rank from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear) @@ -901,10 +901,10 @@ def preprocess_input(self, input_ids: List[int], input_multimodals=None, **kwarg if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/deepseek_vl2.py b/lmdeploy/pytorch/models/deepseek_vl2.py index 290b9a4fc0..b778c6ebeb 100644 --- a/lmdeploy/pytorch/models/deepseek_vl2.py +++ b/lmdeploy/pytorch/models/deepseek_vl2.py @@ -11,7 +11,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs 
import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .deepseek_v2 import DeepseekV2ForCausalLM @@ -440,13 +440,13 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict( - image_token_id=image_token_id, - images_spatial_crop=images_spatial_crop, - )) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict( + image_token_id=image_token_id, + images_spatial_crop=images_spatial_crop, + )) input_imgs.append(mm_data) diff --git a/lmdeploy/pytorch/models/gemma3_vl.py b/lmdeploy/pytorch/models/gemma3_vl.py index 8f4ea8e972..cff9615df2 100644 --- a/lmdeploy/pytorch/models/gemma3_vl.py +++ b/lmdeploy/pytorch/models/gemma3_vl.py @@ -8,7 +8,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .patch import build_model_from_hf_config @@ -108,10 +108,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/glm4_1v.py b/lmdeploy/pytorch/models/glm4_1v.py index 9b89164bef..8332940247 
100644 --- a/lmdeploy/pytorch/models/glm4_1v.py +++ b/lmdeploy/pytorch/models/glm4_1v.py @@ -11,7 +11,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import ApplyRotaryEmb, FlashAttention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config from lmdeploy.pytorch.nn.linear import build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight @@ -865,10 +865,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=start, - end=start + num_pad, - meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=start, + end=start + num_pad, + meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/interns1_pro.py b/lmdeploy/pytorch/models/interns1_pro.py index 51ed9deaf6..773d9e3964 100644 --- a/lmdeploy/pytorch/models/interns1_pro.py +++ b/lmdeploy/pytorch/models/interns1_pro.py @@ -7,8 +7,9 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight +from lmdeploy.vl.constants import Modality from .interns1_pro_ts import InternS1ProTimeSeriesModel from .patch import add_prefix, get_build_model_context @@ -173,25 +174,28 @@ def 
prepare_inputs_for_generation( ts_sr = None ts_mask = None if context.input_multimodals is not None: - mm_data = [input_mm.get('image', []) for input_mm in context.input_multimodals] + mm_inputs = [input_mm.get('mm_data', []) for input_mm in context.input_multimodals] # flatten batch - mm_data = [data for im_data in mm_data for data in im_data] + mm_inputs = [item for sublist in mm_inputs for item in sublist] - if len(mm_data) > 0: - is_time_series = mm_data[0].meta.get('ts_token_id', False) + if len(mm_inputs) > 0: + modality = mm_inputs[0].modality + image_token_id = mm_inputs[0].meta.get('image_token_id') + video_token_id = mm_inputs[0].meta.get('video_token_id') + ts_token_id = mm_inputs[0].meta.get('ts_token_id') - if is_time_series: - ts_values = mm_data - ts_token_id = ts_values[0].meta['ts_token_id'] - ts_lens = ts_values[0].meta['ts_lens'] - ts_sr = ts_values[0].meta['ts_sr'] + if modality == Modality.TIME_SERIES: + ts_values = torch.cat([inp.data for inp in mm_inputs]) ts_mask = input_ids == ts_token_id - ts_values = torch.cat([data.data for data in ts_values]) + + ts_lens = mm_inputs[0].meta['ts_lens'] + ts_sr = mm_inputs[0].meta['ts_sr'] else: - pixel_values = torch.cat([data.data for data in mm_data]) - image_token_id = mm_data[0].meta['image_token_id'] - image_mask = input_ids == image_token_id - grid_thw = torch.cat([data.meta['grid_thw'] for data in mm_data]).cpu() + pixel_values = torch.cat([inp.data for inp in mm_inputs]) + mm_token_id = image_token_id if modality == Modality.IMAGE else video_token_id + image_mask = (input_ids == mm_token_id) + + grid_thw = torch.cat([data.meta['grid_thw'] for data in mm_inputs]).cpu() vis_pos_emb = self.visual.rot_pos_emb(grid_thw) pos_embeds = self.visual.fast_pos_embed_interpolate(grid_thw) vis_cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], @@ -365,6 +369,63 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype) -> None: self.config = config self.dtype = dtype + def 
_make_image_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData: + """Make image MultiModalData.""" + pixel_values = input_mm['pixel_values'].to(self.dtype) + image_grid_thw = input_mm['image_grid_thw'] + offset = input_mm['offset'] + start = offset + image_token_id = input_mm['image_token_id'] + num_pad = input_mm['image_tokens'] + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalData(modality=Modality.IMAGE, + data=pixel_values, + start=start, + end=start + num_pad, + meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) + return mm_data + + def _make_video_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData: + """Make video MultiModalData.""" + pixel_values_videos = input_mm['pixel_values_videos'].to(self.dtype) + video_grid_thw = input_mm['video_grid_thw'] + offset = input_mm['offset'] + start = offset + video_token_id = input_mm['video_token_id'] + num_pad = input_mm['video_tokens'] + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalData(modality=Modality.VIDEO, + data=pixel_values_videos, + start=start, + end=start + num_pad, + meta=dict( + grid_thw=video_grid_thw, + video_token_id=video_token_id, + )) + return mm_data + + def _make_time_series_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData: + """Make time series MultiModalData.""" + ts_values = input_mm['ts_values'].to(self.dtype) + offset = input_mm['offset'] + ts_token_id = input_mm['ts_token_id'] + ts_lens = input_mm['ts_lens'] + ts_sr = input_mm['ts_sr'] + num_pad = input_mm['ts_tokens'] + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalData(modality=Modality.TIME_SERIES, + data=ts_values, + start=offset, + end=offset + num_pad, + meta=dict(ts_lens=ts_lens, ts_sr=ts_sr, ts_token_id=ts_token_id)) + return mm_data + def preprocess_input(self, input_ids: List[int], input_multimodals: List[Dict[str, Any]] = None, @@ -373,38 +434,17 @@ def 
preprocess_input(self, if input_multimodals is None or len(input_multimodals) == 0: return input_ids, input_multimodals - input_imgs = [] + input_mm_data = [] for input_mm in input_multimodals: - if 'ts_values' in input_mm: - ts_values = input_mm['ts_values'].to(self.dtype) - offset = input_mm['offset'] - ts_token_id = input_mm['ts_token_id'] - ts_lens = input_mm['ts_lens'] - ts_sr = input_mm['ts_sr'] - num_pad = input_mm['num_ts_tokens'] - - mm_data = MultiModalTensor(data=ts_values, - start=offset, - end=offset + num_pad, - meta=dict(ts_token_id=ts_token_id, ts_lens=ts_lens, ts_sr=ts_sr)) - else: - pixel_values = input_mm['pixel_values'].to(self.dtype) - image_grid_thw = input_mm['image_grid_thw'] - offset = input_mm['offset'] - start = offset - image_token_id = input_mm['image_token_id'] - num_pad = input_mm['image_tokens'] - if isinstance(num_pad, torch.Tensor): - num_pad = num_pad.item() - - mm_data = MultiModalTensor(data=pixel_values, - start=start, - end=start + num_pad, - meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) - input_imgs.append(mm_data) - - result = PreprocessInputResult( - input_ids=input_ids, - input_multimodals=dict(image=input_imgs), - ) + modality = input_mm.get('modality') + if modality == Modality.IMAGE: + mm_data = self._make_image_mm_data(input_mm) + elif modality == Modality.VIDEO: + mm_data = self._make_video_mm_data(input_mm) + elif modality == Modality.TIME_SERIES: + mm_data = self._make_time_series_mm_data(input_mm) + input_mm_data.append(mm_data) + + result = PreprocessInputResult(input_ids=input_ids, input_multimodals=dict(mm_data=input_mm_data)) + return result diff --git a/lmdeploy/pytorch/models/interns1_pro_ts.py b/lmdeploy/pytorch/models/interns1_pro_ts.py index 48ba00fcef..6b400febbd 100644 --- a/lmdeploy/pytorch/models/interns1_pro_ts.py +++ b/lmdeploy/pytorch/models/interns1_pro_ts.py @@ -154,7 +154,7 @@ def __init__(self, d_model, max_len=20000, dtype: torch.dtype = None, device: to div_term = 
torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model)) pe[:, 0::2] = torch.sin(position * div_term) pe[:, 1::2] = torch.cos(position * div_term) - # TODO: zhouxinyu, hf forces float32 during init, but becomes bf16 during forward + # hf forces float32 during init, but becomes bf16 during forward pe = pe.unsqueeze(0).transpose(0, 1).to(dtype=dtype, device=device) # (max_len, 1, d_model) self.register_buffer('pe', pe, persistent=True) diff --git a/lmdeploy/pytorch/models/internvl.py b/lmdeploy/pytorch/models/internvl.py index 5b6c261dd2..43b80644f6 100644 --- a/lmdeploy/pytorch/models/internvl.py +++ b/lmdeploy/pytorch/models/internvl.py @@ -12,7 +12,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.models.utils.micro_batch import enable_micro_batch, split_batch -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import LayerNorm, RMSNorm from lmdeploy.pytorch.nn.linear import build_colwise_linear, build_o_proj, build_qkv_proj, build_rowwise_linear from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight @@ -992,10 +992,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/internvl3_hf.py b/lmdeploy/pytorch/models/internvl3_hf.py index 7cd4cd940c..4ea2eb2f45 100644 --- a/lmdeploy/pytorch/models/internvl3_hf.py +++ b/lmdeploy/pytorch/models/internvl3_hf.py @@ -13,7 +13,7 @@ 
from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.models.utils.micro_batch import enable_micro_batch, split_batch -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import LayerNorm, RMSNorm from lmdeploy.pytorch.nn.linear import build_colwise_linear, build_o_proj, build_qkv_proj, build_rowwise_linear from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight @@ -736,10 +736,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/llama4.py b/lmdeploy/pytorch/models/llama4.py index 4b3c2196bc..e7711b83d3 100644 --- a/lmdeploy/pytorch/models/llama4.py +++ b/lmdeploy/pytorch/models/llama4.py @@ -8,7 +8,7 @@ import lmdeploy.pytorch.distributed as dist from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear) @@ -1033,10 +1033,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = 
MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/llava.py b/lmdeploy/pytorch/models/llava.py index e87242df4c..4004441050 100644 --- a/lmdeploy/pytorch/models/llava.py +++ b/lmdeploy/pytorch/models/llava.py @@ -11,7 +11,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn.linear import build_colwise_linear, build_qkv_proj, build_rowwise_linear from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight @@ -555,10 +555,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( @@ -834,10 +834,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_sizes=image_sizes, image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_sizes=image_sizes, image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/phi3_v.py b/lmdeploy/pytorch/models/phi3_v.py 
index c6804d5586..aff3f78935 100644 --- a/lmdeploy/pytorch/models/phi3_v.py +++ b/lmdeploy/pytorch/models/phi3_v.py @@ -8,7 +8,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .phi3 import Phi3ForCausalLM, Phi3Model @@ -379,10 +379,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_sizes=image_sizes, image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_sizes=image_sizes, image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/qwen2_5_vl.py b/lmdeploy/pytorch/models/qwen2_5_vl.py index 9c19a7de21..0458d01c76 100644 --- a/lmdeploy/pytorch/models/qwen2_5_vl.py +++ b/lmdeploy/pytorch/models/qwen2_5_vl.py @@ -12,7 +12,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.models.qwen2_vl import Qwen2Model -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import ApplyRotaryEmb, FlashAttention, RMSNorm, SiluAndMul from lmdeploy.pytorch.nn.linear import build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight @@ -687,9 +687,6 @@ def get_input_processor(self) -> BaseModelInputProcessor: return 
self.input_processor -InputMultiModalType = List[Dict[str, Any]] - - class Qwen2_5_VLInputProcessor(BaseModelInputProcessor): """Qwen2 input processor.""" @@ -715,10 +712,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=start, - end=start + num_pad, - meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=start, + end=start + num_pad, + meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/qwen2_vl.py b/lmdeploy/pytorch/models/qwen2_vl.py index 605f8ded76..888d350aa2 100644 --- a/lmdeploy/pytorch/models/qwen2_vl.py +++ b/lmdeploy/pytorch/models/qwen2_vl.py @@ -8,7 +8,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, FlashAttention, LayerNorm, RMSNorm, SiluAndMul, build_rotary_embedding_from_config) from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_qkv_proj, @@ -890,9 +890,6 @@ def get_input_processor(self) -> BaseModelInputProcessor: return self.input_processor -InputMultiModalType = List[Dict[str, Any]] - - class Qwen2VLInputProcessor(BaseModelInputProcessor): """Qwen2 input processor.""" @@ -918,10 +915,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=start, - end=start + num_pad, - meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=start, + end=start + 
num_pad, + meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/qwen3_5.py b/lmdeploy/pytorch/models/qwen3_5.py index 59f373f075..8e3d51a385 100644 --- a/lmdeploy/pytorch/models/qwen3_5.py +++ b/lmdeploy/pytorch/models/qwen3_5.py @@ -20,11 +20,12 @@ build_rowwise_linear) from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters from lmdeploy.pytorch.weight_loader.model_weight_loader import default_weight_loader, load_weight +from lmdeploy.vl.constants import Modality from .patch import add_prefix from .qwen2_5_vl import Qwen2_5_VisionRotaryEmbedding as Qwen3_5VisionRotaryEmbedding -from .qwen2_5_vl import Qwen2_5_VLInputProcessor as Qwen3_5InputProcessor from .qwen2_5_vl import Qwen2_5_VLVisionAttention as Qwen3_5VisionAttention +from .qwen3_vl import Qwen3VLInputProcessor as Qwen3_5InputProcessor from .utils.cudagraph import CudaGraphMeta, CudaGraphMixin from .utils.model import DeployModelMixinV1, vlm_model @@ -1144,14 +1145,20 @@ def prepare_inputs_for_generation( grid_thw = None pos_embeds = None if context.input_multimodals is not None: - image_data = [input_mm.get('image', []) for input_mm in context.input_multimodals] - if len(image_data) > 0: - # flatten batch - image_data = [data for im_data in image_data for data in im_data] - pixel_values = torch.cat([data.data for data in image_data]) - image_token_id = image_data[0].meta['image_token_id'] - image_mask = input_ids == image_token_id - grid_thw = torch.cat([data.meta['grid_thw'] for data in image_data]).cpu() + mm_inputs = [input_mm.get('mm_data', []) for input_mm in context.input_multimodals] + # flatten batch + mm_inputs = [item for sublist in mm_inputs for item in sublist] + + if len(mm_inputs) > 0: + modality = mm_inputs[0].modality + pixel_values = torch.cat([inp.data for inp in mm_inputs]) + + image_token_id = mm_inputs[0].meta.get('image_token_id') + video_token_id = 
mm_inputs[0].meta.get('video_token_id') + mm_token_id = image_token_id if modality == Modality.IMAGE else video_token_id + image_mask = (input_ids == mm_token_id) + + grid_thw = torch.cat([data.meta['grid_thw'] for data in mm_inputs]).cpu() vis_pos_emb = self.model.visual.rot_pos_emb(grid_thw) pos_embeds = self.model.visual.fast_pos_embed_interpolate(grid_thw) vis_cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], @@ -1341,9 +1348,10 @@ def _update_model_meta_prefilling(self, context: StepContext): mrope_position_ids = [] new_model_metas = [] for pos_ids, model_meta, input_mm in zip(batched_pos_ids, model_metas, input_multimodals): - images = [] + mm_data_list = [] if input_mm is not None: - images = input_mm.get('image', []) + mm_data_list.extend(input_mm.get('mm_data', [])) + if model_meta is None or 'mrope_delta' not in model_meta: mrope_delta = 0 else: @@ -1352,19 +1360,63 @@ def _update_model_meta_prefilling(self, context: StepContext): pos_start = pos_ids[0].item() mrope_pos_ids = pos_ids + mrope_delta mrope_pos_ids = mrope_pos_ids[None].expand(3, -1).clone() - for img in images: - grid_thw = img.meta['grid_thw'][0].tolist() - _, h, w = grid_thw - h //= 2 - w //= 2 - num_pad = img.end - img.start - max(h, w) - mrope_delta -= num_pad - fill_start = img.start - pos_start - fill_end = img.end - pos_start - img_pos_ids = self._get_multimodal_pos_ids(grid_thw, pos_ids.device) - img_pos_ids += mrope_pos_ids[:, fill_start:fill_start + 1] - mrope_pos_ids[:, fill_end:] -= num_pad - mrope_pos_ids[:, fill_start:fill_end] = img_pos_ids + + for mm_data in mm_data_list: + if mm_data.modality == Modality.IMAGE: + grid_thw = mm_data.meta['grid_thw'][0].tolist() + _, h, w = grid_thw + h //= 2 + w //= 2 + num_pad = mm_data.end - mm_data.start - max(h, w) + mrope_delta -= num_pad + fill_start = mm_data.start - pos_start + fill_end = mm_data.end - pos_start + img_pos_ids = self._get_multimodal_pos_ids(grid_thw, pos_ids.device) + img_pos_ids += mrope_pos_ids[:, 
fill_start:fill_start + 1] + mrope_pos_ids[:, fill_end:] -= num_pad + mrope_pos_ids[:, fill_start:fill_end] = img_pos_ids + elif mm_data.modality == Modality.VIDEO: + video_token_id = self.config.video_token_id + grid_thw = mm_data.meta['grid_thw'] + + grid_thw = torch.repeat_interleave(grid_thw, grid_thw[:, 0], dim=0) + grid_thw[:, 0] = 1 + + position_ids_list = [] + input_tokens = context.input_ids.tolist()[0] + + st = 0 + # treat each frame separately as a single image + for video_idx in range(grid_thw.shape[0]): + # text before video. e.g. <0.3 seconds><|vision_start|> ... + ed_video = input_tokens.index(video_token_id, st) + ed = ed_video + text_len = ed - st + st_idx = position_ids_list[-1].max() + 1 if len(position_ids_list) > 0 else 0 + text_pos_ids = torch.arange(text_len, device=pos_ids.device).view(1, -1).expand(3, -1) + st_idx + position_ids_list.append(text_pos_ids) + + # video frame. ... <|video_end|> + t, h, w = ( + grid_thw[video_idx][0], + grid_thw[video_idx][1] // 2, + grid_thw[video_idx][2] // 2, + ) + video_pos_ids = self._get_multimodal_pos_ids(grid_thw[video_idx], pos_ids.device) + position_ids_list.append(video_pos_ids + text_len + st_idx) + + st = ed + t * h * w + + # text after video, <|vision_end|> ... 
+ if st < len(input_tokens): + st_idx = position_ids_list[-1].max() + 1 if len(position_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + text_pos_ids = torch.arange(text_len, device=pos_ids.device).view(1, -1).expand(3, -1) + st_idx + position_ids_list.append(text_pos_ids) + + mrope_pos_ids = torch.cat(position_ids_list, dim=1).reshape(3, -1) + mrope_delta = mrope_pos_ids.max() + 1 - pos_ids.size(0) + mrope_pos_ids += pos_start # add back the original position offset mrope_position_ids.append(mrope_pos_ids) new_model_metas.append(dict(mrope_delta=mrope_delta)) diff --git a/lmdeploy/pytorch/models/qwen3_5_moe.py b/lmdeploy/pytorch/models/qwen3_5_moe.py index c475b873a5..98d0970ae2 100644 --- a/lmdeploy/pytorch/models/qwen3_5_moe.py +++ b/lmdeploy/pytorch/models/qwen3_5_moe.py @@ -15,10 +15,10 @@ from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .patch import add_prefix, get_build_model_context -from .qwen2_5_vl import Qwen2_5_VLInputProcessor as Qwen3_5MoeInputProcessor from .qwen3_5 import (Qwen3_5Attention, Qwen3_5DecoderLayer, Qwen3_5ForConditionalGeneration, Qwen3_5GatedDeltaNet, Qwen3_5MLP, Qwen3_5Model, Qwen3_5TextModel, Qwen3_5TextRotaryEmbedding) from .qwen3_5 import Qwen3_5VisionModel as Qwen3_5MoeVisionModel +from .qwen3_vl import Qwen3VLInputProcessor as Qwen3_5MoeInputProcessor class Qwen3_5MoeTopKRouter(nn.Module): @@ -51,7 +51,6 @@ def __init__(self, device: torch.device | None = None, prefix: str = ''): super().__init__() - # TODO: zhouxinyu, determine modules_to_not_convert from config file quantization_config = getattr(config, 'quantization_config', None) self.layer_idx = layer_idx self.hidden_dim = config.hidden_size diff --git a/lmdeploy/pytorch/models/qwen3_moe.py b/lmdeploy/pytorch/models/qwen3_moe.py index 5157199175..098194b8dd 100644 --- a/lmdeploy/pytorch/models/qwen3_moe.py +++ b/lmdeploy/pytorch/models/qwen3_moe.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Tuple import torch from torch import nn @@ -95,7 +95,7 @@ def forward( self, hidden_states: torch.Tensor, rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + past_key_value: Tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -198,7 +198,6 @@ def __init__(self, device: torch.device = None, prefix: str = ''): super().__init__() - # TODO: zhouxinyu, determine modules_to_not_convert from config file quantization_config = getattr(config, 'quantization_config', None) self.layer_idx = layer_idx self.hidden_dim = config.hidden_size @@ -318,8 +317,8 @@ def forward( self, hidden_states: torch.Tensor, rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + past_key_value: List[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, all_routed_experts: torch.Tensor = None, ): @@ -396,10 +395,10 @@ def __init__(self, def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: List[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, all_routed_experts: torch.Tensor = None, ): """Rewrite of LlamaModel.forward.""" @@ -519,7 +518,7 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" diff --git 
a/lmdeploy/pytorch/models/qwen3_next.py b/lmdeploy/pytorch/models/qwen3_next.py index 4c56c01aa3..89953ecee9 100644 --- a/lmdeploy/pytorch/models/qwen3_next.py +++ b/lmdeploy/pytorch/models/qwen3_next.py @@ -369,7 +369,6 @@ def __init__(self, dtype: torch.dtype = None, device: torch.device = None): super().__init__() - # TODO: zhouxinyu, determine modules_to_not_convert from config file quantization_config = getattr(config, 'quantization_config', None) self.layer_idx = layer_idx self.hidden_dim = config.hidden_size diff --git a/lmdeploy/pytorch/models/qwen3_vl.py b/lmdeploy/pytorch/models/qwen3_vl.py index a6f694c6f2..0d0434a58f 100644 --- a/lmdeploy/pytorch/models/qwen3_vl.py +++ b/lmdeploy/pytorch/models/qwen3_vl.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from functools import lru_cache -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Tuple import numpy as np import torch @@ -9,16 +9,17 @@ from transformers.configuration_utils import PretrainedConfig from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update -from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor +from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import LayerNorm from lmdeploy.pytorch.nn.linear import build_colwise_linear, build_rowwise_linear from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight +from lmdeploy.vl.constants import Modality from .patch import add_prefix from .qwen2_5_vl import Qwen2_5_VisionRotaryEmbedding as Qwen3VLVisionRotaryEmbedding -from .qwen2_5_vl import Qwen2_5_VLInputProcessor as Qwen3VLInputProcessor from .qwen2_5_vl import 
Qwen2_5_VLVisionAttention as Qwen3VLVisionAttention from .qwen3 import Qwen3model from .utils.cudagraph import CudaGraphMeta, CudaGraphMixin @@ -116,23 +117,16 @@ def __init__(self, def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: List[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, mrope_position_ids: torch.LongTensor = None, # args for deepstack - visual_pos_masks: Optional[torch.Tensor] = None, - deepstack_visual_embeds: Optional[list[torch.Tensor]] = None, + visual_pos_masks: torch.Tensor | None = None, + deepstack_visual_embeds: List[torch.Tensor] | None = None, ): - """visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, - *optional*): - - The mask of the visual positions. deepstack_visual_embeds (`list[torch.Tensor]`, *optional*): The deepstack - visual embeddings. The shape is (num_layers, visual_seqlen, embed_dim). The feature is extracted from the - different visual encoder layers, and fed to the decoder hidden states. 
It's from the paper DeepStack ( - https://arxiv.org/abs/2406.04) - """ + """Rewrite of LlamaModel.forward.""" # token embedding if inputs_embeds is None: @@ -279,7 +273,7 @@ def __init__( def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: Optional[torch.Tensor] = None) -> torch.Tensor: + rotary_pos_emb: torch.Tensor | None = None) -> torch.Tensor: hidden_states = hidden_states + self.attn( self.norm1(hidden_states), cu_seqlens=cu_seqlens, @@ -610,7 +604,7 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -627,14 +621,20 @@ def prepare_inputs_for_generation( grid_thw = None pos_embeds = None if context.input_multimodals is not None: - image_data = [input_mm.get('image', []) for input_mm in context.input_multimodals] - if len(image_data) > 0: - # flatten batch - image_data = [data for im_data in image_data for data in im_data] - pixel_values = torch.cat([data.data for data in image_data]) - image_token_id = image_data[0].meta['image_token_id'] - image_mask = input_ids == image_token_id - grid_thw = torch.cat([data.meta['grid_thw'] for data in image_data]).cpu() + mm_inputs = [input_mm.get('mm_data', []) for input_mm in context.input_multimodals] + # flatten batch + mm_inputs = [item for sublist in mm_inputs for item in sublist] + + if len(mm_inputs) > 0: + modality = mm_inputs[0].modality + pixel_values = torch.cat([inp.data for inp in mm_inputs]) + + image_token_id = mm_inputs[0].meta.get('image_token_id') + video_token_id = mm_inputs[0].meta.get('video_token_id') + mm_token_id = image_token_id if modality == Modality.IMAGE else video_token_id + image_mask = (input_ids == mm_token_id) + + grid_thw = torch.cat([data.meta['grid_thw'] for data in mm_inputs]).cpu() vis_pos_emb = self.visual.rot_pos_emb(grid_thw) 
pos_embeds = self.visual.fast_pos_embed_interpolate(grid_thw) vis_cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], @@ -793,9 +793,10 @@ def _update_model_meta_prefilling(self, context: StepContext): mrope_position_ids = [] new_model_metas = [] for pos_ids, model_meta, input_mm in zip(batched_pos_ids, model_metas, input_multimodals): - images = [] + mm_data_list = [] if input_mm is not None: - images = input_mm.get('image', []) + mm_data_list.extend(input_mm.get('mm_data', [])) + if model_meta is None or 'mrope_delta' not in model_meta: mrope_delta = 0 else: @@ -804,19 +805,63 @@ def _update_model_meta_prefilling(self, context: StepContext): pos_start = pos_ids[0].item() mrope_pos_ids = pos_ids + mrope_delta mrope_pos_ids = mrope_pos_ids[None].expand(3, -1).clone() - for img in images: - grid_thw = img.meta['grid_thw'][0].tolist() - _, h, w = grid_thw - h //= 2 - w //= 2 - num_pad = img.end - img.start - max(h, w) - mrope_delta -= num_pad - fill_start = img.start - pos_start - fill_end = img.end - pos_start - img_pos_ids = self._get_multimodal_pos_ids(grid_thw, pos_ids.device) - img_pos_ids += mrope_pos_ids[:, fill_start:fill_start + 1] - mrope_pos_ids[:, fill_end:] -= num_pad - mrope_pos_ids[:, fill_start:fill_end] = img_pos_ids + + for mm_data in mm_data_list: + if mm_data.modality == Modality.IMAGE: + grid_thw = mm_data.meta['grid_thw'][0].tolist() + _, h, w = grid_thw + h //= 2 + w //= 2 + num_pad = mm_data.end - mm_data.start - max(h, w) + mrope_delta -= num_pad + fill_start = mm_data.start - pos_start + fill_end = mm_data.end - pos_start + img_pos_ids = self._get_multimodal_pos_ids(grid_thw, pos_ids.device) + img_pos_ids += mrope_pos_ids[:, fill_start:fill_start + 1] + mrope_pos_ids[:, fill_end:] -= num_pad + mrope_pos_ids[:, fill_start:fill_end] = img_pos_ids + elif mm_data.modality == Modality.VIDEO: + video_token_id = self.config.video_token_id + grid_thw = mm_data.meta['grid_thw'] + + grid_thw = torch.repeat_interleave(grid_thw, 
grid_thw[:, 0], dim=0) + grid_thw[:, 0] = 1 + + position_ids_list = [] + input_tokens = context.input_ids.tolist()[0] + + st = 0 + # treat each frame separately as a single image + for video_idx in range(grid_thw.shape[0]): + # text before video. e.g. <0.3 seconds><|vision_start|> ... + ed_video = input_tokens.index(video_token_id, st) + ed = ed_video + text_len = ed - st + st_idx = position_ids_list[-1].max() + 1 if len(position_ids_list) > 0 else 0 + text_pos_ids = torch.arange(text_len, device=pos_ids.device).view(1, -1).expand(3, -1) + st_idx + position_ids_list.append(text_pos_ids) + + # video frame. ... <|video_end|> + t, h, w = ( + grid_thw[video_idx][0], + grid_thw[video_idx][1] // 2, + grid_thw[video_idx][2] // 2, + ) + video_pos_ids = self._get_multimodal_pos_ids(grid_thw[video_idx], pos_ids.device) + position_ids_list.append(video_pos_ids + text_len + st_idx) + + st = ed + t * h * w + + # text after video, <|vision_end|> ... + if st < len(input_tokens): + st_idx = position_ids_list[-1].max() + 1 if len(position_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + text_pos_ids = torch.arange(text_len, device=pos_ids.device).view(1, -1).expand(3, -1) + st_idx + position_ids_list.append(text_pos_ids) + + mrope_pos_ids = torch.cat(position_ids_list, dim=1).reshape(3, -1) + mrope_delta = mrope_pos_ids.max() + 1 - pos_ids.size(0) + mrope_pos_ids += pos_start # add back the original position offset mrope_position_ids.append(mrope_pos_ids) new_model_metas.append(dict(mrope_delta=mrope_delta)) @@ -828,7 +873,7 @@ def _update_model_meta_prefilling(self, context: StepContext): def update_model_metas(self, past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + inputs_embeds: torch.Tensor | None = None, context: StepContext = None): """Update model meta.""" if context.is_decoding: @@ -841,4 +886,68 @@ def get_input_processor(self) -> BaseModelInputProcessor: return self.input_processor -InputMultiModalType = List[Dict[str, 
Any]] +class Qwen3VLInputProcessor(BaseModelInputProcessor): + """Qwen3 input processor.""" + + def __init__(self, config: PretrainedConfig) -> None: + self.config = config + + def _make_image_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData: + """Make image MultiModalData.""" + pixel_values = input_mm['pixel_values'] + image_grid_thw = input_mm['image_grid_thw'] + offset = input_mm['offset'] + start = offset + image_token_id = input_mm['image_token_id'] + num_pad = input_mm['image_tokens'] + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalData(modality=Modality.IMAGE, + data=pixel_values, + start=start, + end=start + num_pad, + meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) + return mm_data + + def _make_video_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData: + """Make video MultiModalData.""" + pixel_values_videos = input_mm['pixel_values_videos'] + video_grid_thw = input_mm['video_grid_thw'] + offset = input_mm['offset'] + start = offset + video_token_id = input_mm['video_token_id'] + num_pad = input_mm['video_tokens'] + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalData(modality=Modality.VIDEO, + data=pixel_values_videos, + start=start, + end=start + num_pad, + meta=dict( + grid_thw=video_grid_thw, + video_token_id=video_token_id, + )) + return mm_data + + def preprocess_input(self, + input_ids: List[int], + input_multimodals: List[Dict[str, Any]] = None, + **kwargs) -> PreprocessInputResult: + """Prepare multimodal input.""" + if input_multimodals is None or len(input_multimodals) == 0: + return input_ids, input_multimodals + + input_mm_data = [] + for input_mm in input_multimodals: + modality = input_mm.get('modality') + if modality == Modality.IMAGE: + mm_data = self._make_image_mm_data(input_mm) + elif modality == Modality.VIDEO: + mm_data = self._make_video_mm_data(input_mm) + input_mm_data.append(mm_data) + + result = 
PreprocessInputResult(input_ids=input_ids, input_multimodals=dict(mm_data=input_mm_data)) + + return result diff --git a/lmdeploy/pytorch/models/qwen3_vl_moe.py b/lmdeploy/pytorch/models/qwen3_vl_moe.py index 5810ab9b11..0b453f0a39 100644 --- a/lmdeploy/pytorch/models/qwen3_vl_moe.py +++ b/lmdeploy/pytorch/models/qwen3_vl_moe.py @@ -44,14 +44,7 @@ def forward( visual_pos_masks: Optional[torch.Tensor] = None, deepstack_visual_embeds: Optional[list[torch.Tensor]] = None, ): - """visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, - *optional*): - - The mask of the visual positions. deepstack_visual_embeds (`list[torch.Tensor]`, *optional*): The deepstack - visual embeddings. The shape is (num_layers, visual_seqlen, embed_dim). The feature is extracted from the - different visual encoder layers, and fed to the decoder hidden states. It's from the paper DeepStack ( - https://arxiv.org/abs/2406.04) - """ + """Rewrite of LlamaModel.forward.""" # token embedding if inputs_embeds is None: diff --git a/lmdeploy/pytorch/models/utils/multimodal.py b/lmdeploy/pytorch/models/utils/multimodal.py deleted file mode 100644 index 699f88021f..0000000000 --- a/lmdeploy/pytorch/models/utils/multimodal.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Tuple - -from lmdeploy.pytorch.multimodal.data_type import MultiModalInputs - -PreparedInputs = Tuple[List[int], MultiModalInputs] - - -class MultiModalMixin: - - def prepare_multimodal_input(self, input_ids, input_multimodals, **kwargs) -> PreparedInputs: - """Prepare multimodals inputs.""" - raise NotImplementedError('prepare input not implemented.') diff --git a/lmdeploy/pytorch/multimodal/__init__.py b/lmdeploy/pytorch/multimodal/__init__.py index c3e8c6a16f..fc2d5890d9 100644 --- a/lmdeploy/pytorch/multimodal/__init__.py +++ b/lmdeploy/pytorch/multimodal/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from .data_type import MultiModalData, MultiModalTensor +from .data_type import MultiModalData -__all__ = ['MultiModalData', 'MultiModalTensor'] +__all__ = ['MultiModalData'] diff --git a/lmdeploy/pytorch/multimodal/data_type.py b/lmdeploy/pytorch/multimodal/data_type.py index dd3ec9a37d..4ff71fdcbc 100644 --- a/lmdeploy/pytorch/multimodal/data_type.py +++ b/lmdeploy/pytorch/multimodal/data_type.py @@ -4,24 +4,20 @@ from torch import Tensor - -class MultiModalData: - pass - - -MultiModalDataList = List[MultiModalData] +from lmdeploy.vl.constants import Modality NestedTensor = Union[Tensor, List[Tensor]] @dataclass -class MultiModalTensor: +class MultiModalData: data: NestedTensor start: int end: int = None - encoder_len: int = None meta: Dict[str, Any] = None + modality: Modality = Modality.IMAGE + def __post_init__(self): if self.end is None: self.end = self.start @@ -53,7 +49,7 @@ def to_device(self, device: str, non_blocking: bool = False): new_meta[k] = v out_dict['meta'] = new_meta - return MultiModalTensor(**out_dict) + return MultiModalData(**out_dict) -MultiModalInputs = Dict[str, List[MultiModalTensor]] +MultiModalInputs = Dict[str, List[MultiModalData]] diff --git a/lmdeploy/pytorch/multimodal/image_type.py b/lmdeploy/pytorch/multimodal/image_type.py deleted file mode 100644 index 19211a381f..0000000000 --- a/lmdeploy/pytorch/multimodal/image_type.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from dataclasses import dataclass -from typing import Any, ClassVar, Dict - -from PIL import Image - -from .data_type import MultiModalData - - -@dataclass -class ImageData(MultiModalData): - data: Image - loc: int - meta: Dict[str, Any] = None - type: ClassVar[str] = 'image' diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py index 2a68483ad4..17cdd90e7f 100644 --- a/lmdeploy/serve/core/async_engine.py +++ b/lmdeploy/serve/core/async_engine.py @@ -294,6 +294,7 @@ async def generate( input_ids: List | None = None, enable_thinking: bool | None = None, chat_template_kwargs: Dict | None = None, + media_io_kwargs: Dict[str, Any] | None = None, mm_processor_kwargs: Dict[str, Any] | None = None, **kwargs): """Generate responses. @@ -338,6 +339,7 @@ async def generate( tools=tools, reasoning_effort=reasoning_effort, chat_template_kwargs=chat_template_kwargs, + media_io_kwargs=media_io_kwargs, mm_processor_kwargs=mm_processor_kwargs, **kwargs) prompt = prompt_input['prompt'] diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 3e37caffe5..5d22aea8d3 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -482,6 +482,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque do_preprocess=do_preprocess, adapter_name=adapter_name, chat_template_kwargs=chat_template_kwargs or None, + media_io_kwargs=request.media_io_kwargs, mm_processor_kwargs=request.mm_processor_kwargs) def create_stream_response_json(index: int, @@ -973,6 +974,7 @@ async def generate(request: GenerateReqInput, raw_request: Request = None): sequence_start=True, sequence_end=True, do_preprocess=False, + media_io_kwargs=request.media_io_kwargs, mm_processor_kwargs=request.mm_processor_kwargs) def create_generate_response_json(res, text, output_ids, logprobs, finish_reason, routed_experts=None): diff --git a/lmdeploy/serve/openai/protocol.py 
b/lmdeploy/serve/openai/protocol.py index 9749ade086..3b8a33c362 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -152,11 +152,17 @@ class ChatCompletionRequest(BaseModel): enable_thinking: Optional[bool] = None # will be deprecated in the future return_token_ids: Optional[bool] = False include_stop_str_in_output: Optional[bool] = False + # kwargs for chat template renderer chat_template_kwargs: dict[str, Any] | None = Field( default=None, description=('Additional keyword args to pass to the template renderer. ' 'Will be accessible by the chat template.'), ) + # kwargs for media IO + media_io_kwargs: Optional[dict[str, Any]] = Field( + default=None, + description=('Additional kwargs to pass to the media IO processing, keyed by modality.'), + ) # kwargs for hf processor mm_processor_kwargs: Optional[dict[str, Any]] = Field( default=None, diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py index 2ea599e687..e05163124a 100644 --- a/lmdeploy/serve/processors/multimodal.py +++ b/lmdeploy/serve/processors/multimodal.py @@ -7,6 +7,11 @@ from lmdeploy.model import MODELS, BaseChatTemplate from lmdeploy.tokenizer import Tokenizer from lmdeploy.utils import get_logger +from lmdeploy.vl.constants import Modality +from lmdeploy.vl.media.connection import load_from_url +from lmdeploy.vl.media.image import ImageMediaIO +from lmdeploy.vl.media.time_series import TimeSeriesMediaIO +from lmdeploy.vl.media.video import VideoMediaIO logger = get_logger('lmdeploy') @@ -85,122 +90,66 @@ def merge_message_content(msg: Dict) -> Dict: return result @staticmethod - async def async_convert_multimodal_data(messages: List[Dict]) -> List[Dict]: - """Convert user-input multimodal data into GPT4V message format.""" - from lmdeploy.vl.time_series_utils import load_time_series - from lmdeploy.vl.utils import load_image + def _parse_multimodal_item(i: int, in_messages: List[Dict], out_messages: List[Dict], 
media_io_kwargs: Dict[str, + Any]): + """Synchronous helper to parse a single multimodal message item.""" + role = in_messages[i]['role'] + content = in_messages[i]['content'] + + if role != 'user' or isinstance(content, str): + out_messages[i] = in_messages[i] + return + + assert isinstance(content, list) + out_message = dict(role=role, content=[]) + + for item in content: + item_type = item.get('type') + if item_type == 'text': + out_message['content'].append(item) + continue + + item_params = item.get(item_type, {}).copy() + data_src = item_params.pop('url', None) or item_params.pop('data', None) + + if item_type == 'image_data': + modality = Modality.IMAGE + data = data_src + elif item_type == 'image_url': + modality = Modality.IMAGE + img_io = ImageMediaIO(**media_io_kwargs.get('image', {})) + data = load_from_url(data_src, img_io) + elif item_type == 'video_url': + modality = Modality.VIDEO + vid_io = VideoMediaIO(image_io=ImageMediaIO(), **media_io_kwargs.get('video', {})) + data, metadata = load_from_url(data_src, vid_io) + item_params['video_metadata'] = metadata + elif item_type == 'time_series_url': + modality = Modality.TIME_SERIES + ts_io = TimeSeriesMediaIO(**media_io_kwargs.get('time_series', {})) + data = load_from_url(data_src, ts_io) + else: + raise NotImplementedError(f'unknown type: {item_type}') + + out_message['content'].append({'type': modality, 'data': data, **item_params}) + + out_messages[i] = out_message - if isinstance(messages, Dict): + @staticmethod + async def async_parse_multimodal_item(messages: List[Dict], + media_io_kwargs: Dict[str, Any] | None = None) -> List[Dict]: + """Convert user-input multimodal data into GPT4V message format.""" + if isinstance(messages, dict): messages = [messages] - assert isinstance(messages, List) + assert isinstance(messages, list) out_messages = [None] * len(messages) - - def _inner_call(i, in_messages, out_messages): - role = in_messages[i]['role'] - content = in_messages[i]['content'] - assert role 
in ['system', 'user', 'assistant'], \ - f'unsupported role "{role}"' - if role != 'user' or isinstance(content, str): - # the content is a user's prompt or an assistant's prompt, - # returning it directly - out_messages[i] = in_messages[i] - return - # the role is a user and the content is a list, in which there - # might be image_url or image_data - assert isinstance(content, List) - message = dict(role=role, content=[]) - for item in content: - # image url or base64-encoded image data - if item['type'] == 'image_url': - """ - convert the following item: - { - 'type': 'image_url', - 'image_url': { - 'url': 'image url or base64-encoded image data', - 'key': 'value' # parameters used in image processing - ... - } - } - to: - { - 'type': 'image', - 'image': Pillow.Image, - 'key': 'value' # parameters used in image processing - ... - } - """ # noqa - data = item['image_url'].copy() - try: - url = data.pop('url') - image = load_image(url) - data.update(type='image', image=image) - message['content'].append(data) - except KeyError: - logger.error(f'invalid format {message}') - elif item['type'] == 'image_data': - """ - convert the following item: - { - 'type': 'image_data', - 'image_data': { - 'data': Pillow.Image, - 'key': 'value' # parameters used in image processing - ... - } - } - to: - { - 'type': 'image', - 'image': Pillow.Image, - 'key': 'value' # parameters used in image processing - ... - } - """ # noqa - data = item['image_data'].copy() - try: - image = data.pop('data') - data.update(type='image', image=image) - message['content'].append(data) - except KeyError: - logger.error(f'invalid format {message}') - elif item['type'] == 'time_series_url': - """ - convert the following item: - { - 'type': 'time_series_url', - 'time_series_url': { - 'url': 'time series url or base64-encoded time series data', - 'key': 'value' # parameters used in time series processing - ... 
- } - } - to: - { - 'type': 'time_series', - 'time_series': np.ndarray, - 'key': 'value' # parameters used in time series processing - ... - } - """ # noqa - data = item['time_series_url'].copy() - try: - url = data.pop('url') - time_series = load_time_series(url) - data.update(type='time_series', time_series=time_series) - message['content'].append(data) - except KeyError: - logger.error(f'invalid format {message}') - elif item['type'] in ['text']: - message['content'].append(item) - else: - logger.error(f'unexpected content type {message}') - out_messages[i] = message + media_io_kwargs = media_io_kwargs or {} + loop = asyncio.get_event_loop() await asyncio.gather(*[ - asyncio.get_event_loop().run_in_executor(None, _inner_call, i, messages, out_messages) - for i in range(len(messages)) + loop.run_in_executor(None, MultimodalProcessor._parse_multimodal_item, i, messages, out_messages, + media_io_kwargs) for i in range(len(messages)) ]) return out_messages @@ -212,6 +161,7 @@ async def get_prompt_input(self, tools: List[object] | None = None, reasoning_effort: Literal['low', 'medium', 'high'] | None = None, chat_template_kwargs: Dict | None = None, + media_io_kwargs: Dict[str, Any] | None = None, mm_processor_kwargs: Dict[str, Any] | None = None, **kwargs): """Process prompt and return prompt string and input_ids. @@ -227,6 +177,7 @@ async def get_prompt_input(self, tools: Optional list of tools. reasoning_effort: Optional reasoning effort level. chat_template_kwargs: Optional kwargs for chat template. + media_io_kwargs: Optional kwargs for media IO operations. mm_processor_kwargs: Optional kwargs for multimodal processor. **kwargs: Additional keyword arguments. 
@@ -268,6 +219,7 @@ async def get_prompt_input(self, adapter_name=adapter_name, tools=tools, chat_template_kwargs=chat_template_kwargs, + media_io_kwargs=media_io_kwargs, mm_processor_kwargs=mm_processor_kwargs, **kwargs) else: @@ -320,7 +272,7 @@ def _is_image_list(obj) -> bool: @staticmethod def _re_format_prompt_images_pair(prompt: Tuple) -> Dict: """Reformat the prompt to openai message format.""" - from lmdeploy.vl.utils import load_image + from lmdeploy.vl import load_image messages = {'role': 'user', 'content': []} prompt, images = prompt @@ -352,7 +304,7 @@ def _re_format_prompt_images_pair(prompt: Tuple) -> Dict: def _has_multimodal_input(self, messages: List[Dict]) -> bool: """Check if messages contain multimodal input (images).""" - multimodal_types = ['image_url', 'image_data', 'time_series_url'] + multimodal_types = ['image_url', 'image_data', 'video_url', 'time_series_url'] return any( isinstance(message.get('content'), list) and any( item.get('type') in multimodal_types for item in message['content']) for message in messages) @@ -397,12 +349,13 @@ async def _get_multimodal_prompt_input(self, adapter_name: str, tools: List[object] | None = None, chat_template_kwargs: Dict | None = None, + media_io_kwargs: Dict[str, Any] | None = None, mm_processor_kwargs: Dict[str, Any] | None = None, **kwargs): """Process multimodal prompt and return processed data for inference engines.""" chat_template = self.chat_template if do_preprocess else BaseChatTemplate() - messages = await self.async_convert_multimodal_data(messages) + messages = await self.async_parse_multimodal_item(messages, media_io_kwargs) results = await self.vl_encoder.preprocess(messages, mm_processor_kwargs) if self.backend == 'turbomind': @@ -412,19 +365,19 @@ async def _get_multimodal_prompt_input(self, # embedding_ranges and so on. 
All the returned values are passed # to tm engine for token generation results = await self.vl_encoder.async_infer(results) - results = await self.vl_encoder.wrap_for_turbomind(results, - chat_template, - self.tokenizer, - sequence_start, + results = await self.vl_encoder.wrap_for_turbomind(messages=results, + chat_template=chat_template, + tokenizer=self.tokenizer, + sequence_start=sequence_start, tools=tools, chat_template_kwargs=chat_template_kwargs) elif self.backend == 'pytorch': # for pt engine, this module only conduct the image preprocessing # It leaves the vision embedding to the pt engine - results = await self.vl_encoder.wrap_for_pytorch(results, - chat_template, - self.tokenizer, - sequence_start, + results = await self.vl_encoder.wrap_for_pytorch(messages=results, + chat_template=chat_template, + tokenizer=self.tokenizer, + sequence_start=sequence_start, tools=tools, chat_template_kwargs=chat_template_kwargs) return results diff --git a/lmdeploy/vl/__init__.py b/lmdeploy/vl/__init__.py index 7e800b2db2..5c8c69d8b5 100644 --- a/lmdeploy/vl/__init__.py +++ b/lmdeploy/vl/__init__.py @@ -1,5 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .time_series_utils import load_time_series -from .utils import load_image +from .utils import (encode_image_base64, encode_time_series_base64, encode_video_base64, load_image, load_time_series, + load_video) -__all__ = ['load_image', 'load_time_series'] +__all__ = [ + 'load_image', + 'load_video', + 'load_time_series', + 'encode_image_base64', + 'encode_video_base64', + 'encode_time_series_base64', +] diff --git a/lmdeploy/vl/constants.py b/lmdeploy/vl/constants.py index eeaa697cdf..e0cd744f15 100644 --- a/lmdeploy/vl/constants.py +++ b/lmdeploy/vl/constants.py @@ -1,3 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-IMAGE_DUMMY_TOKEN_INDEX = 0
+from enum import Enum
+
 IMAGE_TOKEN = '<IMAGE_TOKEN>'
+
+
+class Modality(str, Enum):
+    IMAGE = 'image'
+    VIDEO = 'video'
+    AUDIO = 'audio'
+    TIME_SERIES = 'time_series'
diff --git a/lmdeploy/vl/media/__init__.py b/lmdeploy/vl/media/__init__.py
new file mode 100644
index 0000000000..ef101fec61
--- /dev/null
+++ b/lmdeploy/vl/media/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/lmdeploy/vl/media/base.py b/lmdeploy/vl/media/base.py
new file mode 100644
index 0000000000..2ba58897a9
--- /dev/null
+++ b/lmdeploy/vl/media/base.py
@@ -0,0 +1,23 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/media/base.py
+
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Generic, TypeVar
+
+_T = TypeVar('_T')
+
+
+class MediaIO(ABC, Generic[_T]):
+
+    @abstractmethod
+    def load_bytes(self, data: bytes) -> _T:
+        raise NotImplementedError
+
+    @abstractmethod
+    def load_base64(self, media_type: str, data: str) -> _T:
+        raise NotImplementedError
+
+    @abstractmethod
+    def load_file(self, filepath: Path) -> _T:
+        raise NotImplementedError
diff --git a/lmdeploy/vl/media/connection.py b/lmdeploy/vl/media/connection.py
new file mode 100644
index 0000000000..7389d67d8f
--- /dev/null
+++ b/lmdeploy/vl/media/connection.py
@@ -0,0 +1,70 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os +from pathlib import Path +from typing import TypeVar +from urllib.parse import ParseResult, urlparse +from urllib.request import url2pathname + +import requests + +from .base import MediaIO + +_M = TypeVar('_M') + +FETCH_TIMEOUT = int(os.environ.get('LMDEPLOY_FETCH_TIMEOUT', 10)) +headers = { + 'User-Agent': + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' +} + + +def _load_http_url(url_spec: ParseResult, media_io: MediaIO[_M]) -> _M: + if url_spec.scheme not in ('http', 'https'): + raise ValueError(f'Unsupported URL scheme: {url_spec.scheme}') + + client = requests.Session() + # TODO: zhouxinyu, timeout for video should be longer + response = client.get(url_spec.geturl(), headers=headers, timeout=FETCH_TIMEOUT) + response.raise_for_status() + + return media_io.load_bytes(response.content) + + +def _load_data_url(url_spec: ParseResult, media_io: MediaIO[_M]) -> _M: + url_spec_path = url_spec.path or '' + data_spec, data = url_spec_path.split(',', 1) + media_type, data_type = data_spec.split(';', 1) + # media_type starts with a leading "/" (e.g., "/video/jpeg") + media_type = media_type.lstrip('/') + + if data_type != 'base64': + msg = 'Only base64 data URLs are supported for now.' 
+ raise NotImplementedError(msg) + + return media_io.load_base64(media_type, data) + + +def _load_file_url(url_spec: ParseResult, media_io: MediaIO[_M]) -> _M: + url_spec_path = url_spec.path or '' + url_spec_netloc = url_spec.netloc or '' + filepath = Path(url2pathname(url_spec_netloc + url_spec_path)) + return media_io.load_file(filepath) + + +def load_from_url(url: str, media_io: MediaIO[_M]) -> _M: + """Load media from a HTTP, data or file url.""" + url_spec = urlparse(url) + + if url_spec.scheme and url_spec.scheme.startswith('http'): + return _load_http_url(url_spec, media_io) + + if url_spec.scheme == 'data': + return _load_data_url(url_spec, media_io) + + # file url or raw file path (absolute or relative) + if url_spec.scheme == 'file' or os.path.exists(url) or os.path.exists(url_spec.path): + return _load_file_url(url_spec, media_io) + + msg = 'The URL must be either a HTTP, data or file URL.' + raise ValueError(msg) diff --git a/lmdeploy/vl/media/image.py b/lmdeploy/vl/media/image.py new file mode 100644 index 0000000000..7a6c12a086 --- /dev/null +++ b/lmdeploy/vl/media/image.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/media/image.py + +from io import BytesIO +from pathlib import Path + +import pybase64 +from PIL import Image, ImageFile + +from .base import MediaIO + +ImageFile.LOAD_TRUNCATED_IMAGES = True + + +class ImageMediaIO(MediaIO[Image.Image]): + + def __init__(self, image_mode: str = 'RGB', **kwargs) -> None: + super().__init__() + self.image_mode = image_mode + + # for potential custom arguments from --media-io-kwargs + self.kwargs = kwargs + + def load_bytes(self, data: bytes) -> Image.Image: + image = Image.open(BytesIO(data)) + return image.convert(self.image_mode) + + def load_base64(self, media_type: str, data: str) -> Image.Image: + return self.load_bytes(pybase64.b64decode(data)) + + def load_file(self, file_path: Path) -> Image.Image: + with open(file_path, 'rb') as f: + data = f.read() + image = Image.open(BytesIO(data)) + return image.convert(self.image_mode) + + def encode_base64(self, image: Image.Image, image_format: str = 'PNG') -> str: + with BytesIO() as buffer: + image = image.convert(self.image_mode) + image.save(buffer, image_format) + data = buffer.getvalue() + + return pybase64.b64encode(data).decode('utf-8') diff --git a/lmdeploy/vl/media/time_series.py b/lmdeploy/vl/media/time_series.py new file mode 100644 index 0000000000..2e22ad278d --- /dev/null +++ b/lmdeploy/vl/media/time_series.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from io import BytesIO +from pathlib import Path + +import numpy as np +import numpy.typing as npt +import pybase64 + +from lmdeploy.utils import get_logger + +from .base import MediaIO + +logger = get_logger('lmdeploy') + + +class TimeSeriesMediaIO(MediaIO[npt.NDArray]): + + def __init__(self, **kwargs): + super().__init__() + + # for potential custom arguments from --media-io-kwargs + self.kwargs = kwargs + + def load_bytes(self, data: bytes) -> npt.NDArray: + ts_array = np.load(BytesIO(data), allow_pickle=False) + return ts_array + + def load_base64(self, media_type: str, data: str) -> npt.NDArray: + return self.load_bytes(pybase64.b64decode(data)) + + def load_file(self, filepath: Path) -> npt.NDArray: + suffix = filepath.suffix.lower() + + if suffix == '.npy': + return np.load(filepath, allow_pickle=False) + elif suffix == '.csv': + try: + ts_array = np.genfromtxt(filepath, delimiter=',', dtype=np.float32) + if ts_array.size == 0: + raise ValueError(f'CSV file {filepath} yielded no data.') + return ts_array + except Exception as e: + logger.error(f'Failed to load CSV {filepath}: {e}') + raise + elif suffix in ['.wav', '.mp3', '.flac']: + try: + import soundfile as sf + except ImportError: + raise ImportError('Please install soundfile via `pip install soundfile`.') + + ts_array, _ = sf.read(filepath) + return ts_array + + raise ValueError(f'Unsupported file format: {suffix}') + + def encode_base64(self, data: npt.NDArray) -> str: + """Encode numpy array to base64 string using NPY format.""" + buffer = BytesIO() + np.save(buffer, data, allow_pickle=False) + return pybase64.b64encode(buffer.getvalue()).decode('utf-8') diff --git a/lmdeploy/vl/media/video.py b/lmdeploy/vl/media/video.py new file mode 100644 index 0000000000..5fe0ff6477 --- /dev/null +++ b/lmdeploy/vl/media/video.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/media/video.py + +import base64 +from functools import partial +from pathlib import Path +from typing import Any + +import numpy as np +import numpy.typing as npt +from PIL import Image + +from lmdeploy.utils import get_logger + +from .base import MediaIO +from .image import ImageMediaIO +from .video_loader import (DecordVideoLoader, OpenCVVideoLoader, TorchCodecVideoLoader, TorchVisionVideoLoader, + VideoLoader) + +logger = get_logger('lmdeploy') + + +class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]): + + def __init__( + self, + image_io: ImageMediaIO, + num_frames: int = 32, + **kwargs, + ) -> None: + super().__init__() + + self.image_io = image_io + self.num_frames = num_frames + + # for potential custom arguments from --media-io-kwargs + self.kwargs = kwargs + self.video_loader = self._get_video_loader_backend() + + def _get_video_loader_backend(self) -> VideoLoader: + """Determines the best available video loader backend.""" + # vLLM: OpenCV + # SGLang: Decord + # qwen-vl-utils: TorchCodec -> Decord -> TorchVision (deprecated soon) + backends = [ + ('cv2', OpenCVVideoLoader), + ('decord', DecordVideoLoader), + ('torchcodec', TorchCodecVideoLoader), + ('torchvision', TorchVisionVideoLoader), + ] + + for module_name, loader_cls in backends: + try: + __import__(module_name) + return loader_cls() + except (ImportError, RuntimeError): + logger.warning(f"Video backend '{module_name}' not found. Trying next backend...") + continue + + raise ImportError( + 'No video backend found. 
Install either opencv-python-headless, decord, torchcodec, or torchvision.')
+
+    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]:
+        return self.video_loader.load_bytes(data, num_frames=self.num_frames, **self.kwargs)
+
+    def load_base64(self, media_type: str, data: str) -> tuple[npt.NDArray, dict[str, Any]]:
+        if media_type.lower() == 'video/jpeg':
+            load_frame = partial(
+                self.image_io.load_base64,
+                'image/jpeg',
+            )
+
+            # NOTE: known issue in https://github.com/QwenLM/Qwen3-VL/issues/1643
+            # when passing a video as a sequence of JPEG frames, we cannot obtain the video metadata
+            # therefore we construct a default metadata dictionary with common values.
+            frames = np.stack([np.asarray(load_frame(frame_data)) for frame_data in data.split(',')])
+
+            total_frames_num = int(frames.shape[0])
+            fps = float(self.kwargs.get('fps', 2))  # default to 2 fps if not specified
+            duration = (total_frames_num / fps) if fps > 0 else 0
+            frame_idx = list(range(total_frames_num))
+
+            metadata = {
+                'total_num_frames': total_frames_num,
+                'fps': fps,
+                'duration': duration,
+                'video_backend': 'jpeg_sequence',
+                'frames_indices': frame_idx,
+            }
+
+            logger.info('Loading video from base64-encoded JPEG frames misses video metadata. '
+                        f'Fall back to default metadata values:\n{metadata}')
+            return frames, metadata
+
+        return self.load_bytes(base64.b64decode(data))
+
+    def load_file(self, filepath: Path) -> tuple[npt.NDArray, dict[str, Any]]:
+        return self.video_loader.load_file(filepath, num_frames=self.num_frames, **self.kwargs)
+
+    def encode_base64(
+        self,
+        media: npt.NDArray,
+        *,
+        video_format: str = 'JPEG',
+    ) -> str:
+        video = media
+
+        if video_format == 'JPEG':
+            encode_frame = partial(
+                self.image_io.encode_base64,
+                image_format=video_format,
+            )
+
+            return ','.join(encode_frame(Image.fromarray(frame)) for frame in video)
+
+        msg = 'Only JPEG format is supported for now.'
+ raise NotImplementedError(msg) diff --git a/lmdeploy/vl/media/video_loader.py b/lmdeploy/vl/media/video_loader.py new file mode 100644 index 0000000000..0cf921f3e5 --- /dev/null +++ b/lmdeploy/vl/media/video_loader.py @@ -0,0 +1,342 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/video.py +# adapted from https://github.com/QwenLM/Qwen3-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py + +import math +import os +import tempfile +from abc import abstractmethod +from io import BytesIO +from pathlib import Path +from typing import Any + +import numpy as np +import numpy.typing as npt + +from lmdeploy.utils import get_logger + +logger = get_logger('lmdeploy') + + +class VideoLoader: + + @classmethod + @abstractmethod + def load_bytes(self, data: bytes, num_frames: int = -1, **kwargs) -> tuple[npt.NDArray, dict[str, Any]]: + raise NotImplementedError + + @classmethod + def smart_nframes(self, total_frames_num: int, num_frames: int, fps: int, duration: int) -> tuple[int, list[int]]: + # resample video to target num_frames and fps + # - the minimum of the two will be used + num_frames_to_sample = total_frames_num + if num_frames > 0: + num_frames_to_sample = min(num_frames, total_frames_num) + if fps > 0: + num_frames_to_sample = min(num_frames_to_sample, math.floor(duration * fps)) + num_frames_to_sample = max(1, num_frames_to_sample) # at least one sample + + if num_frames_to_sample == total_frames_num: + frame_idx = list(range(0, num_frames_to_sample)) + else: + uniform_sampled_frames = np.linspace(0, total_frames_num - 1, num_frames_to_sample, dtype=int) + frame_idx = uniform_sampled_frames.tolist() + return num_frames_to_sample, frame_idx + + +class OpenCVVideoLoader(VideoLoader): + + def get_cv2_video_api(self): + import cv2.videoio_registry as vr + + api_pref = None + for backend in vr.getStreamBufferedBackends(): + if not vr.hasBackend(backend): + continue + if not 
vr.isBackendBuiltIn(backend): + _, abi, api = vr.getStreamBufferedBackendPluginVersion(backend) + if abi < 1 or (abi == 1 and api < 2): + continue + api_pref = backend + break + return api_pref + + @staticmethod + def _read_frames( + cap, + frame_indices: set[int], + num_expected_frames: int, + max_frame_idx: int, + ) -> tuple[npt.NDArray, int, list[int]]: + import cv2 + + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + frames = np.empty((num_expected_frames, height, width, 3), dtype=np.uint8) # THWC + + i = 0 + valid_frame_indices = [] + for idx in range(max_frame_idx + 1): + ok = cap.grab() + if not ok: + # Frame is broken/unreadable, log warning + if idx in frame_indices: + logger.warning( + 'Failed to grab frame %d during video loading. ' + 'This frame will be skipped.', + idx, + ) + continue + if idx in frame_indices: + ret, frame = cap.retrieve() + if ret: + frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + valid_frame_indices.append(idx) + i += 1 + else: + # retrieve() failed even though grab() succeeded + logger.warning( + 'Failed to retrieve frame %d during video loading. ' + 'This frame will be skipped.', + idx, + ) + + valid_num_frames = len(valid_frame_indices) + if valid_num_frames < num_expected_frames: + logger.warning( + 'Video loading completed with %d broken/unreadable frames. 
' + 'Expected %d frames but only loaded %d frames.', + num_expected_frames - valid_num_frames, + num_expected_frames, + valid_num_frames, + ) + + return frames[:valid_num_frames], valid_num_frames, valid_frame_indices + + @classmethod + def load_file( + self, + filepath: Path, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + **kwargs, + ) -> tuple[npt.NDArray, dict[str, Any]]: + with open(filepath, 'rb') as f: + data = f.read() + return self.load_bytes(data, num_frames=num_frames, fps=fps, max_duration=max_duration, **kwargs) + + @classmethod + def load_bytes( + cls, + data: bytes, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + **kwargs, + ) -> tuple[npt.NDArray, dict[str, Any]]: + """Load video frames from bytes. + + Args: + data: Raw video bytes + num_frames: Target number of frames to sample (-1 for all) + fps: Target FPS for sampling (-1 for original) + max_duration: Maximum duration (unused in base backend) + + Returns: + Tuple of (frames_array, metadata_dict) + """ + import cv2 + + backend = cls().get_cv2_video_api() + cap = cv2.VideoCapture(BytesIO(data), backend, []) + if not cap.isOpened(): + raise ValueError('Could not open video stream') + + total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + original_fps = cap.get(cv2.CAP_PROP_FPS) + duration = total_frames_num / original_fps if original_fps > 0 else 0 + + num_frames_to_sample, frame_idx = cls.smart_nframes(total_frames_num, num_frames, fps, duration) + + frame_idx_set = set(frame_idx) + frames, valid_num_frames, valid_frame_indices = cls._read_frames(cap, frame_idx_set, num_frames_to_sample, + max(frame_idx)) + + # Use transformers transformers.video_utils.VideoMetadata format + # For models like Qwen3-VL/GLM4.5V, this metadata + # can cause incorrect timestamp calculation without num_frames=-1. 
+ # TODO: zhouxinyu, support per-request do_sample_frames + metadata = { + 'total_num_frames': total_frames_num, + 'fps': original_fps, + 'duration': duration, + 'video_backend': 'opencv', + 'frames_indices': valid_frame_indices, + # extra field used to control hf processor's video + # sampling behavior + # "do_sample_frames": valid_num_frames == total_frames_num, + } + return frames, metadata + + +class DecordVideoLoader(VideoLoader): + + @classmethod + def load_file(self, + filepath: Path, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + **kwargs) -> tuple[npt.NDArray, dict[str, Any]]: + import decord + vr = decord.VideoReader(str(filepath)) + total_frames_num = len(vr) + original_fps = vr.get_avg_fps() + duration = total_frames_num / original_fps if original_fps > 0 else 0 + + num_frames_to_sample, frame_idx = self.smart_nframes(total_frames_num, num_frames, fps, duration) + + video = vr.get_batch(frame_idx).asnumpy() # THWC + metadata = { + 'total_num_frames': total_frames_num, + 'fps': original_fps, + 'duration': duration, + 'video_backend': 'decord', + 'frames_indices': frame_idx, + } + return video, metadata + + @classmethod + def load_bytes(self, + data: bytes, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + **kwargs) -> tuple[npt.NDArray, dict[str, Any]]: + tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') + try: + tmp_file.write(data) + tmp_file.close() + return self.load_file(Path(tmp_file.name), + num_frames=num_frames, + fps=fps, + max_duration=max_duration, + **kwargs) + finally: + # always cleanup, even if load_file crashes + try: + os.unlink(tmp_file.name) + except OSError: + pass # file might not exist if write failed + + +class TorchCodecVideoLoader(VideoLoader): + + @classmethod + def load_file(self, + filepath: Path, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + **kwargs) -> tuple[npt.NDArray, dict[str, Any]]: + # torchcodec requires matched ffmpeg, torchcodec, 
and torch versions + # ffmpeg 5.1.2, torch 2.8.0, torchcodec 0.7.0 are verified to work together + from torchcodec.decoders import VideoDecoder + + torch_codec_num_threads = 8 + decoder = VideoDecoder(str(filepath), num_ffmpeg_threads=torch_codec_num_threads) + total_frames_num = decoder.metadata.num_frames + original_fps = decoder.metadata.average_fps + duration = total_frames_num / original_fps if original_fps > 0 else 0 + + num_frames_to_sample, frame_idx = self.smart_nframes(total_frames_num, num_frames, fps, duration) + + video = decoder.get_frames_at(frame_idx).data + metadata = { + 'total_num_frames': total_frames_num, + 'fps': original_fps, + 'duration': duration, + 'video_backend': 'torchcodec', + 'frames_indices': frame_idx, + } + return video, metadata + + @classmethod + def load_bytes(self, + data: bytes, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + **kwargs) -> tuple[npt.NDArray, dict[str, Any]]: + tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') + try: + tmp_file.write(data) + tmp_file.close() + return self.load_file(Path(tmp_file.name), + num_frames=num_frames, + fps=fps, + max_duration=max_duration, + **kwargs) + finally: + # always cleanup, even if load_file crashes + try: + os.unlink(tmp_file.name) + except OSError: + pass # file might not exist if write failed + + +class TorchVisionVideoLoader(VideoLoader): + + @classmethod + def load_file(self, + filepath: Path, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + **kwargs) -> tuple[npt.NDArray, dict[str, Any]]: + import torchvision + + video, audio, info = torchvision.io.read_video( + filepath, + pts_unit='sec', + output_format='THWC', + ) + total_frames_num = video.size(0) + original_fps = info['video_fps'] + duration = total_frames_num / original_fps if original_fps > 0 else 0 + + num_frames_to_sample, frame_idx = self.smart_nframes(total_frames_num, num_frames, fps, duration) + + video = video[frame_idx] + metadata = { + 
'total_num_frames': total_frames_num, + 'fps': original_fps, + 'duration': duration, + 'video_backend': 'torchvision', + 'frames_indices': frame_idx, + } + return video, metadata + + @classmethod + def load_bytes(self, + data: bytes, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + **kwargs) -> tuple[npt.NDArray, dict[str, Any]]: + tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') + try: + tmp_file.write(data) + tmp_file.close() + return self.load_file(Path(tmp_file.name), + num_frames=num_frames, + fps=fps, + max_duration=max_duration, + **kwargs) + finally: + # always cleanup, even if load_file crashes + try: + os.unlink(tmp_file.name) + except OSError: + pass # file might not exist if write failed diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py index 521cbc7985..f282e44e67 100644 --- a/lmdeploy/vl/model/base.py +++ b/lmdeploy/vl/model/base.py @@ -165,45 +165,34 @@ def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, chat_ raise NotImplementedError() @staticmethod - def collect_images(messages): - """Gather all images along with their respective parameters from the - messages and compile them into a single list. Each image is converted - to RGB color space. + def collect_multimodal_items(messages): + """Gather all multimodal items along with their respective parameters + from the messages and compile them into a single list. 
Args: - messages (List[Tuple[Image, Dict]]): a list of images with their - corresponding parameters - """ # noqa - images = [] + messages (List[Dict]): a list of message + Returns: + List[Tuple[Modality, Any, Dict]]: a list of (modality, data, params) for each multimodal item + """ + multimodal_items = [] for message in messages: content = message['content'] - if not isinstance(content, List): + if not isinstance(content, list): continue - images.extend([(x['image'], { - k: v - for k, v in x.items() if k not in {'type', 'image'} - }) for x in content if x['type'] == 'image']) - return images - @staticmethod - def collect_time_series(messages): - """Gather all time series data along with their respective parameters - from the messages and compile them into a single list. + for x in content: + if not isinstance(x, dict): + continue - Args: - messages (List[Tuple[np.ndarray, Dict]]): a list of time - series data with their corresponding parameters - """ # noqa - time_series = [] - for message in messages: - content = message['content'] - if not isinstance(content, List): - continue - time_series.extend([(x['time_series'], { - k: v - for k, v in x.items() if k not in {'type', 'time_series'} - }) for x in content if x['type'] == 'time_series']) - return time_series + modality = x.get('type') + if modality is None or modality == 'text': + continue + + data = x.get('data') + params = {k: v for k, v in x.items() if k not in ['type', 'data']} + multimodal_items.append((modality, data, params)) + + return multimodal_items @staticmethod def IMAGE_TOKEN_included(messages): diff --git a/lmdeploy/vl/model/cogvlm.py b/lmdeploy/vl/model/cogvlm.py index ca5e41a96f..8b7da1b57e 100644 --- a/lmdeploy/vl/model/cogvlm.py +++ b/lmdeploy/vl/model/cogvlm.py @@ -41,9 +41,9 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refer to the spec of `super().preprocess`""" - images = self.collect_images(messages) + images = 
self.collect_multimodal_items(messages) outputs = [] - for image, _ in images: + for modality, image, _ in images: image = image.convert('RGB') pixel_values = self.image_transform(image) outputs.append( diff --git a/lmdeploy/vl/model/deepseek.py b/lmdeploy/vl/model/deepseek.py index 0a1f6c12e9..91c2d2ef45 100644 --- a/lmdeploy/vl/model/deepseek.py +++ b/lmdeploy/vl/model/deepseek.py @@ -88,9 +88,9 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refers to the spec of `super.preprocess()""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, _ in images: + for modality, image, _ in images: image = image.convert('RGB') pixel_values = self.image_processor([image], return_tensors='pt').pixel_values outputs.append( diff --git a/lmdeploy/vl/model/deepseek_vl2.py b/lmdeploy/vl/model/deepseek_vl2.py index a2e4b034ca..823e162d07 100644 --- a/lmdeploy/vl/model/deepseek_vl2.py +++ b/lmdeploy/vl/model/deepseek_vl2.py @@ -69,7 +69,7 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refers to the spec of `super.preprocess()""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) # convert to upstream api formats images = [img_parameter[0] for img_parameter in images] diff --git a/lmdeploy/vl/model/gemma3_vl.py b/lmdeploy/vl/model/gemma3_vl.py index c2879a6b83..4d246c752a 100644 --- a/lmdeploy/vl/model/gemma3_vl.py +++ b/lmdeploy/vl/model/gemma3_vl.py @@ -72,8 +72,8 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: 'add_special_tokens': False }, ) - images = self.collect_images(messages) - images = [image.convert('RGB') for image, _ in images] + images = self.collect_multimodal_items(messages) + images = [image.convert('RGB') for modality, image, _ in images] num_image = len(images) images = make_nested_list_of_images(images) image_inputs = self.processor.image_processor(images, 
**output_kwargs['images_kwargs']) diff --git a/lmdeploy/vl/model/glm4_1v.py b/lmdeploy/vl/model/glm4_1v.py index 3b4b2ab937..a5bb0ff81f 100644 --- a/lmdeploy/vl/model/glm4_1v.py +++ b/lmdeploy/vl/model/glm4_1v.py @@ -35,10 +35,10 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refer to `super().preprocess()` for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) optional_keys = {'resized_height', 'resized_width', 'min_pixels', 'max_pixels'} outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') item = dict(type='image', image=image) diff --git a/lmdeploy/vl/model/interns1_pro.py b/lmdeploy/vl/model/interns1_pro.py index e11efbb32a..ce9f6a481f 100644 --- a/lmdeploy/vl/model/interns1_pro.py +++ b/lmdeploy/vl/model/interns1_pro.py @@ -6,6 +6,7 @@ from transformers import AutoProcessor from lmdeploy.utils import get_logger +from lmdeploy.vl.constants import Modality from lmdeploy.vl.model.base import VISION_MODELS, VisionModel logger = get_logger('lmdeploy') @@ -31,15 +32,23 @@ class InternS1ProVisionModel(VisionModel): def build_preprocessor(self): check_transformers() self.processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=True) - tokenizer = self.processor.tokenizer + + # image tokens self.image_token = self.processor.image_token - self.image_token_id = tokenizer.encode(self.image_token)[-1] - self.mm_processor_kwargs = None + self.image_token_id = self.processor.image_token_id + + # video tokens + self.video_token = self.processor.video_token + self.video_token_id = self.processor.video_token_id - # Time Series tokens + # time series tokens self.ts_token = getattr(self.processor, 'ts_token', None) self.ts_token_id = getattr(self.processor, 'ts_token_id', None) + # vision start and end tokens + self.vision_start_token = self.processor.vision_start_token + self.vision_end_token = 
self.processor.vision_end_token + def get_processor_args(self, mm_processor_kwargs: Optional[Dict[str, Any]] = None): min_pixels = self.processor.image_processor.size['shortest_edge'] max_pixels = self.processor.image_processor.size['longest_edge'] @@ -82,7 +91,64 @@ def check_time_series_input(self, messages): for message in messages) self.has_time_series_input = has_time_series_input - def time_series_processor(self, ts_input, sr): + def _preprocess_image(self, + data: List[Any], + params: Dict[str, Any], + mm_processor_kwargs: Dict[str, Any] | None = None) -> List[Dict]: + + image = data.convert('RGB') + min_pixels, max_pixels = self.get_processor_args(mm_processor_kwargs) + + result = self.processor.image_processor(images=image, + size={ + 'shortest_edge': min_pixels, + 'longest_edge': max_pixels + }, + return_tensors='pt') + merge_length = self.processor.image_processor.merge_size**2 + image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length + result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id)) + return result + + def _preprocess_video(self, + data: List[Any], + params: Dict[str, Any], + mm_processor_kwargs: Dict[str, Any] | None = None) -> List[Dict]: + + # TODO: zhouxinyu, apply transformers smart_resize using per-request kwargs + metadata = params['video_metadata'] + video_kwargs = dict(return_metadata=True, + do_resize=True, + do_sample_frames=False, + video_metadata=metadata, + return_tensors='pt') + result = self.processor.video_processor(videos=data, **video_kwargs) + video_grid_thw = result['video_grid_thw'] + + merge_length = self.processor.video_processor.merge_size**2 + if metadata.get('fps') is None: + logger.warning_once('Qwen3VL: fps not found, defaulting to 24.') + metadata['fps'] = metadata['fps'] or 24 + + # if timestamps are not provided, calculate them + curr_timestamp = self.processor._calculate_timestamps( + metadata['frames_indices'], + metadata['fps'], + 
self.processor.video_processor.merge_size, + ) + + frame_seqlen = video_grid_thw[0][1:].prod() // merge_length + result.update(curr_timestamp=curr_timestamp, frame_seqlen=frame_seqlen, video_token_id=self.video_token_id) + return result + + def _preprocess_time_series(self, + data: List[Any], + params: Dict[str, Any], + mm_processor_kwargs: Dict[str, Any] | None = None) -> List[Dict]: + + ts_input = data + sr = params.get('sampling_rate') if params is not None else None + if not isinstance(ts_input, np.ndarray): ts_input = np.array(ts_input, dtype=np.float32) @@ -108,43 +174,34 @@ def time_series_processor(self, ts_input, sr): stride = np.floor(160 / ((1 + np.exp(-sr / 100))**6)) patch_size = stride * 2 embed_length = (np.ceil((ts_len - patch_size) / stride) + 1) - num_ts_tokens = int((embed_length // 2 + 1) // 2) + ts_tokens = int((embed_length // 2 + 1) // 2) - return dict(ts_values=[ts_input], ts_sr=[sr], ts_lens=[ts_len], num_ts_tokens=[num_ts_tokens]) + return dict(ts_values=[ts_input], + ts_sr=[sr], + ts_lens=[ts_len], + ts_tokens=[ts_tokens], + ts_token_id=self.ts_token_id) - def preprocess(self, messages: List[Dict], mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict]: + def preprocess(self, messages: List[Dict], mm_processor_kwargs: Dict[str, Any] | None = None) -> List[Dict]: """Refer to `super().preprocess()` for spec.""" - - self.check_time_series_input(messages) - - if self.has_time_series_input: - time_series = self.collect_time_series(messages) - outputs = [] - for ts_input, params in time_series: - sr = params.get('sampling_rate') if params is not None else None - time_series_inputs = self.time_series_processor(ts_input, sr) - time_series_inputs.update({'ts_token_id': self.ts_token_id}) - outputs.append(time_series_inputs) - else: - min_pixels, max_pixels = self.get_processor_args(mm_processor_kwargs) - - images = self.collect_images(messages) - outputs = [] - for image, params in images: - image = image.convert('RGB') - - result = 
self.processor.image_processor(images=image, - videos=None, - size={ - 'shortest_edge': min_pixels, - 'longest_edge': max_pixels - }, - return_tensors='pt') - merge_length = self.processor.image_processor.merge_size**2 - image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length - result.update(dict(image_size=image.size, image_tokens=image_tokens, - image_token_id=self.image_token_id)) - outputs.append(result) + outputs = [] + self.contains_video_input = False + self.contains_ts_input = False + + mm_items = self.collect_multimodal_items(messages) + for modality, data, params in mm_items: + result = {} + if modality == Modality.IMAGE: + result = self._preprocess_image(data, params, mm_processor_kwargs) + elif modality == Modality.VIDEO: + self.contains_video_input = True + result = self._preprocess_video(data, params, mm_processor_kwargs) + elif modality == Modality.TIME_SERIES: + self.contains_ts_input = True + result = self._preprocess_time_series(data, params, mm_processor_kwargs) + + result.update(modality=modality) + outputs.append(result) messages.append(dict(role='preprocess', content=outputs)) return messages @@ -161,11 +218,6 @@ def proc_messages(self, IMAGE_TOKEN = '' messages = [x for x in messages if x['role'] not in ['preprocess', 'forward']] - if self.has_time_series_input: - prompt_messages = messages - prompt = chat_template.messages2prompt(prompt_messages, sequence_start, tools=tools, **chat_template_kwargs) - return prompt, self.ts_token - if VisionModel.IMAGE_TOKEN_included(messages): # backward compatibility for message in messages: @@ -179,21 +231,61 @@ def proc_messages(self, prompt_messages.append(dict(role='user', content=prompt)) else: prompt_messages = messages + + # time series input requires enabling_thinking = False + if self.contains_ts_input: + chat_template_kwargs['enable_thinking'] = False + prompt = chat_template.messages2prompt(prompt_messages, sequence_start, tools=tools, **chat_template_kwargs) - return prompt, 
self.image_token - - def ts_to_pytorch_aux(self, messages, prompt, TS_TOKEN, tokenizer, sequence_start): - """Auxiliary function to pack the preprocessing results in a format - compatible with what is required by pytorch engine. - - Args: - messages(List[Dict]): the output of `preprocess` - prompt(str): the prompt after applying chat template - TS_TOKEN(str): a placeholder where time series tokens will be - inserted - tokenizer: the tokenizer model - sequence_start: starting flag of a sequence - """ + return prompt, None + + def to_pytorch_aux_video(self, messages, prompt, VIDEO_TOKEN, tokenizer, sequence_start): + """Pack the video input to the compatible format with pytorch + engine.""" + + # collect all preprocessing result from messages + preps = [x['content'] for x in messages if x['role'] == 'preprocess'] + assert len(preps) == 1 + preps = preps[0] + + # split prompt into segments and validate data + segs = prompt.split(self.vision_start_token + self.video_token + self.vision_end_token) + assert len(segs) == len(preps) + 1, (f'the number of {self.video_token} is not equal ' + f'to input videos, {len(segs) - 1} vs {len(preps)}') + + # calculate the video token offset for each video + input_ids = [] + for i, seg in enumerate(segs): + if i > 0 and i <= len(preps): + preps[i - 1].update(offset=len(input_ids)) + frame_seqlen = preps[i - 1]['frame_seqlen'] + assert self.video_token_id == preps[i - 1]['video_token_id'] + + video_grid_thw = preps[i - 1]['video_grid_thw'] + curr_timestamp = preps[i - 1]['curr_timestamp'] + + # update prompt with timestamp index tokens and video pad tokens + video_placeholder = '' + for frame_idx in range(video_grid_thw[0][0]): + curr_time = curr_timestamp[frame_idx] + video_placeholder += f'<{curr_time:.1f} seconds>' + video_placeholder += (self.vision_start_token + '<|placeholder|>' * frame_seqlen + + self.vision_end_token) + + video_placeholder = video_placeholder.replace('<|placeholder|>', self.video_token) + video_token_ids = 
tokenizer.encode(video_placeholder) + input_ids.extend(video_token_ids) + + preps[i - 1].update(video_tokens=len(video_token_ids)) + + token_ids = tokenizer.encode(seg, add_bos=((i == 0) and sequence_start)) + input_ids.extend(token_ids) + + return dict(prompt=prompt, input_ids=input_ids, multimodal=preps) + + def to_pytorch_aux_ts(self, messages, prompt, TS_TOKEN, tokenizer, sequence_start): + """Pack the time series input to the compatible format with pytorch + engine.""" # collect all preprocessing result from messages preps = [x['content'] for x in messages if x['role'] == 'preprocess'] assert len(preps) == 1 @@ -208,13 +300,12 @@ def ts_to_pytorch_aux(self, messages, prompt, TS_TOKEN, tokenizer, sequence_star for i, seg in enumerate(segs): if i > 0 and i <= len(preps): preps[i - 1].update(offset=len(input_ids)) - ts_tokens = preps[i - 1]['num_ts_tokens'] + ts_tokens = preps[i - 1]['ts_tokens'] - # NOTE: zhouxinyu, better to be valid type in the processor ts_tokens = ts_tokens[0] ts_array = np.array(preps[i - 1]['ts_values']) - preps[i - 1].update(num_ts_tokens=ts_tokens) + preps[i - 1].update(ts_tokens=ts_tokens) preps[i - 1].update(ts_values=torch.from_numpy(ts_array).to(dtype=torch.bfloat16)) preps[i - 1].update(ts_lens=torch.tensor(preps[i - 1]['ts_lens'])) preps[i - 1].update(ts_sr=torch.tensor(preps[i - 1]['ts_sr'])) @@ -235,22 +326,18 @@ def to_pytorch(self, chat_template_kwargs: Optional[Dict] = None, **kwargs): """Return to the information needed by pytorch engine.""" - if self.has_time_series_input: - # time series input requires enabling_thinking = False - chat_template_kwargs = {'enable_thinking': False} - prompt, TS_TOKEN = self.proc_messages(messages, - chat_template, - sequence_start, - tools=tools, - chat_template_kwargs=chat_template_kwargs) - return self.ts_to_pytorch_aux(messages, prompt, TS_TOKEN, tokenizer, sequence_start) + prompt, _ = self.proc_messages(messages, + chat_template, + sequence_start, + tools=tools, + 
chat_template_kwargs=chat_template_kwargs) + + if self.contains_video_input: + return self.to_pytorch_aux_video(messages, prompt, self.video_token, tokenizer, sequence_start) + elif self.contains_ts_input: + return self.to_pytorch_aux_ts(messages, prompt, self.ts_token, tokenizer, sequence_start) else: - prompt, IMAGE_TOKEN = self.proc_messages(messages, - chat_template, - sequence_start, - tools=tools, - chat_template_kwargs=chat_template_kwargs) - return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start) + return self.to_pytorch_aux(messages, prompt, self.image_token, tokenizer, sequence_start) def build_model(self): # TODO: implement for turbomind diff --git a/lmdeploy/vl/model/internvl.py b/lmdeploy/vl/model/internvl.py index 9036866818..7f49dd9799 100644 --- a/lmdeploy/vl/model/internvl.py +++ b/lmdeploy/vl/model/internvl.py @@ -192,9 +192,9 @@ def _forward(self, inputs, max_batch_size): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refers to `super.preprocess() for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') pixel_values = self.processor(image, params) image_tokens = (pixel_values.shape[0] * self.image_tokens_per_patch) diff --git a/lmdeploy/vl/model/internvl3_hf.py b/lmdeploy/vl/model/internvl3_hf.py index 85eb40bbdc..4601b01930 100644 --- a/lmdeploy/vl/model/internvl3_hf.py +++ b/lmdeploy/vl/model/internvl3_hf.py @@ -94,8 +94,8 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: 'add_special_tokens': False }, ) - images = self.collect_images(messages) - images = [image.convert('RGB') for image, _ in images] + images = self.collect_multimodal_items(messages) + images = [image.convert('RGB') for modality, image, _ in images] num_image = len(images) images = make_flat_list_of_images(images) image_inputs = 
self.processor.image_processor(images, **output_kwargs['images_kwargs']) diff --git a/lmdeploy/vl/model/llama4.py b/lmdeploy/vl/model/llama4.py index e0752d7b99..1661d9fc63 100644 --- a/lmdeploy/vl/model/llama4.py +++ b/lmdeploy/vl/model/llama4.py @@ -60,13 +60,13 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refers to `super.preprocess() for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] processor = self.processor patch_size = processor.patch_size downsample_ratio = processor.downsample_ratio images_kwargs = self.images_kwargs - for image, params in images: + for modality, image, params in images: image_inputs = processor.image_processor(images=[image], **images_kwargs) pixel_values = image_inputs['pixel_values'] image_height, image_width = image_inputs['pixel_values'][0].shape[-2:] diff --git a/lmdeploy/vl/model/llava.py b/lmdeploy/vl/model/llava.py index 91da486643..06ef05d6c8 100644 --- a/lmdeploy/vl/model/llava.py +++ b/lmdeploy/vl/model/llava.py @@ -297,9 +297,9 @@ def encode_images(self, images: torch.Tensor) -> torch.Tensor: def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refer to `super().preprocess() for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') pixel_values = process_images([image], self.image_processor, self.config) outputs.append( diff --git a/lmdeploy/vl/model/llava_hf.py b/lmdeploy/vl/model/llava_hf.py index c66f68be68..cf470d09ba 100644 --- a/lmdeploy/vl/model/llava_hf.py +++ b/lmdeploy/vl/model/llava_hf.py @@ -57,9 +57,9 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refers to `super.preprocess() for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for 
image, params in images: + for modality, image, params in images: image = image.convert('RGB') pixel_values = self.processor(image, return_tensors='pt', input_data_format='channels_last').pixel_values outputs.append( diff --git a/lmdeploy/vl/model/llava_next.py b/lmdeploy/vl/model/llava_next.py index b705f237b8..08ffbe44c4 100644 --- a/lmdeploy/vl/model/llava_next.py +++ b/lmdeploy/vl/model/llava_next.py @@ -66,9 +66,9 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refers to the spec of `super.preprocess()""" from transformers.models.llava_next.modeling_llava_next import image_size_to_num_patches - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') result = self.processor(image, return_tensors='pt', input_data_format='channels_last') # ! infer image_num_patches from image_sizes diff --git a/lmdeploy/vl/model/mllama.py b/lmdeploy/vl/model/mllama.py index ab0949fe03..e9d20684de 100644 --- a/lmdeploy/vl/model/mllama.py +++ b/lmdeploy/vl/model/mllama.py @@ -26,9 +26,9 @@ def build_preprocessor(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refer to the spec of `super().preprocess`""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') results = self.processor.image_processor(images=image, return_tensors='pt') results.update(image_size=image.size, image_tokens=1, image_token_id=self.image_token_id) diff --git a/lmdeploy/vl/model/phi3_vision.py b/lmdeploy/vl/model/phi3_vision.py index 683220c29c..8167ae883d 100644 --- a/lmdeploy/vl/model/phi3_vision.py +++ b/lmdeploy/vl/model/phi3_vision.py @@ -31,9 +31,9 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refers to 
`super.preprocess() for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, params in images: + for modality, image, params in images: result = self.processor.image_processor([image], return_tensors='pt') image_tokens = result['num_img_tokens'] result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id)) diff --git a/lmdeploy/vl/model/qwen.py b/lmdeploy/vl/model/qwen.py index db54fb5f57..d959c0ffbf 100644 --- a/lmdeploy/vl/model/qwen.py +++ b/lmdeploy/vl/model/qwen.py @@ -71,9 +71,9 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refers to `super.preprocess() for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') pixel_values = self.image_transform(image) outputs.append( diff --git a/lmdeploy/vl/model/qwen2.py b/lmdeploy/vl/model/qwen2.py index 52ade4980c..a802242d6c 100644 --- a/lmdeploy/vl/model/qwen2.py +++ b/lmdeploy/vl/model/qwen2.py @@ -37,10 +37,10 @@ def preprocess(self, messages: list[dict]) -> list[dict]: """Refer to `super().preprocess()` for spec.""" from qwen_vl_utils import process_vision_info - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) optional_keys = {'resized_height', 'resized_width', 'min_pixels', 'max_pixels'} outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') item = dict(type='image', image=image) diff --git a/lmdeploy/vl/model/qwen3.py b/lmdeploy/vl/model/qwen3.py index 265bf9f729..404f84d781 100644 --- a/lmdeploy/vl/model/qwen3.py +++ b/lmdeploy/vl/model/qwen3.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Any, Dict, List, Optional +from typing import Any, Dict, List import torch from transformers import AutoProcessor from lmdeploy.utils import get_logger +from lmdeploy.vl.constants import Modality from lmdeploy.vl.model.base import VISION_MODELS, VisionModel logger = get_logger('lmdeploy') @@ -27,12 +28,20 @@ class Qwen3VLModel(VisionModel): def build_preprocessor(self): check_transformers() self.processor = AutoProcessor.from_pretrained(self.model_path) - tokenizer = self.processor.tokenizer + + # image tokens self.image_token = self.processor.image_token - self.image_token_id = tokenizer.encode(self.image_token)[-1] - self.mm_processor_kwargs = None + self.image_token_id = self.processor.image_token_id + + # video tokens + self.video_token = self.processor.video_token + self.video_token_id = self.processor.video_token_id + + # vision start and end tokens + self.vision_start_token = self.processor.vision_start_token + self.vision_end_token = self.processor.vision_end_token - def get_processor_args(self, mm_processor_kwargs: Optional[Dict[str, Any]] = None): + def get_processor_args(self, mm_processor_kwargs: Dict[str, Any] | None = None): min_pixels = self.processor.image_processor.size['shortest_edge'] max_pixels = self.processor.image_processor.size['longest_edge'] @@ -68,26 +77,73 @@ def get_processor_args(self, mm_processor_kwargs: Optional[Dict[str, Any]] = Non return min_pixels, max_pixels - def preprocess(self, messages: List[Dict], mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict]: - """Refer to `super().preprocess()` for spec.""" + def _preprocess_image(self, + data: List[Any], + params: Dict[str, Any], + mm_processor_kwargs: Dict[str, Any] | None = None) -> List[Dict]: + image = data.convert('RGB') min_pixels, max_pixels = self.get_processor_args(mm_processor_kwargs) - images = self.collect_images(messages) + result = self.processor.image_processor(images=image, + size={ + 'shortest_edge': min_pixels, + 
'longest_edge': max_pixels + }, + return_tensors='pt') + merge_length = self.processor.image_processor.merge_size**2 + image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length + result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id)) + return result + + def _preprocess_video(self, + data: List[Any], + params: Dict[str, Any], + mm_processor_kwargs: Dict[str, Any] | None = None) -> List[Dict]: + + # TODO: zhouxinyu, apply transformers smart_resize using per-request kwargs + metadata = params['video_metadata'] + video_kwargs = dict(return_metadata=True, + do_resize=True, + do_sample_frames=False, + video_metadata=metadata, + return_tensors='pt') + result = self.processor.video_processor(videos=data, **video_kwargs) + video_grid_thw = result['video_grid_thw'] + + merge_length = self.processor.video_processor.merge_size**2 + if metadata.get('fps') is None: + logger.warning_once('Qwen3VL: fps not found, defaulting to 24.') + metadata['fps'] = metadata['fps'] or 24 + + # if timestamps are not provided, calculate them + curr_timestamp = self.processor._calculate_timestamps( + metadata['frames_indices'], + metadata['fps'], + self.processor.video_processor.merge_size, + ) + + frame_seqlen = video_grid_thw[0][1:].prod() // merge_length + result.update(curr_timestamp=curr_timestamp, frame_seqlen=frame_seqlen, video_token_id=self.video_token_id) + return result + + def preprocess(self, messages: List[Dict], mm_processor_kwargs: Dict[str, Any] | None = None) -> List[Dict]: + """Refer to `super().preprocess()` for spec.""" outputs = [] - for image, params in images: - image = image.convert('RGB') - - result = self.processor.image_processor(images=image, - size={ - 'shortest_edge': min_pixels, - 'longest_edge': max_pixels - }, - return_tensors='pt') - merge_length = self.processor.image_processor.merge_size**2 - image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length - 
result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id)) + self.contains_video_input = False + + mm_items = self.collect_multimodal_items(messages) + for modality, data, params in mm_items: + result = {} + if modality == Modality.IMAGE: + result = self._preprocess_image(data, params, mm_processor_kwargs) + elif modality == Modality.VIDEO: + self.contains_video_input = True + result = self._preprocess_video(data, params, mm_processor_kwargs) + + result.update(modality=modality) outputs.append(result) + messages.append(dict(role='preprocess', content=outputs)) return messages @@ -111,18 +167,66 @@ def proc_messages(self, messages, chat_template, sequence_start, chat_template_k else: prompt_messages = messages prompt = chat_template.messages2prompt(prompt_messages, sequence_start, **chat_template_kwargs) - return prompt, self.image_token + return prompt, None + + def to_pytorch_aux_video(self, messages, prompt, VIDEO_TOKEN, tokenizer, sequence_start): + """Pack the video input to the compatible format with pytorch + engine.""" + + # collect all preprocessing result from messages + preps = [x['content'] for x in messages if x['role'] == 'preprocess'] + assert len(preps) == 1 + preps = preps[0] + + # split prompt into segments and validate data + segs = prompt.split(self.vision_start_token + self.video_token + self.vision_end_token) + assert len(segs) == len(preps) + 1, (f'the number of {self.video_token} is not equal ' + f'to input videos, {len(segs) - 1} vs {len(preps)}') + + # calculate the video token offset for each video + input_ids = [] + for i, seg in enumerate(segs): + if i > 0 and i <= len(preps): + preps[i - 1].update(offset=len(input_ids)) + frame_seqlen = preps[i - 1]['frame_seqlen'] + assert self.video_token_id == preps[i - 1]['video_token_id'] + + video_grid_thw = preps[i - 1]['video_grid_thw'] + curr_timestamp = preps[i - 1]['curr_timestamp'] + + # update prompt with timestamp index tokens and video pad 
tokens + video_placeholder = '' + for frame_idx in range(video_grid_thw[0][0]): + curr_time = curr_timestamp[frame_idx] + video_placeholder += f'<{curr_time:.1f} seconds>' + video_placeholder += (self.vision_start_token + '<|placeholder|>' * frame_seqlen + + self.vision_end_token) + + video_placeholder = video_placeholder.replace('<|placeholder|>', self.video_token) + video_token_ids = tokenizer.encode(video_placeholder) + input_ids.extend(video_token_ids) + + preps[i - 1].update(video_tokens=len(video_token_ids)) + + token_ids = tokenizer.encode(seg, add_bos=((i == 0) and sequence_start)) + input_ids.extend(token_ids) + + return dict(prompt=prompt, input_ids=input_ids, multimodal=preps) def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, - chat_template_kwargs: Optional[Dict] = None, + chat_template_kwargs: Dict | None = None, **kwargs): """Return to the information needed by pytorch engine.""" - prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start, chat_template_kwargs) - return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start) + prompt, _ = self.proc_messages(messages, chat_template, sequence_start, chat_template_kwargs) + + if self.contains_video_input: + return self.to_pytorch_aux_video(messages, prompt, self.video_token, tokenizer, sequence_start) + else: + return self.to_pytorch_aux(messages, prompt, self.image_token, tokenizer, sequence_start) def build_model(self): # TODO: implement for turbomind @@ -138,7 +242,7 @@ def to_turbomind(self, chat_template, tokenizer, sequence_start, - chat_template_kwargs: Optional[Dict] = None, + chat_template_kwargs: Dict | None = None, **kwargs): # TODO: implement for turbomind pass diff --git a/lmdeploy/vl/model/qwen3_5.py b/lmdeploy/vl/model/qwen3_5.py index c75b87bed0..f030de5e5e 100644 --- a/lmdeploy/vl/model/qwen3_5.py +++ b/lmdeploy/vl/model/qwen3_5.py @@ -25,8 +25,17 @@ class Qwen3_5Model(Qwen3VLModel): def build_preprocessor(self): 
check_transformers() + self.processor = AutoProcessor.from_pretrained(self.model_path) - tokenizer = self.processor.tokenizer + + # image tokens self.image_token = self.processor.image_token - self.image_token_id = tokenizer.encode(self.image_token)[-1] - self.mm_processor_kwargs = None + self.image_token_id = self.processor.image_token_id + + # video tokens + self.video_token = self.processor.video_token + self.video_token_id = self.processor.video_token_id + + # vision start and end tokens + self.vision_start_token = self.processor.vision_start_token + self.vision_end_token = self.processor.vision_end_token diff --git a/lmdeploy/vl/model/xcomposer2.py b/lmdeploy/vl/model/xcomposer2.py index cff3b808f5..5cc4eac5f4 100644 --- a/lmdeploy/vl/model/xcomposer2.py +++ b/lmdeploy/vl/model/xcomposer2.py @@ -206,9 +206,9 @@ def _preprocess_4khd_7b(self, image: Image, params: Dict) -> Dict: def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refer to `super().preprocess() for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') pixel_values, n_token = self.preprocess_func(image, params) outputs.append( diff --git a/lmdeploy/vl/model/yi.py b/lmdeploy/vl/model/yi.py index 02dd1c83e5..317f8afac4 100644 --- a/lmdeploy/vl/model/yi.py +++ b/lmdeploy/vl/model/yi.py @@ -119,9 +119,9 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refer to `super().preprocess() for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') pixel_values = process_images([image], self.image_processor, self.config) outputs.append( diff --git a/lmdeploy/vl/time_series_utils.py b/lmdeploy/vl/time_series_utils.py deleted file mode 100644 index 
5651f65697..0000000000 --- a/lmdeploy/vl/time_series_utils.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import csv -import os -from io import BytesIO, StringIO - -import numpy as np -import pybase64 -import requests - -from lmdeploy.utils import get_logger - -logger = get_logger('lmdeploy') - -FETCH_TIMEOUT = int(os.environ.get('LMDEPLOY_FETCH_TIMEOUT', 10)) -HEADERS = { - 'User-Agent': - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' - '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' -} - - -def encode_time_series_base64(data: str | np.ndarray) -> str: - """Encode time series data to base64. - - Supports: HTTP URL, local path, or numpy array. - """ - buffered = BytesIO() - - try: - if isinstance(data, str): - if data.startswith('http'): - response = requests.get(data, headers=HEADERS, timeout=FETCH_TIMEOUT) - response.raise_for_status() - ts_array = _load_bytes(response.content, data) - elif data.startswith('file://'): - path = data.removeprefix('file://') - ts_array = _load_path(path) - elif os.path.exists(data): - ts_array = _load_path(data) - else: - raise ValueError(f'Path does not exist: {data}') - elif isinstance(data, np.ndarray): - ts_array = data - else: - raise TypeError(f'Expected str or np.ndarray, got {type(data)}') - - np.save(buffered, ts_array) - - except Exception as error: - data_info = str(data)[:100] + ' ...' 
if isinstance(data, str) and len(data) > 100 else str(data) - logger.error(f'{error}, data={data_info}') - np.save(buffered, np.zeros((6000, 3), dtype=np.float32)) # dummy - - return pybase64.b64encode(buffered.getvalue()).decode('utf-8') - - -def load_time_series_from_base64(ts_base64: bytes | str) -> np.ndarray: - """Load time series from base64 format.""" - if isinstance(ts_base64, str): - ts_base64 = ts_base64.encode('utf-8') - return np.load(BytesIO(pybase64.b64decode(ts_base64)), allow_pickle=False) - - -def load_time_series(data_source: str | np.ndarray) -> np.ndarray: - """Load time series from URL, local path, base64 data URL, or numpy - array.""" - try: - if isinstance(data_source, np.ndarray): - return data_source - - if data_source.startswith('http'): - response = requests.get(data_source, headers=HEADERS, timeout=FETCH_TIMEOUT) - response.raise_for_status() - return _load_bytes(response.content, data_source) - - if data_source.startswith('data:time_series'): - return load_time_series_from_base64(data_source.split(',')[1]) - - if data_source.startswith('file://'): - path = data_source.removeprefix('file://') - return _load_path(path) - - if os.path.exists(data_source): - return _load_path(data_source) - - raise ValueError(f'Invalid data source: {data_source}') - except Exception as error: - data_info = str(data_source)[:100] + ' ...' if isinstance(data_source, - str) and len(data_source) > 100 else str(data_source) - logger.error(f'{error}, data_source={data_info}') - return np.zeros((6000, 3), dtype=np.float32) # dummy - - -def _load_bytes(content: bytes, hint: str = '') -> np.ndarray: - """Auto-detect format from bytes. - - Try: npy -> csv -> audio. 
- """ - hint = hint.lower() - - # Format hints from URL/path - if '.npy' in hint: - return np.load(BytesIO(content)) - if '.csv' in hint: - return _load_csv(content) - if any(ext in hint for ext in ['.wav', '.mp3', '.flac']): - return _load_audio(content) - - # Fallback: try all formats - loaders = [lambda: np.load(BytesIO(content)), lambda: _load_csv(content), lambda: _load_audio(content)] - for loader in loaders: - try: - return loader() - except Exception: - continue - raise ValueError(f'Cannot detect format from bytes: {hint[:50]}') - - -def _load_path(path: str) -> np.ndarray: - """Load from local file path based on extension.""" - ext = os.path.splitext(path)[-1].lower() - - if ext == '.npy': - return np.load(path) - if ext == '.csv': - return _load_csv(path) - if ext in ['.wav', '.mp3', '.flac']: - return _load_audio(path) - - raise ValueError(f'Unsupported format: {ext}') - - -def _load_csv(source: bytes | str) -> np.ndarray: - """Load CSV from bytes or file path.""" - # Read content as text - if isinstance(source, bytes): - text = source.decode('utf-8') - else: - with open(source, 'r', newline='') as f: - text = f.read() - - # Parse CSV - f = StringIO(text) - reader = csv.reader(f) - rows = list(reader) - - return np.array(rows, dtype=np.float32) - - -def _load_audio(source: bytes | str) -> np.ndarray: - """Load audio from bytes or file path.""" - try: - import soundfile as sf - except ImportError: - raise ImportError('Please install soundfile to process audio files.') - - if isinstance(source, bytes): - source = BytesIO(source) - ts, sr = sf.read(source) - return ts diff --git a/lmdeploy/vl/utils.py b/lmdeploy/vl/utils.py index a089d06ad7..2ebb998b1d 100644 --- a/lmdeploy/vl/utils.py +++ b/lmdeploy/vl/utils.py @@ -1,83 +1,54 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import os -from io import BytesIO -from typing import Union +from typing import Any, Dict, Tuple -import pybase64 -import requests -from PIL import Image, ImageFile +import numpy.typing as npt +from PIL import Image -from lmdeploy.utils import get_logger +from .media.connection import load_from_url +from .media.image import ImageMediaIO +from .media.time_series import TimeSeriesMediaIO +from .media.video import VideoMediaIO -logger = get_logger('lmdeploy') +def load_image(image_url: str, **kwargs) -> Image.Image: + """Fetch and decode an image from a URL, path, or base64 string.""" + image_io = ImageMediaIO(**kwargs) + return load_from_url(image_url, image_io) -def encode_image_base64(image: Union[str, Image.Image]) -> str: - """Encode raw data to base64 format.""" - buffered = BytesIO() - FETCH_TIMEOUT = int(os.environ.get('LMDEPLOY_FETCH_TIMEOUT', 10)) - headers = { - 'User-Agent': - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' - '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' - } - try: - if isinstance(image, str): - url_or_path = image - if url_or_path.startswith('http'): - response = requests.get(url_or_path, headers=headers, timeout=FETCH_TIMEOUT) - response.raise_for_status() - buffered.write(response.content) - elif os.path.exists(url_or_path): - with open(url_or_path, 'rb') as image_file: - buffered.write(image_file.read()) - elif isinstance(image, Image.Image): - image.save(buffered, format='PNG') - except Exception as error: - if isinstance(image, str) and len(image) > 100: - image = image[:100] + ' ...' 
-        logger.error(f'{error}, image={image}')
-        # use dummy image
-        image = Image.new('RGB', (32, 32))
-        image.save(buffered, format='PNG')
-    res = pybase64.b64encode(buffered.getvalue()).decode('utf-8')
-    return res
+def load_video(video_url: str, **kwargs) -> Tuple[npt.NDArray, Dict[str, Any]]:
+    """Fetch and decode video frames from a URL, path, or base64 string."""
+    image_io = ImageMediaIO()
+    video_io = VideoMediaIO(image_io=image_io, **kwargs)
+    return load_from_url(video_url, video_io)
-def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
-    """Load image from base64 format."""
-    return Image.open(BytesIO(pybase64.b64decode(image)))
+def load_time_series(ts_url: str, **kwargs) -> npt.NDArray:
+    """Fetch and decode time-series from a URL, path, or base64 string."""
+    ts_io = TimeSeriesMediaIO(**kwargs)
+    return load_from_url(ts_url, ts_io)
-def load_image(image_url: Union[str, Image.Image]) -> Image.Image:
-    """Load image from url, local path or openai GPT4V."""
-    FETCH_TIMEOUT = int(os.environ.get('LMDEPLOY_FETCH_TIMEOUT', 10))
-    headers = {
-        'User-Agent':
-        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
-        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
-    }
-    try:
-        ImageFile.LOAD_TRUNCATED_IMAGES = True
-        if isinstance(image_url, Image.Image):
-            img = image_url
-        elif image_url.startswith('http'):
-            response = requests.get(image_url, headers=headers, timeout=FETCH_TIMEOUT)
-            response.raise_for_status()
-            img = Image.open(BytesIO(response.content))
-        elif image_url.startswith('data:image'):
-            img = load_image_from_base64(image_url.split(',')[1])
-        else:
-            # Load image from local path
-            img = Image.open(image_url)
-        # check image valid
-        img = img.convert('RGB')
-    except Exception as error:
-        if isinstance(image_url, str) and len(image_url) > 100:
-            image_url = image_url[:100] + ' ...'
- logger.error(f'{error}, image_url={image_url}') - # use dummy image - img = Image.new('RGB', (32, 32)) +def encode_image_base64(image: str | Image.Image, format: str = 'PNG', **kwargs) -> str: + """Encode image (path or PIL image) to a base64 string.""" + if isinstance(image, str): + image = load_image(image, **kwargs) + image_io = ImageMediaIO(**kwargs) + return image_io.encode_base64(image, image_format=format) - return img + +def encode_video_base64(video: str | npt.NDArray, format: str = 'JPEG', **kwargs) -> str: + """Encode video (path or frames) to a base64 string.""" + if isinstance(video, str): + video, _ = load_video(video, **kwargs) + image_io = ImageMediaIO() + video_io = VideoMediaIO(image_io=image_io, **kwargs) + return video_io.encode_base64(video, video_format=format) + + +def encode_time_series_base64(data: str | npt.NDArray, **kwargs) -> str: + """Encode time-series (path or numpy array) to a base64 string.""" + if isinstance(data, str): + data = load_time_series(data, **kwargs) + ts_io = TimeSeriesMediaIO(**kwargs) + return ts_io.encode_base64(data) diff --git a/requirements/runtime_cuda.txt b/requirements/runtime_cuda.txt index 3f5c687a03..f8929b6601 100644 --- a/requirements/runtime_cuda.txt +++ b/requirements/runtime_cuda.txt @@ -9,6 +9,7 @@ mmengine-lite numpy openai openai_harmony +opencv-python-headless partial_json_parser peft<=0.14.0 pillow diff --git a/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py b/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py index 46bf84c887..a80acbdef8 100644 --- a/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py +++ b/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py @@ -2,8 +2,8 @@ import pytest +from lmdeploy.vl import load_image from lmdeploy.vl.model.qwen3 import Qwen3VLModel -from lmdeploy.vl.utils import load_image QWEN3VL_MODELS = [ 'Qwen/Qwen3-VL-4B-Instruct', @@ -35,7 +35,7 @@ def sample_messages(): }, { 'type': 'image', - 'image': pil_image + 'data': pil_image }, ] }] diff --git 
a/tests/test_lmdeploy/test_vl/test_vl_encode.py b/tests/test_lmdeploy/test_vl/test_vl_encode.py index eaa29e37a9..607b4822a5 100644 --- a/tests/test_lmdeploy/test_vl/test_vl_encode.py +++ b/tests/test_lmdeploy/test_vl/test_vl_encode.py @@ -1,39 +1,123 @@ -# yapf: disable -from lmdeploy.vl.utils import encode_image_base64, load_image, load_image_from_base64 +import math -# yapf: enable +import numpy as np +from lmdeploy.vl import (encode_image_base64, encode_time_series_base64, encode_video_base64, load_image, + load_time_series, load_video) -def test_encode_image_base64(): - url = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' # noqa E501 - im1 = load_image(url) - base64 = encode_image_base64(url) - im2 = load_image_from_base64(base64) - assert im1 == im2.convert('RGB') +def test_image_encode_decode(): + url = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' -def test_load_truncated_image(): + img1 = load_image(url) + # use PNG for lossless pixel-perfect comparison + b64 = encode_image_base64(url, format='PNG') + img2 = load_image(f'data:image/png;base64,{b64}') + + assert img1.size == img2.size + assert img1.mode == img2.mode + assert img1.tobytes() == img2.tobytes() + + +def test_video_encode_decode(): + # url = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4' + url = 'https://raw.githubusercontent.com/CUHKSZzxy/Online-Data/main/clip_3_removed.mp4' + + # num_frames=4 to keep test fast + vid1, meta1 = load_video(url, num_frames=4) + b64 = encode_video_base64(url, num_frames=4, format='JPEG') + vid2, meta2 = load_video(f'data:video/jpeg;base64,{b64}') + + gt_meta = { + 'total_num_frames': 498, + 'fps': 29.97002997002997, + 'duration': 16.616600000000002, + 'video_backend': 'opencv', + 'frames_indices': [0, 165, 331, 497] + } + + assert vid1.shape == vid2.shape + assert np.mean(np.abs(vid1.astype(float) - vid2.astype(float))) < 2.0 # JPEG is lossy + assert 
meta1['total_num_frames'] == gt_meta['total_num_frames'] + assert meta1['frames_indices'] == gt_meta['frames_indices'] + + +def test_time_series_encode_decode(): + # url = "https://huggingface.co/internlm/Intern-S1-Pro/raw/main/0092638_seism.npy" + url = 'https://raw.githubusercontent.com/CUHKSZzxy/Online-Data/main/0092638_seism.npy' + + ts1 = load_time_series(url) + b64 = encode_time_series_base64(url) + ts2 = load_time_series(f'data:time_series/npy;base64,{b64}') + + assert ts1.shape == ts2.shape + assert np.allclose(ts1, ts2) + + +def test_image_modes(): + import numpy as np + from PIL import Image + + grayscale_img = Image.fromarray(np.zeros((100, 100), dtype=np.uint8)).convert('L') + b64 = encode_image_base64(grayscale_img) # should convert L -> RGB internally + + img_out = load_image(f'data:image/png;base64,{b64}') + assert img_out.mode == 'RGB' + + +def test_truncated_image(): url = 'https://github.com/irexyc/lmdeploy/releases/download/v0.0.1/tr.jpeg' im = load_image(url) assert im.width == 1638 assert im.height == 2048 -def test_load_invalid_url(): - url = ('https://raw.githubusercontent.com/open-mmlab/' - 'mmdeploy/main/tests/data/tiger.jpeg') - # invalid - im1 = load_image(url[:-1]) - assert im1.width == 32 - assert im1.height == 32 - # valid - im2 = load_image(url) - assert im2.height == 182 - assert im2.width == 278 - - -def test_load_invalid_base64(): - base64 = 'data:image/jpeg;base64,xxx' - im = load_image(base64) - assert im.width == 32 - assert im.height == 32 +def test_single_frame_video(): + url = 'https://raw.githubusercontent.com/CUHKSZzxy/Online-Data/main/clip_3_removed.mp4' + vid, meta = load_video(url, num_frames=1) + assert vid.shape[0] == 1 + + b64 = encode_video_base64(vid) + assert isinstance(b64, str) + assert ',' not in b64 # should only be one JPEG block, no commas + + +def test_video_sampling_params(): + url = 'https://raw.githubusercontent.com/CUHKSZzxy/Online-Data/main/clip_3_removed.mp4' + + # 1. 
test num_frames constraint + num_frames = 5 + vid, meta = load_video(url, num_frames=num_frames) + assert vid.shape[0] == num_frames + assert len(meta['frames_indices']) == num_frames + + # 2. test fps constraint (original fps is ~29.97, duration ~16.6s) + fps = 1 + vid, meta = load_video(url, fps=fps) + expected_frames = max(1, int(math.floor(meta['duration'] * fps))) + assert vid.shape[0] == expected_frames + + # 3. test both constraints (should take the minimum) + # 10 fps x 16.6s ~= 166 frames > 10 frames, so will be limited by num_frames + num_frames = 10 + fps = 10 + vid, meta = load_video(url, num_frames=num_frames, fps=fps) + assert vid.shape[0] == num_frames + + # 1 fps x 16.6s ~= 16 frames < 100 frames, so will be limited by fps + num_frames = 100 + fps = 1 + vid, meta = load_video(url, num_frames=num_frames, fps=fps) + expected_frames = max(1, int(math.floor(meta['duration'] * fps))) + assert vid.shape[0] == expected_frames + + +def test_invalid_inputs(): + # non-existent local path + import pytest + with pytest.raises(Exception): + load_image('/non_existent/path/image.jpg') + with pytest.raises(Exception): + load_video('/non_existent/path/video.mp4') + with pytest.raises(Exception): + load_time_series('/non_existent/path/data.npy')