InternLM · CUHKSZzxy · Feb 13, 2026 · Feb 27, 2026 · Feb 27, 2026 · Feb 28, 2026
diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py
@@ -5,9 +5,8 @@
 from PIL import Image
 
 from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.vl import encode_image_base64, load_image
 from lmdeploy.vl.constants import IMAGE_TOKEN
-from lmdeploy.vl.utils import encode_image_base64
 
 gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10)
 

diff --git a/docs/en/multi_modal/internvl.md b/docs/en/multi_modal/internvl.md
@@ -116,7 +116,7 @@ import numpy as np
 from lmdeploy import pipeline, GenerationConfig
 from decord import VideoReader, cpu
 from lmdeploy.vl.constants import IMAGE_TOKEN
-from lmdeploy.vl.utils import encode_image_base64
+from lmdeploy.vl import encode_image_base64
 from PIL import Image
 pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO')
 

diff --git a/docs/en/multi_modal/minicpmv.md b/docs/en/multi_modal/minicpmv.md
@@ -97,7 +97,7 @@ print(out.text)
 
 ```python
 from lmdeploy import pipeline, GenerationConfig
-from lmdeploy.vl.utils import encode_image_base64
+from lmdeploy.vl import encode_image_base64
 import torch
 from PIL import Image
 from transformers import AutoModel, AutoTokenizer

diff --git a/docs/en/multi_modal/qwen2_5_vl.md b/docs/en/multi_modal/qwen2_5_vl.md
@@ -99,7 +99,7 @@ import numpy as np
 from lmdeploy import pipeline, GenerationConfig
 from decord import VideoReader, cpu
 from lmdeploy.vl.constants import IMAGE_TOKEN
-from lmdeploy.vl.utils import encode_image_base64
+from lmdeploy.vl import encode_image_base64
 from PIL import Image
 pipe = pipeline('Qwen/Qwen2.5-VL-7B-Instruct', log_level='INFO')
 

diff --git a/docs/zh_cn/multi_modal/internvl.md b/docs/zh_cn/multi_modal/internvl.md
@@ -116,7 +116,7 @@ import numpy as np
 from lmdeploy import pipeline, GenerationConfig
 from decord import VideoReader, cpu
 from lmdeploy.vl.constants import IMAGE_TOKEN
-from lmdeploy.vl.utils import encode_image_base64
+from lmdeploy.vl import encode_image_base64
 from PIL import Image
 pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO')
 

diff --git a/docs/zh_cn/multi_modal/minicpmv.md b/docs/zh_cn/multi_modal/minicpmv.md
@@ -97,7 +97,7 @@ print(out.text)
 
 ```python
 from lmdeploy import pipeline, GenerationConfig
-from lmdeploy.vl.utils import encode_image_base64
+from lmdeploy.vl import encode_image_base64
 import torch
 from PIL import Image
 from transformers import AutoModel, AutoTokenizer

diff --git a/docs/zh_cn/multi_modal/qwen2_5_vl.md b/docs/zh_cn/multi_modal/qwen2_5_vl.md
@@ -99,7 +99,7 @@ import numpy as np
 from lmdeploy import pipeline, GenerationConfig
 from decord import VideoReader, cpu
 from lmdeploy.vl.constants import IMAGE_TOKEN
-from lmdeploy.vl.utils import encode_image_base64
+from lmdeploy.vl import encode_image_base64
 from PIL import Image
 pipe = pipeline('Qwen/Qwen2.5-VL-7B-Instruct', log_level='INFO')
 

diff --git a/lmdeploy/pytorch/model_inputs.py b/lmdeploy/pytorch/model_inputs.py
@@ -11,7 +11,7 @@
 import lmdeploy.pytorch.distributed as dist
 from lmdeploy.pytorch.backends import get_backend
 from lmdeploy.pytorch.config import CacheConfig, DLLMConfig, ModelConfig, QuantizationConfig
-from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.utils import CtxMgrBase, singleton
 
 if TYPE_CHECKING:
@@ -66,7 +66,7 @@ class VisionModelInputs:
     input_embeddings: List[List[torch.Tensor]] = None
     input_embedding_ranges: List[torch.LongTensor] = None
     input_embedding_indexing: torch.BoolTensor = None
-    input_multimodals: List[MultiModalTensor] = None
+    input_multimodals: List[MultiModalData] = None
 
     def to_device(self, device: str, non_blocking: bool = False):
         """To device."""
@@ -255,7 +255,7 @@ class StepContext:
     local_adapter_ids: torch.LongTensor = None
     input_embeddings: torch.Tensor = None
     input_embedding_indexing: torch.Tensor = None
-    input_multimodals: List[MultiModalTensor] = None
+    input_multimodals: List[MultiModalData] = None
     vision_inputs: VisionModelInputs = None
     attn_metadata: Any = None
     kv_quant_policy: Literal[0, 4, 8] = 0

diff --git a/lmdeploy/pytorch/models/chatglm2.py b/lmdeploy/pytorch/models/chatglm2.py
@@ -9,7 +9,7 @@
 
 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding,
                                  build_rotary_params)
 from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_down_linear, build_gateup_linear, build_o_proj,
@@ -866,10 +866,10 @@ def preprocess_input(self,
             if isinstance(num_pad, torch.Tensor):
                 num_pad = num_pad.item()
 
-            mm_data = MultiModalTensor(data=pixel_values,
-                                       start=offset,
-                                       end=offset + num_pad,
-                                       meta=dict(image_token_id=image_token_id))
+            mm_data = MultiModalData(data=pixel_values,
+                                     start=offset,
+                                     end=offset + num_pad,
+                                     meta=dict(image_token_id=image_token_id))
             input_imgs.append(mm_data)
 
         result = PreprocessInputResult(

diff --git a/lmdeploy/pytorch/models/cogvlm.py b/lmdeploy/pytorch/models/cogvlm.py
@@ -12,7 +12,7 @@
 from lmdeploy.pytorch.distributed import get_tp_world_rank
 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding
 from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_qkv_proj,
                                         build_rowwise_linear)
@@ -901,10 +901,10 @@ def preprocess_input(self, input_ids: List[int], input_multimodals=None, **kwarg
             if isinstance(num_pad, torch.Tensor):
                 num_pad = num_pad.item()
 
-            mm_data = MultiModalTensor(data=pixel_values,
-                                       start=offset,
-                                       end=offset + num_pad,
-                                       meta=dict(image_token_id=image_token_id))
+            mm_data = MultiModalData(data=pixel_values,
+                                     start=offset,
+                                     end=offset + num_pad,
+                                     meta=dict(image_token_id=image_token_id))
             input_imgs.append(mm_data)
 
         result = PreprocessInputResult(

diff --git a/lmdeploy/pytorch/models/deepseek_vl2.py b/lmdeploy/pytorch/models/deepseek_vl2.py
@@ -11,7 +11,7 @@
 
 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .deepseek_v2 import DeepseekV2ForCausalLM
@@ -440,13 +440,13 @@ def preprocess_input(self,
             if isinstance(num_pad, torch.Tensor):
                 num_pad = num_pad.item()
 
-            mm_data = MultiModalTensor(data=pixel_values,
-                                       start=offset,
-                                       end=offset + num_pad,
-                                       meta=dict(
-                                           image_token_id=image_token_id,
-                                           images_spatial_crop=images_spatial_crop,
-                                       ))
+            mm_data = MultiModalData(data=pixel_values,
+                                     start=offset,
+                                     end=offset + num_pad,
+                                     meta=dict(
+                                         image_token_id=image_token_id,
+                                         images_spatial_crop=images_spatial_crop,
+                                     ))
 
             input_imgs.append(mm_data)
 

diff --git a/lmdeploy/pytorch/models/gemma3_vl.py b/lmdeploy/pytorch/models/gemma3_vl.py
@@ -8,7 +8,7 @@
 
 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .patch import build_model_from_hf_config
@@ -108,10 +108,10 @@ def preprocess_input(self,
             if isinstance(num_pad, torch.Tensor):
                 num_pad = num_pad.item()
 
-            mm_data = MultiModalTensor(data=pixel_values,
-                                       start=offset,
-                                       end=offset + num_pad,
-                                       meta=dict(image_token_id=image_token_id))
+            mm_data = MultiModalData(data=pixel_values,
+                                     start=offset,
+                                     end=offset + num_pad,
+                                     meta=dict(image_token_id=image_token_id))
             input_imgs.append(mm_data)
 
         result = PreprocessInputResult(

diff --git a/lmdeploy/pytorch/models/glm4_1v.py b/lmdeploy/pytorch/models/glm4_1v.py
@@ -11,7 +11,7 @@
 
 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, FlashAttention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config
 from lmdeploy.pytorch.nn.linear import build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
@@ -865,10 +865,10 @@ def preprocess_input(self,
             if isinstance(num_pad, torch.Tensor):
                 num_pad = num_pad.item()
 
-            mm_data = MultiModalTensor(data=pixel_values,
-                                       start=start,
-                                       end=start + num_pad,
-                                       meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id))
+            mm_data = MultiModalData(data=pixel_values,
+                                     start=start,
+                                     end=start + num_pad,
+                                     meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id))
             input_imgs.append(mm_data)
 
         result = PreprocessInputResult(

diff --git a/lmdeploy/pytorch/models/interns1_pro.py b/lmdeploy/pytorch/models/interns1_pro.py
@@ -7,8 +7,9 @@
 
 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
+from lmdeploy.vl.constants import Modality
 
 from .interns1_pro_ts import InternS1ProTimeSeriesModel
 from .patch import add_prefix, get_build_model_context
@@ -173,25 +174,28 @@ def prepare_inputs_for_generation(
         ts_sr = None
         ts_mask = None
         if context.input_multimodals is not None:
-            mm_data = [input_mm.get('image', []) for input_mm in context.input_multimodals]
+            mm_inputs = [input_mm.get('mm_data', []) for input_mm in context.input_multimodals]
             # flatten batch
-            mm_data = [data for im_data in mm_data for data in im_data]
+            mm_inputs = [item for sublist in mm_inputs for item in sublist]
 
-            if len(mm_data) > 0:
-                is_time_series = mm_data[0].meta.get('ts_token_id', False)
+            if len(mm_inputs) > 0:
+                modality = mm_inputs[0].modality
+                image_token_id = mm_inputs[0].meta.get('image_token_id')
+                video_token_id = mm_inputs[0].meta.get('video_token_id')
+                ts_token_id = mm_inputs[0].meta.get('ts_token_id')
 
-                if is_time_series:
-                    ts_values = mm_data
-                    ts_token_id = ts_values[0].meta['ts_token_id']
-                    ts_lens = ts_values[0].meta['ts_lens']
-                    ts_sr = ts_values[0].meta['ts_sr']
+                if modality == Modality.TIME_SERIES:
+                    ts_values = torch.cat([inp.data for inp in mm_inputs])
                     ts_mask = input_ids == ts_token_id
-                    ts_values = torch.cat([data.data for data in ts_values])
+
+                    ts_lens = mm_inputs[0].meta['ts_lens']
+                    ts_sr = mm_inputs[0].meta['ts_sr']
                 else:
-                    pixel_values = torch.cat([data.data for data in mm_data])
-                    image_token_id = mm_data[0].meta['image_token_id']
-                    image_mask = input_ids == image_token_id
-                    grid_thw = torch.cat([data.meta['grid_thw'] for data in mm_data]).cpu()
+                    pixel_values = torch.cat([inp.data for inp in mm_inputs])
+                    mm_token_id = image_token_id if modality == Modality.IMAGE else video_token_id
+                    image_mask = (input_ids == mm_token_id)
+
+                    grid_thw = torch.cat([data.meta['grid_thw'] for data in mm_inputs]).cpu()
                     vis_pos_emb = self.visual.rot_pos_emb(grid_thw)
                     pos_embeds = self.visual.fast_pos_embed_interpolate(grid_thw)
                     vis_cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],
@@ -365,6 +369,63 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype) -> None:
         self.config = config
         self.dtype = dtype
 
+    def _make_image_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData:
+        """Make image MultiModalData."""
+        pixel_values = input_mm['pixel_values'].to(self.dtype)
+        image_grid_thw = input_mm['image_grid_thw']
+        offset = input_mm['offset']
+        start = offset
+        image_token_id = input_mm['image_token_id']
+        num_pad = input_mm['image_tokens']
+        if isinstance(num_pad, torch.Tensor):
+            num_pad = num_pad.item()
+
+        mm_data = MultiModalData(modality=Modality.IMAGE,
+                                 data=pixel_values,
+                                 start=start,
+                                 end=start + num_pad,
+                                 meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id))
+        return mm_data
+
+    def _make_video_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData:
+        """Make video MultiModalData."""
+        pixel_values_videos = input_mm['pixel_values_videos'].to(self.dtype)
+        video_grid_thw = input_mm['video_grid_thw']
+        offset = input_mm['offset']
+        start = offset
+        video_token_id = input_mm['video_token_id']
+        num_pad = input_mm['video_tokens']
+        if isinstance(num_pad, torch.Tensor):
+            num_pad = num_pad.item()
+
+        mm_data = MultiModalData(modality=Modality.VIDEO,
+                                 data=pixel_values_videos,
+                                 start=start,
+                                 end=start + num_pad,
+                                 meta=dict(
+                                     grid_thw=video_grid_thw,
+                                     video_token_id=video_token_id,
+                                 ))
+        return mm_data
+
+    def _make_time_series_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData:
+        """Make time series MultiModalData."""
+        ts_values = input_mm['ts_values'].to(self.dtype)
+        offset = input_mm['offset']
+        ts_token_id = input_mm['ts_token_id']
+        ts_lens = input_mm['ts_lens']
+        ts_sr = input_mm['ts_sr']
+        num_pad = input_mm['ts_tokens']
+        if isinstance(num_pad, torch.Tensor):
+            num_pad = num_pad.item()
+
+        mm_data = MultiModalData(modality=Modality.TIME_SERIES,
+                                 data=ts_values,
+                                 start=offset,
+                                 end=offset + num_pad,
+                                 meta=dict(ts_lens=ts_lens, ts_sr=ts_sr, ts_token_id=ts_token_id))
+        return mm_data
+
     def preprocess_input(self,
                          input_ids: List[int],
                          input_multimodals: List[Dict[str, Any]] = None,
@@ -373,38 +434,17 @@ def preprocess_input(self,
         if input_multimodals is None or len(input_multimodals) == 0:
             return input_ids, input_multimodals
 
-        input_imgs = []
+        input_mm_data = []
         for input_mm in input_multimodals:
-            if 'ts_values' in input_mm:
-                ts_values = input_mm['ts_values'].to(self.dtype)
-                offset = input_mm['offset']
-                ts_token_id = input_mm['ts_token_id']
-                ts_lens = input_mm['ts_lens']
-                ts_sr = input_mm['ts_sr']
-                num_pad = input_mm['num_ts_tokens']
-
-                mm_data = MultiModalTensor(data=ts_values,
-                                           start=offset,
-                                           end=offset + num_pad,
-                                           meta=dict(ts_token_id=ts_token_id, ts_lens=ts_lens, ts_sr=ts_sr))
-            else:
-                pixel_values = input_mm['pixel_values'].to(self.dtype)
-                image_grid_thw = input_mm['image_grid_thw']
-                offset = input_mm['offset']
-                start = offset
-                image_token_id = input_mm['image_token_id']
-                num_pad = input_mm['image_tokens']
-                if isinstance(num_pad, torch.Tensor):
-                    num_pad = num_pad.item()
-
-                mm_data = MultiModalTensor(data=pixel_values,
-                                           start=start,
-                                           end=start + num_pad,
-                                           meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id))
-            input_imgs.append(mm_data)
-
-        result = PreprocessInputResult(
-            input_ids=input_ids,
-            input_multimodals=dict(image=input_imgs),
-        )
+            modality = input_mm.get('modality')
+            if modality == Modality.IMAGE:
+                mm_data = self._make_image_mm_data(input_mm)
+            elif modality == Modality.VIDEO:
+                mm_data = self._make_video_mm_data(input_mm)
+            elif modality == Modality.TIME_SERIES:
+                mm_data = self._make_time_series_mm_data(input_mm)
+            input_mm_data.append(mm_data)
+
+        result = PreprocessInputResult(input_ids=input_ids, input_multimodals=dict(mm_data=input_mm_data))
+
         return result
diff --git a/lmdeploy/pytorch/models/interns1_pro_ts.py b/lmdeploy/pytorch/models/interns1_pro_ts.py
@@ -154,7 +154,7 @@ def __init__(self, d_model, max_len=20000, dtype: torch.dtype = None, device: to
         div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model))
         pe[:, 0::2] = torch.sin(position * div_term)
         pe[:, 1::2] = torch.cos(position * div_term)
-        # TODO: zhouxinyu, hf forces float32 during init, but becomes bf16 during forward
+        # hf forces float32 during init, but becomes bf16 during forward
         pe = pe.unsqueeze(0).transpose(0, 1).to(dtype=dtype, device=device)  # (max_len, 1, d_model)
         self.register_buffer('pe', pe, persistent=True)