diff --git a/autotest/tools/pipeline/mllm_case.py b/autotest/tools/pipeline/mllm_case.py index 55e000fc88..2ac134f440 100644 --- a/autotest/tools/pipeline/mllm_case.py +++ b/autotest/tools/pipeline/mllm_case.py @@ -5,9 +5,8 @@ from PIL import Image from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline -from lmdeploy.vl import load_image +from lmdeploy.vl import encode_image_base64, load_image from lmdeploy.vl.constants import IMAGE_TOKEN -from lmdeploy.vl.utils import encode_image_base64 gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10) diff --git a/docs/en/multi_modal/internvl.md b/docs/en/multi_modal/internvl.md index aeab4fe67b..a592ba224e 100644 --- a/docs/en/multi_modal/internvl.md +++ b/docs/en/multi_modal/internvl.md @@ -116,7 +116,7 @@ import numpy as np from lmdeploy import pipeline, GenerationConfig from decord import VideoReader, cpu from lmdeploy.vl.constants import IMAGE_TOKEN -from lmdeploy.vl.utils import encode_image_base64 +from lmdeploy.vl import encode_image_base64 from PIL import Image pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO') diff --git a/docs/en/multi_modal/minicpmv.md b/docs/en/multi_modal/minicpmv.md index 15774de7e7..0f2bf176b9 100644 --- a/docs/en/multi_modal/minicpmv.md +++ b/docs/en/multi_modal/minicpmv.md @@ -97,7 +97,7 @@ print(out.text) ```python from lmdeploy import pipeline, GenerationConfig -from lmdeploy.vl.utils import encode_image_base64 +from lmdeploy.vl import encode_image_base64 import torch from PIL import Image from transformers import AutoModel, AutoTokenizer diff --git a/docs/en/multi_modal/qwen2_5_vl.md b/docs/en/multi_modal/qwen2_5_vl.md index 75c98c5a68..ac2ffa2ce6 100644 --- a/docs/en/multi_modal/qwen2_5_vl.md +++ b/docs/en/multi_modal/qwen2_5_vl.md @@ -99,7 +99,7 @@ import numpy as np from lmdeploy import pipeline, GenerationConfig from decord import VideoReader, cpu from lmdeploy.vl.constants import IMAGE_TOKEN -from lmdeploy.vl.utils import 
encode_image_base64 +from lmdeploy.vl import encode_image_base64 from PIL import Image pipe = pipeline('Qwen/Qwen2.5-VL-7B-Instruct', log_level='INFO') diff --git a/docs/zh_cn/multi_modal/internvl.md b/docs/zh_cn/multi_modal/internvl.md index e6a37c0ac0..3207550d82 100644 --- a/docs/zh_cn/multi_modal/internvl.md +++ b/docs/zh_cn/multi_modal/internvl.md @@ -116,7 +116,7 @@ import numpy as np from lmdeploy import pipeline, GenerationConfig from decord import VideoReader, cpu from lmdeploy.vl.constants import IMAGE_TOKEN -from lmdeploy.vl.utils import encode_image_base64 +from lmdeploy.vl import encode_image_base64 from PIL import Image pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO') diff --git a/docs/zh_cn/multi_modal/minicpmv.md b/docs/zh_cn/multi_modal/minicpmv.md index b605bc1fcc..eb2a168cdb 100644 --- a/docs/zh_cn/multi_modal/minicpmv.md +++ b/docs/zh_cn/multi_modal/minicpmv.md @@ -97,7 +97,7 @@ print(out.text) ```python from lmdeploy import pipeline, GenerationConfig -from lmdeploy.vl.utils import encode_image_base64 +from lmdeploy.vl import encode_image_base64 import torch from PIL import Image from transformers import AutoModel, AutoTokenizer diff --git a/docs/zh_cn/multi_modal/qwen2_5_vl.md b/docs/zh_cn/multi_modal/qwen2_5_vl.md index f258a783a0..2b1d81c0a4 100644 --- a/docs/zh_cn/multi_modal/qwen2_5_vl.md +++ b/docs/zh_cn/multi_modal/qwen2_5_vl.md @@ -99,7 +99,7 @@ import numpy as np from lmdeploy import pipeline, GenerationConfig from decord import VideoReader, cpu from lmdeploy.vl.constants import IMAGE_TOKEN -from lmdeploy.vl.utils import encode_image_base64 +from lmdeploy.vl import encode_image_base64 from PIL import Image pipe = pipeline('Qwen/Qwen2.5-VL-7B-Instruct', log_level='INFO') diff --git a/lmdeploy/pytorch/model_inputs.py b/lmdeploy/pytorch/model_inputs.py index 538b9c6f3a..2bf93bb530 100644 --- a/lmdeploy/pytorch/model_inputs.py +++ b/lmdeploy/pytorch/model_inputs.py @@ -11,7 +11,7 @@ import lmdeploy.pytorch.distributed as dist 
from lmdeploy.pytorch.backends import get_backend from lmdeploy.pytorch.config import CacheConfig, DLLMConfig, ModelConfig, QuantizationConfig -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.utils import CtxMgrBase, singleton if TYPE_CHECKING: @@ -66,7 +66,7 @@ class VisionModelInputs: input_embeddings: List[List[torch.Tensor]] = None input_embedding_ranges: List[torch.LongTensor] = None input_embedding_indexing: torch.BoolTensor = None - input_multimodals: List[MultiModalTensor] = None + input_multimodals: List[MultiModalData] = None def to_device(self, device: str, non_blocking: bool = False): """To device.""" @@ -255,7 +255,7 @@ class StepContext: local_adapter_ids: torch.LongTensor = None input_embeddings: torch.Tensor = None input_embedding_indexing: torch.Tensor = None - input_multimodals: List[MultiModalTensor] = None + input_multimodals: List[MultiModalData] = None vision_inputs: VisionModelInputs = None attn_metadata: Any = None kv_quant_policy: Literal[0, 4, 8] = 0 diff --git a/lmdeploy/pytorch/models/chatglm2.py b/lmdeploy/pytorch/models/chatglm2.py index 56e3169bb7..b690217564 100644 --- a/lmdeploy/pytorch/models/chatglm2.py +++ b/lmdeploy/pytorch/models/chatglm2.py @@ -9,7 +9,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding, build_rotary_params) from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_down_linear, build_gateup_linear, build_o_proj, @@ -866,10 +866,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data 
= MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/cogvlm.py b/lmdeploy/pytorch/models/cogvlm.py index ad8adc9739..9b8d0b5472 100644 --- a/lmdeploy/pytorch/models/cogvlm.py +++ b/lmdeploy/pytorch/models/cogvlm.py @@ -12,7 +12,7 @@ from lmdeploy.pytorch.distributed import get_tp_world_rank from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear) @@ -901,10 +901,10 @@ def preprocess_input(self, input_ids: List[int], input_multimodals=None, **kwarg if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/deepseek_vl2.py b/lmdeploy/pytorch/models/deepseek_vl2.py index 290b9a4fc0..b778c6ebeb 100644 --- a/lmdeploy/pytorch/models/deepseek_vl2.py +++ b/lmdeploy/pytorch/models/deepseek_vl2.py @@ -11,7 +11,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs 
import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .deepseek_v2 import DeepseekV2ForCausalLM @@ -440,13 +440,13 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict( - image_token_id=image_token_id, - images_spatial_crop=images_spatial_crop, - )) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict( + image_token_id=image_token_id, + images_spatial_crop=images_spatial_crop, + )) input_imgs.append(mm_data) diff --git a/lmdeploy/pytorch/models/gemma3_vl.py b/lmdeploy/pytorch/models/gemma3_vl.py index 8f4ea8e972..cff9615df2 100644 --- a/lmdeploy/pytorch/models/gemma3_vl.py +++ b/lmdeploy/pytorch/models/gemma3_vl.py @@ -8,7 +8,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .patch import build_model_from_hf_config @@ -108,10 +108,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/glm4_1v.py b/lmdeploy/pytorch/models/glm4_1v.py index 9b89164bef..8332940247 
100644 --- a/lmdeploy/pytorch/models/glm4_1v.py +++ b/lmdeploy/pytorch/models/glm4_1v.py @@ -11,7 +11,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import ApplyRotaryEmb, FlashAttention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config from lmdeploy.pytorch.nn.linear import build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight @@ -865,10 +865,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=start, - end=start + num_pad, - meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=start, + end=start + num_pad, + meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/interns1_pro.py b/lmdeploy/pytorch/models/interns1_pro.py index 51ed9deaf6..773d9e3964 100644 --- a/lmdeploy/pytorch/models/interns1_pro.py +++ b/lmdeploy/pytorch/models/interns1_pro.py @@ -7,8 +7,9 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight +from lmdeploy.vl.constants import Modality from .interns1_pro_ts import InternS1ProTimeSeriesModel from .patch import add_prefix, get_build_model_context @@ -173,25 +174,28 @@ def 
prepare_inputs_for_generation( ts_sr = None ts_mask = None if context.input_multimodals is not None: - mm_data = [input_mm.get('image', []) for input_mm in context.input_multimodals] + mm_inputs = [input_mm.get('mm_data', []) for input_mm in context.input_multimodals] # flatten batch - mm_data = [data for im_data in mm_data for data in im_data] + mm_inputs = [item for sublist in mm_inputs for item in sublist] - if len(mm_data) > 0: - is_time_series = mm_data[0].meta.get('ts_token_id', False) + if len(mm_inputs) > 0: + modality = mm_inputs[0].modality + image_token_id = mm_inputs[0].meta.get('image_token_id') + video_token_id = mm_inputs[0].meta.get('video_token_id') + ts_token_id = mm_inputs[0].meta.get('ts_token_id') - if is_time_series: - ts_values = mm_data - ts_token_id = ts_values[0].meta['ts_token_id'] - ts_lens = ts_values[0].meta['ts_lens'] - ts_sr = ts_values[0].meta['ts_sr'] + if modality == Modality.TIME_SERIES: + ts_values = torch.cat([inp.data for inp in mm_inputs]) ts_mask = input_ids == ts_token_id - ts_values = torch.cat([data.data for data in ts_values]) + + ts_lens = mm_inputs[0].meta['ts_lens'] + ts_sr = mm_inputs[0].meta['ts_sr'] else: - pixel_values = torch.cat([data.data for data in mm_data]) - image_token_id = mm_data[0].meta['image_token_id'] - image_mask = input_ids == image_token_id - grid_thw = torch.cat([data.meta['grid_thw'] for data in mm_data]).cpu() + pixel_values = torch.cat([inp.data for inp in mm_inputs]) + mm_token_id = image_token_id if modality == Modality.IMAGE else video_token_id + image_mask = (input_ids == mm_token_id) + + grid_thw = torch.cat([data.meta['grid_thw'] for data in mm_inputs]).cpu() vis_pos_emb = self.visual.rot_pos_emb(grid_thw) pos_embeds = self.visual.fast_pos_embed_interpolate(grid_thw) vis_cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], @@ -365,6 +369,63 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype) -> None: self.config = config self.dtype = dtype + def 
_make_image_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData: + """Make image MultiModalData.""" + pixel_values = input_mm['pixel_values'].to(self.dtype) + image_grid_thw = input_mm['image_grid_thw'] + offset = input_mm['offset'] + start = offset + image_token_id = input_mm['image_token_id'] + num_pad = input_mm['image_tokens'] + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalData(modality=Modality.IMAGE, + data=pixel_values, + start=start, + end=start + num_pad, + meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) + return mm_data + + def _make_video_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData: + """Make video MultiModalData.""" + pixel_values_videos = input_mm['pixel_values_videos'].to(self.dtype) + video_grid_thw = input_mm['video_grid_thw'] + offset = input_mm['offset'] + start = offset + video_token_id = input_mm['video_token_id'] + num_pad = input_mm['video_tokens'] + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalData(modality=Modality.VIDEO, + data=pixel_values_videos, + start=start, + end=start + num_pad, + meta=dict( + grid_thw=video_grid_thw, + video_token_id=video_token_id, + )) + return mm_data + + def _make_time_series_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData: + """Make time series MultiModalData.""" + ts_values = input_mm['ts_values'].to(self.dtype) + offset = input_mm['offset'] + ts_token_id = input_mm['ts_token_id'] + ts_lens = input_mm['ts_lens'] + ts_sr = input_mm['ts_sr'] + num_pad = input_mm['ts_tokens'] + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalData(modality=Modality.TIME_SERIES, + data=ts_values, + start=offset, + end=offset + num_pad, + meta=dict(ts_lens=ts_lens, ts_sr=ts_sr, ts_token_id=ts_token_id)) + return mm_data + def preprocess_input(self, input_ids: List[int], input_multimodals: List[Dict[str, Any]] = None, @@ -373,38 +434,17 @@ def 
preprocess_input(self, if input_multimodals is None or len(input_multimodals) == 0: return input_ids, input_multimodals - input_imgs = [] + input_mm_data = [] for input_mm in input_multimodals: - if 'ts_values' in input_mm: - ts_values = input_mm['ts_values'].to(self.dtype) - offset = input_mm['offset'] - ts_token_id = input_mm['ts_token_id'] - ts_lens = input_mm['ts_lens'] - ts_sr = input_mm['ts_sr'] - num_pad = input_mm['num_ts_tokens'] - - mm_data = MultiModalTensor(data=ts_values, - start=offset, - end=offset + num_pad, - meta=dict(ts_token_id=ts_token_id, ts_lens=ts_lens, ts_sr=ts_sr)) - else: - pixel_values = input_mm['pixel_values'].to(self.dtype) - image_grid_thw = input_mm['image_grid_thw'] - offset = input_mm['offset'] - start = offset - image_token_id = input_mm['image_token_id'] - num_pad = input_mm['image_tokens'] - if isinstance(num_pad, torch.Tensor): - num_pad = num_pad.item() - - mm_data = MultiModalTensor(data=pixel_values, - start=start, - end=start + num_pad, - meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) - input_imgs.append(mm_data) - - result = PreprocessInputResult( - input_ids=input_ids, - input_multimodals=dict(image=input_imgs), - ) + modality = input_mm.get('modality') + if modality == Modality.IMAGE: + mm_data = self._make_image_mm_data(input_mm) + elif modality == Modality.VIDEO: + mm_data = self._make_video_mm_data(input_mm) + elif modality == Modality.TIME_SERIES: + mm_data = self._make_time_series_mm_data(input_mm) + input_mm_data.append(mm_data) + + result = PreprocessInputResult(input_ids=input_ids, input_multimodals=dict(mm_data=input_mm_data)) + return result diff --git a/lmdeploy/pytorch/models/interns1_pro_ts.py b/lmdeploy/pytorch/models/interns1_pro_ts.py index 48ba00fcef..6b400febbd 100644 --- a/lmdeploy/pytorch/models/interns1_pro_ts.py +++ b/lmdeploy/pytorch/models/interns1_pro_ts.py @@ -154,7 +154,7 @@ def __init__(self, d_model, max_len=20000, dtype: torch.dtype = None, device: to div_term = 
torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model)) pe[:, 0::2] = torch.sin(position * div_term) pe[:, 1::2] = torch.cos(position * div_term) - # TODO: zhouxinyu, hf forces float32 during init, but becomes bf16 during forward + # hf forces float32 during init, but becomes bf16 during forward pe = pe.unsqueeze(0).transpose(0, 1).to(dtype=dtype, device=device) # (max_len, 1, d_model) self.register_buffer('pe', pe, persistent=True) diff --git a/lmdeploy/pytorch/models/internvl.py b/lmdeploy/pytorch/models/internvl.py index 5b6c261dd2..43b80644f6 100644 --- a/lmdeploy/pytorch/models/internvl.py +++ b/lmdeploy/pytorch/models/internvl.py @@ -12,7 +12,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.models.utils.micro_batch import enable_micro_batch, split_batch -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import LayerNorm, RMSNorm from lmdeploy.pytorch.nn.linear import build_colwise_linear, build_o_proj, build_qkv_proj, build_rowwise_linear from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight @@ -992,10 +992,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/internvl3_hf.py b/lmdeploy/pytorch/models/internvl3_hf.py index 7cd4cd940c..4ea2eb2f45 100644 --- a/lmdeploy/pytorch/models/internvl3_hf.py +++ b/lmdeploy/pytorch/models/internvl3_hf.py @@ -13,7 +13,7 @@ 
from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.models.utils.micro_batch import enable_micro_batch, split_batch -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import LayerNorm, RMSNorm from lmdeploy.pytorch.nn.linear import build_colwise_linear, build_o_proj, build_qkv_proj, build_rowwise_linear from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight @@ -736,10 +736,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/llama4.py b/lmdeploy/pytorch/models/llama4.py index 4b3c2196bc..e7711b83d3 100644 --- a/lmdeploy/pytorch/models/llama4.py +++ b/lmdeploy/pytorch/models/llama4.py @@ -8,7 +8,7 @@ import lmdeploy.pytorch.distributed as dist from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear) @@ -1033,10 +1033,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = 
MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/llava.py b/lmdeploy/pytorch/models/llava.py index e87242df4c..4004441050 100644 --- a/lmdeploy/pytorch/models/llava.py +++ b/lmdeploy/pytorch/models/llava.py @@ -11,7 +11,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn.linear import build_colwise_linear, build_qkv_proj, build_rowwise_linear from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight @@ -555,10 +555,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( @@ -834,10 +834,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_sizes=image_sizes, image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_sizes=image_sizes, image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/phi3_v.py b/lmdeploy/pytorch/models/phi3_v.py 
index c6804d5586..aff3f78935 100644 --- a/lmdeploy/pytorch/models/phi3_v.py +++ b/lmdeploy/pytorch/models/phi3_v.py @@ -8,7 +8,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .phi3 import Phi3ForCausalLM, Phi3Model @@ -379,10 +379,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=offset, - end=offset + num_pad, - meta=dict(image_sizes=image_sizes, image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=offset, + end=offset + num_pad, + meta=dict(image_sizes=image_sizes, image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/qwen2_5_vl.py b/lmdeploy/pytorch/models/qwen2_5_vl.py index 9c19a7de21..0458d01c76 100644 --- a/lmdeploy/pytorch/models/qwen2_5_vl.py +++ b/lmdeploy/pytorch/models/qwen2_5_vl.py @@ -12,7 +12,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.models.qwen2_vl import Qwen2Model -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import ApplyRotaryEmb, FlashAttention, RMSNorm, SiluAndMul from lmdeploy.pytorch.nn.linear import build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight @@ -687,9 +687,6 @@ def get_input_processor(self) -> BaseModelInputProcessor: return 
self.input_processor -InputMultiModalType = List[Dict[str, Any]] - - class Qwen2_5_VLInputProcessor(BaseModelInputProcessor): """Qwen2 input processor.""" @@ -715,10 +712,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=start, - end=start + num_pad, - meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=start, + end=start + num_pad, + meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/qwen2_vl.py b/lmdeploy/pytorch/models/qwen2_vl.py index 605f8ded76..888d350aa2 100644 --- a/lmdeploy/pytorch/models/qwen2_vl.py +++ b/lmdeploy/pytorch/models/qwen2_vl.py @@ -8,7 +8,7 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, FlashAttention, LayerNorm, RMSNorm, SiluAndMul, build_rotary_embedding_from_config) from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_qkv_proj, @@ -890,9 +890,6 @@ def get_input_processor(self) -> BaseModelInputProcessor: return self.input_processor -InputMultiModalType = List[Dict[str, Any]] - - class Qwen2VLInputProcessor(BaseModelInputProcessor): """Qwen2 input processor.""" @@ -918,10 +915,10 @@ def preprocess_input(self, if isinstance(num_pad, torch.Tensor): num_pad = num_pad.item() - mm_data = MultiModalTensor(data=pixel_values, - start=start, - end=start + num_pad, - meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) + mm_data = MultiModalData(data=pixel_values, + start=start, + end=start + 
num_pad, + meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) input_imgs.append(mm_data) result = PreprocessInputResult( diff --git a/lmdeploy/pytorch/models/qwen3_5.py b/lmdeploy/pytorch/models/qwen3_5.py index 59f373f075..8e3d51a385 100644 --- a/lmdeploy/pytorch/models/qwen3_5.py +++ b/lmdeploy/pytorch/models/qwen3_5.py @@ -20,11 +20,12 @@ build_rowwise_linear) from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters from lmdeploy.pytorch.weight_loader.model_weight_loader import default_weight_loader, load_weight +from lmdeploy.vl.constants import Modality from .patch import add_prefix from .qwen2_5_vl import Qwen2_5_VisionRotaryEmbedding as Qwen3_5VisionRotaryEmbedding -from .qwen2_5_vl import Qwen2_5_VLInputProcessor as Qwen3_5InputProcessor from .qwen2_5_vl import Qwen2_5_VLVisionAttention as Qwen3_5VisionAttention +from .qwen3_vl import Qwen3VLInputProcessor as Qwen3_5InputProcessor from .utils.cudagraph import CudaGraphMeta, CudaGraphMixin from .utils.model import DeployModelMixinV1, vlm_model @@ -1144,14 +1145,20 @@ def prepare_inputs_for_generation( grid_thw = None pos_embeds = None if context.input_multimodals is not None: - image_data = [input_mm.get('image', []) for input_mm in context.input_multimodals] - if len(image_data) > 0: - # flatten batch - image_data = [data for im_data in image_data for data in im_data] - pixel_values = torch.cat([data.data for data in image_data]) - image_token_id = image_data[0].meta['image_token_id'] - image_mask = input_ids == image_token_id - grid_thw = torch.cat([data.meta['grid_thw'] for data in image_data]).cpu() + mm_inputs = [input_mm.get('mm_data', []) for input_mm in context.input_multimodals] + # flatten batch + mm_inputs = [item for sublist in mm_inputs for item in sublist] + + if len(mm_inputs) > 0: + modality = mm_inputs[0].modality + pixel_values = torch.cat([inp.data for inp in mm_inputs]) + + image_token_id = mm_inputs[0].meta.get('image_token_id') + video_token_id = 
mm_inputs[0].meta.get('video_token_id') + mm_token_id = image_token_id if modality == Modality.IMAGE else video_token_id + image_mask = (input_ids == mm_token_id) + + grid_thw = torch.cat([data.meta['grid_thw'] for data in mm_inputs]).cpu() vis_pos_emb = self.model.visual.rot_pos_emb(grid_thw) pos_embeds = self.model.visual.fast_pos_embed_interpolate(grid_thw) vis_cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], @@ -1341,9 +1348,10 @@ def _update_model_meta_prefilling(self, context: StepContext): mrope_position_ids = [] new_model_metas = [] for pos_ids, model_meta, input_mm in zip(batched_pos_ids, model_metas, input_multimodals): - images = [] + mm_data_list = [] if input_mm is not None: - images = input_mm.get('image', []) + mm_data_list.extend(input_mm.get('mm_data', [])) + if model_meta is None or 'mrope_delta' not in model_meta: mrope_delta = 0 else: @@ -1352,19 +1360,63 @@ def _update_model_meta_prefilling(self, context: StepContext): pos_start = pos_ids[0].item() mrope_pos_ids = pos_ids + mrope_delta mrope_pos_ids = mrope_pos_ids[None].expand(3, -1).clone() - for img in images: - grid_thw = img.meta['grid_thw'][0].tolist() - _, h, w = grid_thw - h //= 2 - w //= 2 - num_pad = img.end - img.start - max(h, w) - mrope_delta -= num_pad - fill_start = img.start - pos_start - fill_end = img.end - pos_start - img_pos_ids = self._get_multimodal_pos_ids(grid_thw, pos_ids.device) - img_pos_ids += mrope_pos_ids[:, fill_start:fill_start + 1] - mrope_pos_ids[:, fill_end:] -= num_pad - mrope_pos_ids[:, fill_start:fill_end] = img_pos_ids + + for mm_data in mm_data_list: + if mm_data.modality == Modality.IMAGE: + grid_thw = mm_data.meta['grid_thw'][0].tolist() + _, h, w = grid_thw + h //= 2 + w //= 2 + num_pad = mm_data.end - mm_data.start - max(h, w) + mrope_delta -= num_pad + fill_start = mm_data.start - pos_start + fill_end = mm_data.end - pos_start + img_pos_ids = self._get_multimodal_pos_ids(grid_thw, pos_ids.device) + img_pos_ids += mrope_pos_ids[:, 
fill_start:fill_start + 1] + mrope_pos_ids[:, fill_end:] -= num_pad + mrope_pos_ids[:, fill_start:fill_end] = img_pos_ids + elif mm_data.modality == Modality.VIDEO: + video_token_id = self.config.video_token_id + grid_thw = mm_data.meta['grid_thw'] + + grid_thw = torch.repeat_interleave(grid_thw, grid_thw[:, 0], dim=0) + grid_thw[:, 0] = 1 + + position_ids_list = [] + input_tokens = context.input_ids.tolist()[0] + + st = 0 + # treat each frame separately as a single image + for video_idx in range(grid_thw.shape[0]): + # text before video. e.g. <0.3 seconds><|vision_start|> ... + ed_video = input_tokens.index(video_token_id, st) + ed = ed_video + text_len = ed - st + st_idx = position_ids_list[-1].max() + 1 if len(position_ids_list) > 0 else 0 + text_pos_ids = torch.arange(text_len, device=pos_ids.device).view(1, -1).expand(3, -1) + st_idx + position_ids_list.append(text_pos_ids) + + # video frame. ... <|video_end|> + t, h, w = ( + grid_thw[video_idx][0], + grid_thw[video_idx][1] // 2, + grid_thw[video_idx][2] // 2, + ) + video_pos_ids = self._get_multimodal_pos_ids(grid_thw[video_idx], pos_ids.device) + position_ids_list.append(video_pos_ids + text_len + st_idx) + + st = ed + t * h * w + + # text after video, <|vision_end|> ... 
+ if st < len(input_tokens): + st_idx = position_ids_list[-1].max() + 1 if len(position_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + text_pos_ids = torch.arange(text_len, device=pos_ids.device).view(1, -1).expand(3, -1) + st_idx + position_ids_list.append(text_pos_ids) + + mrope_pos_ids = torch.cat(position_ids_list, dim=1).reshape(3, -1) + mrope_delta = mrope_pos_ids.max() + 1 - pos_ids.size(0) + mrope_pos_ids += pos_start # add back the original position offset mrope_position_ids.append(mrope_pos_ids) new_model_metas.append(dict(mrope_delta=mrope_delta)) diff --git a/lmdeploy/pytorch/models/qwen3_5_moe.py b/lmdeploy/pytorch/models/qwen3_5_moe.py index c475b873a5..98d0970ae2 100644 --- a/lmdeploy/pytorch/models/qwen3_5_moe.py +++ b/lmdeploy/pytorch/models/qwen3_5_moe.py @@ -15,10 +15,10 @@ from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .patch import add_prefix, get_build_model_context -from .qwen2_5_vl import Qwen2_5_VLInputProcessor as Qwen3_5MoeInputProcessor from .qwen3_5 import (Qwen3_5Attention, Qwen3_5DecoderLayer, Qwen3_5ForConditionalGeneration, Qwen3_5GatedDeltaNet, Qwen3_5MLP, Qwen3_5Model, Qwen3_5TextModel, Qwen3_5TextRotaryEmbedding) from .qwen3_5 import Qwen3_5VisionModel as Qwen3_5MoeVisionModel +from .qwen3_vl import Qwen3VLInputProcessor as Qwen3_5MoeInputProcessor class Qwen3_5MoeTopKRouter(nn.Module): @@ -51,7 +51,6 @@ def __init__(self, device: torch.device | None = None, prefix: str = ''): super().__init__() - # TODO: zhouxinyu, determine modules_to_not_convert from config file quantization_config = getattr(config, 'quantization_config', None) self.layer_idx = layer_idx self.hidden_dim = config.hidden_size diff --git a/lmdeploy/pytorch/models/qwen3_moe.py b/lmdeploy/pytorch/models/qwen3_moe.py index 5157199175..098194b8dd 100644 --- a/lmdeploy/pytorch/models/qwen3_moe.py +++ b/lmdeploy/pytorch/models/qwen3_moe.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Tuple import torch from torch import nn @@ -95,7 +95,7 @@ def forward( self, hidden_states: torch.Tensor, rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + past_key_value: Tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -198,7 +198,6 @@ def __init__(self, device: torch.device = None, prefix: str = ''): super().__init__() - # TODO: zhouxinyu, determine modules_to_not_convert from config file quantization_config = getattr(config, 'quantization_config', None) self.layer_idx = layer_idx self.hidden_dim = config.hidden_size @@ -318,8 +317,8 @@ def forward( self, hidden_states: torch.Tensor, rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + past_key_value: List[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, all_routed_experts: torch.Tensor = None, ): @@ -396,10 +395,10 @@ def __init__(self, def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: List[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, all_routed_experts: torch.Tensor = None, ): """Rewrite of LlamaModel.forward.""" @@ -519,7 +518,7 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" diff --git 
a/lmdeploy/pytorch/models/qwen3_next.py b/lmdeploy/pytorch/models/qwen3_next.py index 4c56c01aa3..89953ecee9 100644 --- a/lmdeploy/pytorch/models/qwen3_next.py +++ b/lmdeploy/pytorch/models/qwen3_next.py @@ -369,7 +369,6 @@ def __init__(self, dtype: torch.dtype = None, device: torch.device = None): super().__init__() - # TODO: zhouxinyu, determine modules_to_not_convert from config file quantization_config = getattr(config, 'quantization_config', None) self.layer_idx = layer_idx self.hidden_dim = config.hidden_size diff --git a/lmdeploy/pytorch/models/qwen3_vl.py b/lmdeploy/pytorch/models/qwen3_vl.py index a6f694c6f2..0d0434a58f 100644 --- a/lmdeploy/pytorch/models/qwen3_vl.py +++ b/lmdeploy/pytorch/models/qwen3_vl.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from functools import lru_cache -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Tuple import numpy as np import torch @@ -9,16 +9,17 @@ from transformers.configuration_utils import PretrainedConfig from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update -from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor +from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager +from lmdeploy.pytorch.multimodal.data_type import MultiModalData from lmdeploy.pytorch.nn import LayerNorm from lmdeploy.pytorch.nn.linear import build_colwise_linear, build_rowwise_linear from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight +from lmdeploy.vl.constants import Modality from .patch import add_prefix from .qwen2_5_vl import Qwen2_5_VisionRotaryEmbedding as Qwen3VLVisionRotaryEmbedding -from .qwen2_5_vl import Qwen2_5_VLInputProcessor as Qwen3VLInputProcessor from .qwen2_5_vl import 
Qwen2_5_VLVisionAttention as Qwen3VLVisionAttention from .qwen3 import Qwen3model from .utils.cudagraph import CudaGraphMeta, CudaGraphMixin @@ -116,23 +117,16 @@ def __init__(self, def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: List[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, mrope_position_ids: torch.LongTensor = None, # args for deepstack - visual_pos_masks: Optional[torch.Tensor] = None, - deepstack_visual_embeds: Optional[list[torch.Tensor]] = None, + visual_pos_masks: torch.Tensor | None = None, + deepstack_visual_embeds: List[torch.Tensor] | None = None, ): - """visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, - *optional*): - - The mask of the visual positions. deepstack_visual_embeds (`list[torch.Tensor]`, *optional*): The deepstack - visual embeddings. The shape is (num_layers, visual_seqlen, embed_dim). The feature is extracted from the - different visual encoder layers, and fed to the decoder hidden states. 
It's from the paper DeepStack ( - https://arxiv.org/abs/2406.04) - """ + """Rewrite of LlamaModel.forward.""" # token embedding if inputs_embeds is None: @@ -279,7 +273,7 @@ def __init__( def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: Optional[torch.Tensor] = None) -> torch.Tensor: + rotary_pos_emb: torch.Tensor | None = None) -> torch.Tensor: hidden_states = hidden_states + self.attn( self.norm1(hidden_states), cu_seqlens=cu_seqlens, @@ -610,7 +604,7 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -627,14 +621,20 @@ def prepare_inputs_for_generation( grid_thw = None pos_embeds = None if context.input_multimodals is not None: - image_data = [input_mm.get('image', []) for input_mm in context.input_multimodals] - if len(image_data) > 0: - # flatten batch - image_data = [data for im_data in image_data for data in im_data] - pixel_values = torch.cat([data.data for data in image_data]) - image_token_id = image_data[0].meta['image_token_id'] - image_mask = input_ids == image_token_id - grid_thw = torch.cat([data.meta['grid_thw'] for data in image_data]).cpu() + mm_inputs = [input_mm.get('mm_data', []) for input_mm in context.input_multimodals] + # flatten batch + mm_inputs = [item for sublist in mm_inputs for item in sublist] + + if len(mm_inputs) > 0: + modality = mm_inputs[0].modality + pixel_values = torch.cat([inp.data for inp in mm_inputs]) + + image_token_id = mm_inputs[0].meta.get('image_token_id') + video_token_id = mm_inputs[0].meta.get('video_token_id') + mm_token_id = image_token_id if modality == Modality.IMAGE else video_token_id + image_mask = (input_ids == mm_token_id) + + grid_thw = torch.cat([data.meta['grid_thw'] for data in mm_inputs]).cpu() vis_pos_emb = self.visual.rot_pos_emb(grid_thw) 
pos_embeds = self.visual.fast_pos_embed_interpolate(grid_thw) vis_cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], @@ -793,9 +793,10 @@ def _update_model_meta_prefilling(self, context: StepContext): mrope_position_ids = [] new_model_metas = [] for pos_ids, model_meta, input_mm in zip(batched_pos_ids, model_metas, input_multimodals): - images = [] + mm_data_list = [] if input_mm is not None: - images = input_mm.get('image', []) + mm_data_list.extend(input_mm.get('mm_data', [])) + if model_meta is None or 'mrope_delta' not in model_meta: mrope_delta = 0 else: @@ -804,19 +805,63 @@ def _update_model_meta_prefilling(self, context: StepContext): pos_start = pos_ids[0].item() mrope_pos_ids = pos_ids + mrope_delta mrope_pos_ids = mrope_pos_ids[None].expand(3, -1).clone() - for img in images: - grid_thw = img.meta['grid_thw'][0].tolist() - _, h, w = grid_thw - h //= 2 - w //= 2 - num_pad = img.end - img.start - max(h, w) - mrope_delta -= num_pad - fill_start = img.start - pos_start - fill_end = img.end - pos_start - img_pos_ids = self._get_multimodal_pos_ids(grid_thw, pos_ids.device) - img_pos_ids += mrope_pos_ids[:, fill_start:fill_start + 1] - mrope_pos_ids[:, fill_end:] -= num_pad - mrope_pos_ids[:, fill_start:fill_end] = img_pos_ids + + for mm_data in mm_data_list: + if mm_data.modality == Modality.IMAGE: + grid_thw = mm_data.meta['grid_thw'][0].tolist() + _, h, w = grid_thw + h //= 2 + w //= 2 + num_pad = mm_data.end - mm_data.start - max(h, w) + mrope_delta -= num_pad + fill_start = mm_data.start - pos_start + fill_end = mm_data.end - pos_start + img_pos_ids = self._get_multimodal_pos_ids(grid_thw, pos_ids.device) + img_pos_ids += mrope_pos_ids[:, fill_start:fill_start + 1] + mrope_pos_ids[:, fill_end:] -= num_pad + mrope_pos_ids[:, fill_start:fill_end] = img_pos_ids + elif mm_data.modality == Modality.VIDEO: + video_token_id = self.config.video_token_id + grid_thw = mm_data.meta['grid_thw'] + + grid_thw = torch.repeat_interleave(grid_thw, 
grid_thw[:, 0], dim=0) + grid_thw[:, 0] = 1 + + position_ids_list = [] + input_tokens = context.input_ids.tolist()[0] + + st = 0 + # treat each frame separately as a single image + for video_idx in range(grid_thw.shape[0]): + # text before video. e.g. <0.3 seconds><|vision_start|> ... + ed_video = input_tokens.index(video_token_id, st) + ed = ed_video + text_len = ed - st + st_idx = position_ids_list[-1].max() + 1 if len(position_ids_list) > 0 else 0 + text_pos_ids = torch.arange(text_len, device=pos_ids.device).view(1, -1).expand(3, -1) + st_idx + position_ids_list.append(text_pos_ids) + + # video frame. ... <|video_end|> + t, h, w = ( + grid_thw[video_idx][0], + grid_thw[video_idx][1] // 2, + grid_thw[video_idx][2] // 2, + ) + video_pos_ids = self._get_multimodal_pos_ids(grid_thw[video_idx], pos_ids.device) + position_ids_list.append(video_pos_ids + text_len + st_idx) + + st = ed + t * h * w + + # text after video, <|vision_end|> ... + if st < len(input_tokens): + st_idx = position_ids_list[-1].max() + 1 if len(position_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + text_pos_ids = torch.arange(text_len, device=pos_ids.device).view(1, -1).expand(3, -1) + st_idx + position_ids_list.append(text_pos_ids) + + mrope_pos_ids = torch.cat(position_ids_list, dim=1).reshape(3, -1) + mrope_delta = mrope_pos_ids.max() + 1 - pos_ids.size(0) + mrope_pos_ids += pos_start # add back the original position offset mrope_position_ids.append(mrope_pos_ids) new_model_metas.append(dict(mrope_delta=mrope_delta)) @@ -828,7 +873,7 @@ def _update_model_meta_prefilling(self, context: StepContext): def update_model_metas(self, past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + inputs_embeds: torch.Tensor | None = None, context: StepContext = None): """Update model meta.""" if context.is_decoding: @@ -841,4 +886,68 @@ def get_input_processor(self) -> BaseModelInputProcessor: return self.input_processor -InputMultiModalType = List[Dict[str, 
Any]] +class Qwen3VLInputProcessor(BaseModelInputProcessor): + """Qwen3 input processor.""" + + def __init__(self, config: PretrainedConfig) -> None: + self.config = config + + def _make_image_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData: + """Make image MultiModalData.""" + pixel_values = input_mm['pixel_values'] + image_grid_thw = input_mm['image_grid_thw'] + offset = input_mm['offset'] + start = offset + image_token_id = input_mm['image_token_id'] + num_pad = input_mm['image_tokens'] + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalData(modality=Modality.IMAGE, + data=pixel_values, + start=start, + end=start + num_pad, + meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id)) + return mm_data + + def _make_video_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData: + """Make video MultiModalData.""" + pixel_values_videos = input_mm['pixel_values_videos'] + video_grid_thw = input_mm['video_grid_thw'] + offset = input_mm['offset'] + start = offset + video_token_id = input_mm['video_token_id'] + num_pad = input_mm['video_tokens'] + if isinstance(num_pad, torch.Tensor): + num_pad = num_pad.item() + + mm_data = MultiModalData(modality=Modality.VIDEO, + data=pixel_values_videos, + start=start, + end=start + num_pad, + meta=dict( + grid_thw=video_grid_thw, + video_token_id=video_token_id, + )) + return mm_data + + def preprocess_input(self, + input_ids: List[int], + input_multimodals: List[Dict[str, Any]] = None, + **kwargs) -> PreprocessInputResult: + """Prepare multimodal input.""" + if input_multimodals is None or len(input_multimodals) == 0: + return input_ids, input_multimodals + + input_mm_data = [] + for input_mm in input_multimodals: + modality = input_mm.get('modality') + if modality == Modality.IMAGE: + mm_data = self._make_image_mm_data(input_mm) + elif modality == Modality.VIDEO: + mm_data = self._make_video_mm_data(input_mm) + input_mm_data.append(mm_data) + + result = 
PreprocessInputResult(input_ids=input_ids, input_multimodals=dict(mm_data=input_mm_data)) + + return result diff --git a/lmdeploy/pytorch/models/qwen3_vl_moe.py b/lmdeploy/pytorch/models/qwen3_vl_moe.py index 5810ab9b11..0b453f0a39 100644 --- a/lmdeploy/pytorch/models/qwen3_vl_moe.py +++ b/lmdeploy/pytorch/models/qwen3_vl_moe.py @@ -44,14 +44,7 @@ def forward( visual_pos_masks: Optional[torch.Tensor] = None, deepstack_visual_embeds: Optional[list[torch.Tensor]] = None, ): - """visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, - *optional*): - - The mask of the visual positions. deepstack_visual_embeds (`list[torch.Tensor]`, *optional*): The deepstack - visual embeddings. The shape is (num_layers, visual_seqlen, embed_dim). The feature is extracted from the - different visual encoder layers, and fed to the decoder hidden states. It's from the paper DeepStack ( - https://arxiv.org/abs/2406.04) - """ + """Rewrite of LlamaModel.forward.""" # token embedding if inputs_embeds is None: diff --git a/lmdeploy/pytorch/models/utils/multimodal.py b/lmdeploy/pytorch/models/utils/multimodal.py deleted file mode 100644 index 699f88021f..0000000000 --- a/lmdeploy/pytorch/models/utils/multimodal.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Tuple - -from lmdeploy.pytorch.multimodal.data_type import MultiModalInputs - -PreparedInputs = Tuple[List[int], MultiModalInputs] - - -class MultiModalMixin: - - def prepare_multimodal_input(self, input_ids, input_multimodals, **kwargs) -> PreparedInputs: - """Prepare multimodals inputs.""" - raise NotImplementedError('prepare input not implemented.') diff --git a/lmdeploy/pytorch/multimodal/__init__.py b/lmdeploy/pytorch/multimodal/__init__.py index c3e8c6a16f..fc2d5890d9 100644 --- a/lmdeploy/pytorch/multimodal/__init__.py +++ b/lmdeploy/pytorch/multimodal/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from .data_type import MultiModalData, MultiModalTensor +from .data_type import MultiModalData -__all__ = ['MultiModalData', 'MultiModalTensor'] +__all__ = ['MultiModalData'] diff --git a/lmdeploy/pytorch/multimodal/data_type.py b/lmdeploy/pytorch/multimodal/data_type.py index dd3ec9a37d..4ff71fdcbc 100644 --- a/lmdeploy/pytorch/multimodal/data_type.py +++ b/lmdeploy/pytorch/multimodal/data_type.py @@ -4,24 +4,20 @@ from torch import Tensor - -class MultiModalData: - pass - - -MultiModalDataList = List[MultiModalData] +from lmdeploy.vl.constants import Modality NestedTensor = Union[Tensor, List[Tensor]] @dataclass -class MultiModalTensor: +class MultiModalData: data: NestedTensor start: int end: int = None - encoder_len: int = None meta: Dict[str, Any] = None + modality: Modality = Modality.IMAGE + def __post_init__(self): if self.end is None: self.end = self.start @@ -53,7 +49,7 @@ def to_device(self, device: str, non_blocking: bool = False): new_meta[k] = v out_dict['meta'] = new_meta - return MultiModalTensor(**out_dict) + return MultiModalData(**out_dict) -MultiModalInputs = Dict[str, List[MultiModalTensor]] +MultiModalInputs = Dict[str, List[MultiModalData]] diff --git a/lmdeploy/pytorch/multimodal/image_type.py b/lmdeploy/pytorch/multimodal/image_type.py deleted file mode 100644 index 19211a381f..0000000000 --- a/lmdeploy/pytorch/multimodal/image_type.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from dataclasses import dataclass -from typing import Any, ClassVar, Dict - -from PIL import Image - -from .data_type import MultiModalData - - -@dataclass -class ImageData(MultiModalData): - data: Image - loc: int - meta: Dict[str, Any] = None - type: ClassVar[str] = 'image' diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py index 2a68483ad4..17cdd90e7f 100644 --- a/lmdeploy/serve/core/async_engine.py +++ b/lmdeploy/serve/core/async_engine.py @@ -294,6 +294,7 @@ async def generate( input_ids: List | None = None, enable_thinking: bool | None = None, chat_template_kwargs: Dict | None = None, + media_io_kwargs: Dict[str, Any] | None = None, mm_processor_kwargs: Dict[str, Any] | None = None, **kwargs): """Generate responses. @@ -338,6 +339,7 @@ async def generate( tools=tools, reasoning_effort=reasoning_effort, chat_template_kwargs=chat_template_kwargs, + media_io_kwargs=media_io_kwargs, mm_processor_kwargs=mm_processor_kwargs, **kwargs) prompt = prompt_input['prompt'] diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 3e37caffe5..5d22aea8d3 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -482,6 +482,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque do_preprocess=do_preprocess, adapter_name=adapter_name, chat_template_kwargs=chat_template_kwargs or None, + media_io_kwargs=request.media_io_kwargs, mm_processor_kwargs=request.mm_processor_kwargs) def create_stream_response_json(index: int, @@ -973,6 +974,7 @@ async def generate(request: GenerateReqInput, raw_request: Request = None): sequence_start=True, sequence_end=True, do_preprocess=False, + media_io_kwargs=request.media_io_kwargs, mm_processor_kwargs=request.mm_processor_kwargs) def create_generate_response_json(res, text, output_ids, logprobs, finish_reason, routed_experts=None): diff --git a/lmdeploy/serve/openai/protocol.py 
b/lmdeploy/serve/openai/protocol.py index 9749ade086..3b8a33c362 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -152,11 +152,17 @@ class ChatCompletionRequest(BaseModel): enable_thinking: Optional[bool] = None # will be deprecated in the future return_token_ids: Optional[bool] = False include_stop_str_in_output: Optional[bool] = False + # kwargs for chat template renderer chat_template_kwargs: dict[str, Any] | None = Field( default=None, description=('Additional keyword args to pass to the template renderer. ' 'Will be accessible by the chat template.'), ) + # kwargs for media IO + media_io_kwargs: Optional[dict[str, Any]] = Field( + default=None, + description=('Additional kwargs to pass to the media IO processing, keyed by modality.'), + ) # kwargs for hf processor mm_processor_kwargs: Optional[dict[str, Any]] = Field( default=None, diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py index 2ea599e687..e05163124a 100644 --- a/lmdeploy/serve/processors/multimodal.py +++ b/lmdeploy/serve/processors/multimodal.py @@ -7,6 +7,11 @@ from lmdeploy.model import MODELS, BaseChatTemplate from lmdeploy.tokenizer import Tokenizer from lmdeploy.utils import get_logger +from lmdeploy.vl.constants import Modality +from lmdeploy.vl.media.connection import load_from_url +from lmdeploy.vl.media.image import ImageMediaIO +from lmdeploy.vl.media.time_series import TimeSeriesMediaIO +from lmdeploy.vl.media.video import VideoMediaIO logger = get_logger('lmdeploy') @@ -85,122 +90,66 @@ def merge_message_content(msg: Dict) -> Dict: return result @staticmethod - async def async_convert_multimodal_data(messages: List[Dict]) -> List[Dict]: - """Convert user-input multimodal data into GPT4V message format.""" - from lmdeploy.vl.time_series_utils import load_time_series - from lmdeploy.vl.utils import load_image + def _parse_multimodal_item(i: int, in_messages: List[Dict], out_messages: List[Dict], 
media_io_kwargs: Dict[str, + Any]): + """Synchronous helper to parse a single multimodal message item.""" + role = in_messages[i]['role'] + content = in_messages[i]['content'] + + if role != 'user' or isinstance(content, str): + out_messages[i] = in_messages[i] + return + + assert isinstance(content, list) + out_message = dict(role=role, content=[]) + + for item in content: + item_type = item.get('type') + if item_type == 'text': + out_message['content'].append(item) + continue + + item_params = item.get(item_type, {}).copy() + data_src = item_params.pop('url', None) or item_params.pop('data', None) + + if item_type == 'image_data': + modality = Modality.IMAGE + data = data_src + elif item_type == 'image_url': + modality = Modality.IMAGE + img_io = ImageMediaIO(**media_io_kwargs.get('image', {})) + data = load_from_url(data_src, img_io) + elif item_type == 'video_url': + modality = Modality.VIDEO + vid_io = VideoMediaIO(image_io=ImageMediaIO(), **media_io_kwargs.get('video', {})) + data, metadata = load_from_url(data_src, vid_io) + item_params['video_metadata'] = metadata + elif item_type == 'time_series_url': + modality = Modality.TIME_SERIES + ts_io = TimeSeriesMediaIO(**media_io_kwargs.get('time_series', {})) + data = load_from_url(data_src, ts_io) + else: + raise NotImplementedError(f'unknown type: {item_type}') + + out_message['content'].append({'type': modality, 'data': data, **item_params}) + + out_messages[i] = out_message - if isinstance(messages, Dict): + @staticmethod + async def async_parse_multimodal_item(messages: List[Dict], + media_io_kwargs: Dict[str, Any] | None = None) -> List[Dict]: + """Convert user-input multimodal data into GPT4V message format.""" + if isinstance(messages, dict): messages = [messages] - assert isinstance(messages, List) + assert isinstance(messages, list) out_messages = [None] * len(messages) - - def _inner_call(i, in_messages, out_messages): - role = in_messages[i]['role'] - content = in_messages[i]['content'] - assert role 
in ['system', 'user', 'assistant'], \ - f'unsupported role "{role}"' - if role != 'user' or isinstance(content, str): - # the content is a user's prompt or an assistant's prompt, - # returning it directly - out_messages[i] = in_messages[i] - return - # the role is a user and the content is a list, in which there - # might be image_url or image_data - assert isinstance(content, List) - message = dict(role=role, content=[]) - for item in content: - # image url or base64-encoded image data - if item['type'] == 'image_url': - """ - convert the following item: - { - 'type': 'image_url', - 'image_url': { - 'url': 'image url or base64-encoded image data', - 'key': 'value' # parameters used in image processing - ... - } - } - to: - { - 'type': 'image', - 'image': Pillow.Image, - 'key': 'value' # parameters used in image processing - ... - } - """ # noqa - data = item['image_url'].copy() - try: - url = data.pop('url') - image = load_image(url) - data.update(type='image', image=image) - message['content'].append(data) - except KeyError: - logger.error(f'invalid format {message}') - elif item['type'] == 'image_data': - """ - convert the following item: - { - 'type': 'image_data', - 'image_data': { - 'data': Pillow.Image, - 'key': 'value' # parameters used in image processing - ... - } - } - to: - { - 'type': 'image', - 'image': Pillow.Image, - 'key': 'value' # parameters used in image processing - ... - } - """ # noqa - data = item['image_data'].copy() - try: - image = data.pop('data') - data.update(type='image', image=image) - message['content'].append(data) - except KeyError: - logger.error(f'invalid format {message}') - elif item['type'] == 'time_series_url': - """ - convert the following item: - { - 'type': 'time_series_url', - 'time_series_url': { - 'url': 'time series url or base64-encoded time series data', - 'key': 'value' # parameters used in time series processing - ... 
- } - } - to: - { - 'type': 'time_series', - 'time_series': np.ndarray, - 'key': 'value' # parameters used in time series processing - ... - } - """ # noqa - data = item['time_series_url'].copy() - try: - url = data.pop('url') - time_series = load_time_series(url) - data.update(type='time_series', time_series=time_series) - message['content'].append(data) - except KeyError: - logger.error(f'invalid format {message}') - elif item['type'] in ['text']: - message['content'].append(item) - else: - logger.error(f'unexpected content type {message}') - out_messages[i] = message + media_io_kwargs = media_io_kwargs or {} + loop = asyncio.get_event_loop() await asyncio.gather(*[ - asyncio.get_event_loop().run_in_executor(None, _inner_call, i, messages, out_messages) - for i in range(len(messages)) + loop.run_in_executor(None, MultimodalProcessor._parse_multimodal_item, i, messages, out_messages, + media_io_kwargs) for i in range(len(messages)) ]) return out_messages @@ -212,6 +161,7 @@ async def get_prompt_input(self, tools: List[object] | None = None, reasoning_effort: Literal['low', 'medium', 'high'] | None = None, chat_template_kwargs: Dict | None = None, + media_io_kwargs: Dict[str, Any] | None = None, mm_processor_kwargs: Dict[str, Any] | None = None, **kwargs): """Process prompt and return prompt string and input_ids. @@ -227,6 +177,7 @@ async def get_prompt_input(self, tools: Optional list of tools. reasoning_effort: Optional reasoning effort level. chat_template_kwargs: Optional kwargs for chat template. + media_io_kwargs: Optional kwargs for media IO operations. mm_processor_kwargs: Optional kwargs for multimodal processor. **kwargs: Additional keyword arguments. 
@@ -268,6 +219,7 @@ async def get_prompt_input(self, adapter_name=adapter_name, tools=tools, chat_template_kwargs=chat_template_kwargs, + media_io_kwargs=media_io_kwargs, mm_processor_kwargs=mm_processor_kwargs, **kwargs) else: @@ -320,7 +272,7 @@ def _is_image_list(obj) -> bool: @staticmethod def _re_format_prompt_images_pair(prompt: Tuple) -> Dict: """Reformat the prompt to openai message format.""" - from lmdeploy.vl.utils import load_image + from lmdeploy.vl import load_image messages = {'role': 'user', 'content': []} prompt, images = prompt @@ -352,7 +304,7 @@ def _re_format_prompt_images_pair(prompt: Tuple) -> Dict: def _has_multimodal_input(self, messages: List[Dict]) -> bool: """Check if messages contain multimodal input (images).""" - multimodal_types = ['image_url', 'image_data', 'time_series_url'] + multimodal_types = ['image_url', 'image_data', 'video_url', 'time_series_url'] return any( isinstance(message.get('content'), list) and any( item.get('type') in multimodal_types for item in message['content']) for message in messages) @@ -397,12 +349,13 @@ async def _get_multimodal_prompt_input(self, adapter_name: str, tools: List[object] | None = None, chat_template_kwargs: Dict | None = None, + media_io_kwargs: Dict[str, Any] | None = None, mm_processor_kwargs: Dict[str, Any] | None = None, **kwargs): """Process multimodal prompt and return processed data for inference engines.""" chat_template = self.chat_template if do_preprocess else BaseChatTemplate() - messages = await self.async_convert_multimodal_data(messages) + messages = await self.async_parse_multimodal_item(messages, media_io_kwargs) results = await self.vl_encoder.preprocess(messages, mm_processor_kwargs) if self.backend == 'turbomind': @@ -412,19 +365,19 @@ async def _get_multimodal_prompt_input(self, # embedding_ranges and so on. 
All the returned values are passed # to tm engine for token generation results = await self.vl_encoder.async_infer(results) - results = await self.vl_encoder.wrap_for_turbomind(results, - chat_template, - self.tokenizer, - sequence_start, + results = await self.vl_encoder.wrap_for_turbomind(messages=results, + chat_template=chat_template, + tokenizer=self.tokenizer, + sequence_start=sequence_start, tools=tools, chat_template_kwargs=chat_template_kwargs) elif self.backend == 'pytorch': # for pt engine, this module only conduct the image preprocessing # It leaves the vision embedding to the pt engine - results = await self.vl_encoder.wrap_for_pytorch(results, - chat_template, - self.tokenizer, - sequence_start, + results = await self.vl_encoder.wrap_for_pytorch(messages=results, + chat_template=chat_template, + tokenizer=self.tokenizer, + sequence_start=sequence_start, tools=tools, chat_template_kwargs=chat_template_kwargs) return results diff --git a/lmdeploy/vl/__init__.py b/lmdeploy/vl/__init__.py index 7e800b2db2..5c8c69d8b5 100644 --- a/lmdeploy/vl/__init__.py +++ b/lmdeploy/vl/__init__.py @@ -1,5 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .time_series_utils import load_time_series -from .utils import load_image +from .utils import (encode_image_base64, encode_time_series_base64, encode_video_base64, load_image, load_time_series, + load_video) -__all__ = ['load_image', 'load_time_series'] +__all__ = [ + 'load_image', + 'load_video', + 'load_time_series', + 'encode_image_base64', + 'encode_video_base64', + 'encode_time_series_base64', +] diff --git a/lmdeploy/vl/constants.py b/lmdeploy/vl/constants.py index eeaa697cdf..e0cd744f15 100644 --- a/lmdeploy/vl/constants.py +++ b/lmdeploy/vl/constants.py @@ -1,3 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-IMAGE_DUMMY_TOKEN_INDEX = 0
+from enum import Enum
+
 IMAGE_TOKEN = '<IMAGE_TOKEN>'
+
+
+class Modality(str, Enum):
+    IMAGE = 'image'
+    VIDEO = 'video'
+    AUDIO = 'audio'
+    TIME_SERIES = 'time_series'
diff --git a/lmdeploy/vl/media/__init__.py b/lmdeploy/vl/media/__init__.py
new file mode 100644
index 0000000000..ef101fec61
--- /dev/null
+++ b/lmdeploy/vl/media/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/lmdeploy/vl/media/base.py b/lmdeploy/vl/media/base.py
new file mode 100644
index 0000000000..2ba58897a9
--- /dev/null
+++ b/lmdeploy/vl/media/base.py
@@ -0,0 +1,23 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/media/base.py
+
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Generic, TypeVar
+
+_T = TypeVar('_T')
+
+
+class MediaIO(ABC, Generic[_T]):
+
+    @abstractmethod
+    def load_bytes(self, data: bytes) -> _T:
+        raise NotImplementedError
+
+    @abstractmethod
+    def load_base64(self, media_type: str, data: str) -> _T:
+        raise NotImplementedError
+
+    @abstractmethod
+    def load_file(self, filepath: Path) -> _T:
+        raise NotImplementedError
diff --git a/lmdeploy/vl/media/connection.py b/lmdeploy/vl/media/connection.py
new file mode 100644
index 0000000000..7389d67d8f
--- /dev/null
+++ b/lmdeploy/vl/media/connection.py
@@ -0,0 +1,70 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os +from pathlib import Path +from typing import TypeVar +from urllib.parse import ParseResult, urlparse +from urllib.request import url2pathname + +import requests + +from .base import MediaIO + +_M = TypeVar('_M') + +FETCH_TIMEOUT = int(os.environ.get('LMDEPLOY_FETCH_TIMEOUT', 10)) +headers = { + 'User-Agent': + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' +} + + +def _load_http_url(url_spec: ParseResult, media_io: MediaIO[_M]) -> _M: + if url_spec.scheme not in ('http', 'https'): + raise ValueError(f'Unsupported URL scheme: {url_spec.scheme}') + + client = requests.Session() + # TODO: zhouxinyu, timeout for video should be longer + response = client.get(url_spec.geturl(), headers=headers, timeout=FETCH_TIMEOUT) + response.raise_for_status() + + return media_io.load_bytes(response.content) + + +def _load_data_url(url_spec: ParseResult, media_io: MediaIO[_M]) -> _M: + url_spec_path = url_spec.path or '' + data_spec, data = url_spec_path.split(',', 1) + media_type, data_type = data_spec.split(';', 1) + # media_type starts with a leading "/" (e.g., "/video/jpeg") + media_type = media_type.lstrip('/') + + if data_type != 'base64': + msg = 'Only base64 data URLs are supported for now.' 
+ raise NotImplementedError(msg) + + return media_io.load_base64(media_type, data) + + +def _load_file_url(url_spec: ParseResult, media_io: MediaIO[_M]) -> _M: + url_spec_path = url_spec.path or '' + url_spec_netloc = url_spec.netloc or '' + filepath = Path(url2pathname(url_spec_netloc + url_spec_path)) + return media_io.load_file(filepath) + + +def load_from_url(url: str, media_io: MediaIO[_M]) -> _M: + """Load media from a HTTP, data or file url.""" + url_spec = urlparse(url) + + if url_spec.scheme and url_spec.scheme.startswith('http'): + return _load_http_url(url_spec, media_io) + + if url_spec.scheme == 'data': + return _load_data_url(url_spec, media_io) + + # file url or raw file path (absolute or relative) + if url_spec.scheme == 'file' or os.path.exists(url) or os.path.exists(url_spec.path): + return _load_file_url(url_spec, media_io) + + msg = 'The URL must be either a HTTP, data or file URL.' + raise ValueError(msg) diff --git a/lmdeploy/vl/media/image.py b/lmdeploy/vl/media/image.py new file mode 100644 index 0000000000..7a6c12a086 --- /dev/null +++ b/lmdeploy/vl/media/image.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/media/image.py + +from io import BytesIO +from pathlib import Path + +import pybase64 +from PIL import Image, ImageFile + +from .base import MediaIO + +ImageFile.LOAD_TRUNCATED_IMAGES = True + + +class ImageMediaIO(MediaIO[Image.Image]): + + def __init__(self, image_mode: str = 'RGB', **kwargs) -> None: + super().__init__() + self.image_mode = image_mode + + # for potential custom arguments from --media-io-kwargs + self.kwargs = kwargs + + def load_bytes(self, data: bytes) -> Image.Image: + image = Image.open(BytesIO(data)) + return image.convert(self.image_mode) + + def load_base64(self, media_type: str, data: str) -> Image.Image: + return self.load_bytes(pybase64.b64decode(data)) + + def load_file(self, file_path: Path) -> Image.Image: + with open(file_path, 'rb') as f: + data = f.read() + image = Image.open(BytesIO(data)) + return image.convert(self.image_mode) + + def encode_base64(self, image: Image.Image, image_format: str = 'PNG') -> str: + with BytesIO() as buffer: + image = image.convert(self.image_mode) + image.save(buffer, image_format) + data = buffer.getvalue() + + return pybase64.b64encode(data).decode('utf-8') diff --git a/lmdeploy/vl/media/time_series.py b/lmdeploy/vl/media/time_series.py new file mode 100644 index 0000000000..2e22ad278d --- /dev/null +++ b/lmdeploy/vl/media/time_series.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from io import BytesIO +from pathlib import Path + +import numpy as np +import numpy.typing as npt +import pybase64 + +from lmdeploy.utils import get_logger + +from .base import MediaIO + +logger = get_logger('lmdeploy') + + +class TimeSeriesMediaIO(MediaIO[npt.NDArray]): + + def __init__(self, **kwargs): + super().__init__() + + # for potential custom arguments from --media-io-kwargs + self.kwargs = kwargs + + def load_bytes(self, data: bytes) -> npt.NDArray: + ts_array = np.load(BytesIO(data), allow_pickle=False) + return ts_array + + def load_base64(self, media_type: str, data: str) -> npt.NDArray: + return self.load_bytes(pybase64.b64decode(data)) + + def load_file(self, filepath: Path) -> npt.NDArray: + suffix = filepath.suffix.lower() + + if suffix == '.npy': + return np.load(filepath, allow_pickle=False) + elif suffix == '.csv': + try: + ts_array = np.genfromtxt(filepath, delimiter=',', dtype=np.float32) + if ts_array.size == 0: + raise ValueError(f'CSV file {filepath} yielded no data.') + return ts_array + except Exception as e: + logger.error(f'Failed to load CSV {filepath}: {e}') + raise + elif suffix in ['.wav', '.mp3', '.flac']: + try: + import soundfile as sf + except ImportError: + raise ImportError('Please install soundfile via `pip install soundfile`.') + + ts_array, _ = sf.read(filepath) + return ts_array + + raise ValueError(f'Unsupported file format: {suffix}') + + def encode_base64(self, data: npt.NDArray) -> str: + """Encode numpy array to base64 string using NPY format.""" + buffer = BytesIO() + np.save(buffer, data, allow_pickle=False) + return pybase64.b64encode(buffer.getvalue()).decode('utf-8') diff --git a/lmdeploy/vl/media/video.py b/lmdeploy/vl/media/video.py new file mode 100644 index 0000000000..5fe0ff6477 --- /dev/null +++ b/lmdeploy/vl/media/video.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/media/video.py + +import base64 +from functools import partial +from pathlib import Path +from typing import Any + +import numpy as np +import numpy.typing as npt +from PIL import Image + +from lmdeploy.utils import get_logger + +from .base import MediaIO +from .image import ImageMediaIO +from .video_loader import (DecordVideoLoader, OpenCVVideoLoader, TorchCodecVideoLoader, TorchVisionVideoLoader, + VideoLoader) + +logger = get_logger('lmdeploy') + + +class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]): + + def __init__( + self, + image_io: ImageMediaIO, + num_frames: int = 32, + **kwargs, + ) -> None: + super().__init__() + + self.image_io = image_io + self.num_frames = num_frames + + # for potential custom arguments from --media-io-kwargs + self.kwargs = kwargs + self.video_loader = self._get_video_loader_backend() + + def _get_video_loader_backend(self) -> VideoLoader: + """Determines the best available video loader backend.""" + # vLLM: OpenCV + # SGLang: Decord + # qwen-vl-utils: TorchCodec -> Decord -> TorchVision (deprecated soon) + backends = [ + ('cv2', OpenCVVideoLoader), + ('decord', DecordVideoLoader), + ('torchcodec', TorchCodecVideoLoader), + ('torchvision', TorchVisionVideoLoader), + ] + + for module_name, loader_cls in backends: + try: + __import__(module_name) + return loader_cls() + except (ImportError, RuntimeError): + logger.warning(f"Video backend '{module_name}' not found. Trying next backend...") + continue + + raise ImportError( + 'No video backend found. 
Install either opencv-python-headless, decord, torchcodec, or torchvision.')
+
+    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]:
+        return self.video_loader.load_bytes(data, num_frames=self.num_frames, **self.kwargs)
+
+    def load_base64(self, media_type: str, data: str) -> tuple[npt.NDArray, dict[str, Any]]:
+        if media_type.lower() == 'video/jpeg':
+            load_frame = partial(
+                self.image_io.load_base64,
+                'image/jpeg',
+            )
+
+            # NOTE: known issue in https://github.com/QwenLM/Qwen3-VL/issues/1643
+            # when passing a video as a sequence of JPEG frames, we cannot obtain the video metadata
+            # therefore we construct a default metadata dictionary with common values.
+            frames = np.stack([np.asarray(load_frame(frame_data)) for frame_data in data.split(',')])
+
+            total_frames_num = int(frames.shape[0])
+            fps = float(self.kwargs.get('fps', 2))  # default to 2 fps if not specified
+            duration = (total_frames_num / fps) if fps > 0 else 0
+            frame_idx = list(range(total_frames_num))
+
+            metadata = {
+                'total_num_frames': total_frames_num,
+                'fps': fps,
+                'duration': duration,
+                'video_backend': 'jpeg_sequence',
+                'frames_indices': frame_idx,
+            }
+
+            logger.info('Loading video from base64-encoded JPEG frames misses video metadata. '
+                        f'Fall back to default metadata values:\n{metadata}')
+            return frames, metadata
+
+        return self.load_bytes(base64.b64decode(data))
+
+    def load_file(self, filepath: Path) -> tuple[npt.NDArray, dict[str, Any]]:
+        return self.video_loader.load_file(filepath, num_frames=self.num_frames, **self.kwargs)
+
+    def encode_base64(
+        self,
+        media: npt.NDArray,
+        *,
+        video_format: str = 'JPEG',
+    ) -> str:
+        video = media
+
+        if video_format == 'JPEG':
+            encode_frame = partial(
+                self.image_io.encode_base64,
+                image_format=video_format,
+            )
+
+            return ','.join(encode_frame(Image.fromarray(frame)) for frame in video)
+
+        msg = 'Only JPEG format is supported for now.'
+ raise NotImplementedError(msg) diff --git a/lmdeploy/vl/media/video_loader.py b/lmdeploy/vl/media/video_loader.py new file mode 100644 index 0000000000..0cf921f3e5 --- /dev/null +++ b/lmdeploy/vl/media/video_loader.py @@ -0,0 +1,342 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/video.py +# adapted from https://github.com/QwenLM/Qwen3-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py + +import math +import os +import tempfile +from abc import abstractmethod +from io import BytesIO +from pathlib import Path +from typing import Any + +import numpy as np +import numpy.typing as npt + +from lmdeploy.utils import get_logger + +logger = get_logger('lmdeploy') + + +class VideoLoader: + + @classmethod + @abstractmethod + def load_bytes(self, data: bytes, num_frames: int = -1, **kwargs) -> tuple[npt.NDArray, dict[str, Any]]: + raise NotImplementedError + + @classmethod + def smart_nframes(self, total_frames_num: int, num_frames: int, fps: int, duration: int) -> tuple[int, list[int]]: + # resample video to target num_frames and fps + # - the minimum of the two will be used + num_frames_to_sample = total_frames_num + if num_frames > 0: + num_frames_to_sample = min(num_frames, total_frames_num) + if fps > 0: + num_frames_to_sample = min(num_frames_to_sample, math.floor(duration * fps)) + num_frames_to_sample = max(1, num_frames_to_sample) # at least one sample + + if num_frames_to_sample == total_frames_num: + frame_idx = list(range(0, num_frames_to_sample)) + else: + uniform_sampled_frames = np.linspace(0, total_frames_num - 1, num_frames_to_sample, dtype=int) + frame_idx = uniform_sampled_frames.tolist() + return num_frames_to_sample, frame_idx + + +class OpenCVVideoLoader(VideoLoader): + + def get_cv2_video_api(self): + import cv2.videoio_registry as vr + + api_pref = None + for backend in vr.getStreamBufferedBackends(): + if not vr.hasBackend(backend): + continue + if not 
vr.isBackendBuiltIn(backend): + _, abi, api = vr.getStreamBufferedBackendPluginVersion(backend) + if abi < 1 or (abi == 1 and api < 2): + continue + api_pref = backend + break + return api_pref + + @staticmethod + def _read_frames( + cap, + frame_indices: set[int], + num_expected_frames: int, + max_frame_idx: int, + ) -> tuple[npt.NDArray, int, list[int]]: + import cv2 + + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + frames = np.empty((num_expected_frames, height, width, 3), dtype=np.uint8) # THWC + + i = 0 + valid_frame_indices = [] + for idx in range(max_frame_idx + 1): + ok = cap.grab() + if not ok: + # Frame is broken/unreadable, log warning + if idx in frame_indices: + logger.warning( + 'Failed to grab frame %d during video loading. ' + 'This frame will be skipped.', + idx, + ) + continue + if idx in frame_indices: + ret, frame = cap.retrieve() + if ret: + frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + valid_frame_indices.append(idx) + i += 1 + else: + # retrieve() failed even though grab() succeeded + logger.warning( + 'Failed to retrieve frame %d during video loading. ' + 'This frame will be skipped.', + idx, + ) + + valid_num_frames = len(valid_frame_indices) + if valid_num_frames < num_expected_frames: + logger.warning( + 'Video loading completed with %d broken/unreadable frames. 
' + 'Expected %d frames but only loaded %d frames.', + num_expected_frames - valid_num_frames, + num_expected_frames, + valid_num_frames, + ) + + return frames[:valid_num_frames], valid_num_frames, valid_frame_indices + + @classmethod + def load_file( + self, + filepath: Path, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + **kwargs, + ) -> tuple[npt.NDArray, dict[str, Any]]: + with open(filepath, 'rb') as f: + data = f.read() + return self.load_bytes(data, num_frames=num_frames, fps=fps, max_duration=max_duration, **kwargs) + + @classmethod + def load_bytes( + cls, + data: bytes, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + **kwargs, + ) -> tuple[npt.NDArray, dict[str, Any]]: + """Load video frames from bytes. + + Args: + data: Raw video bytes + num_frames: Target number of frames to sample (-1 for all) + fps: Target FPS for sampling (-1 for original) + max_duration: Maximum duration (unused in base backend) + + Returns: + Tuple of (frames_array, metadata_dict) + """ + import cv2 + + backend = cls().get_cv2_video_api() + cap = cv2.VideoCapture(BytesIO(data), backend, []) + if not cap.isOpened(): + raise ValueError('Could not open video stream') + + total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + original_fps = cap.get(cv2.CAP_PROP_FPS) + duration = total_frames_num / original_fps if original_fps > 0 else 0 + + num_frames_to_sample, frame_idx = cls.smart_nframes(total_frames_num, num_frames, fps, duration) + + frame_idx_set = set(frame_idx) + frames, valid_num_frames, valid_frame_indices = cls._read_frames(cap, frame_idx_set, num_frames_to_sample, + max(frame_idx)) + + # Use transformers transformers.video_utils.VideoMetadata format + # For models like Qwen3-VL/GLM4.5V, this metadata + # can cause incorrect timestamp calculation without num_frames=-1. 
+ # TODO: zhouxinyu, support per-request do_sample_frames + metadata = { + 'total_num_frames': total_frames_num, + 'fps': original_fps, + 'duration': duration, + 'video_backend': 'opencv', + 'frames_indices': valid_frame_indices, + # extra field used to control hf processor's video + # sampling behavior + # "do_sample_frames": valid_num_frames == total_frames_num, + } + return frames, metadata + + +class DecordVideoLoader(VideoLoader): + + @classmethod + def load_file(self, + filepath: Path, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + **kwargs) -> tuple[npt.NDArray, dict[str, Any]]: + import decord + vr = decord.VideoReader(str(filepath)) + total_frames_num = len(vr) + original_fps = vr.get_avg_fps() + duration = total_frames_num / original_fps if original_fps > 0 else 0 + + num_frames_to_sample, frame_idx = self.smart_nframes(total_frames_num, num_frames, fps, duration) + + video = vr.get_batch(frame_idx).asnumpy() # THWC + metadata = { + 'total_num_frames': total_frames_num, + 'fps': original_fps, + 'duration': duration, + 'video_backend': 'decord', + 'frames_indices': frame_idx, + } + return video, metadata + + @classmethod + def load_bytes(self, + data: bytes, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + **kwargs) -> tuple[npt.NDArray, dict[str, Any]]: + tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') + try: + tmp_file.write(data) + tmp_file.close() + return self.load_file(Path(tmp_file.name), + num_frames=num_frames, + fps=fps, + max_duration=max_duration, + **kwargs) + finally: + # always cleanup, even if load_file crashes + try: + os.unlink(tmp_file.name) + except OSError: + pass # file might not exist if write failed + + +class TorchCodecVideoLoader(VideoLoader): + + @classmethod + def load_file(self, + filepath: Path, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + **kwargs) -> tuple[npt.NDArray, dict[str, Any]]: + # torchcodec requires matched ffmpeg, torchcodec, 
and torch versions + # ffmpeg 5.1.2, torch 2.8.0, torchcodec 0.7.0 are verified to work together + from torchcodec.decoders import VideoDecoder + + torch_codec_num_threads = 8 + decoder = VideoDecoder(str(filepath), num_ffmpeg_threads=torch_codec_num_threads) + total_frames_num = decoder.metadata.num_frames + original_fps = decoder.metadata.average_fps + duration = total_frames_num / original_fps if original_fps > 0 else 0 + + num_frames_to_sample, frame_idx = self.smart_nframes(total_frames_num, num_frames, fps, duration) + + video = decoder.get_frames_at(frame_idx).data + metadata = { + 'total_num_frames': total_frames_num, + 'fps': original_fps, + 'duration': duration, + 'video_backend': 'torchcodec', + 'frames_indices': frame_idx, + } + return video, metadata + + @classmethod + def load_bytes(self, + data: bytes, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + **kwargs) -> tuple[npt.NDArray, dict[str, Any]]: + tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') + try: + tmp_file.write(data) + tmp_file.close() + return self.load_file(Path(tmp_file.name), + num_frames=num_frames, + fps=fps, + max_duration=max_duration, + **kwargs) + finally: + # always cleanup, even if load_file crashes + try: + os.unlink(tmp_file.name) + except OSError: + pass # file might not exist if write failed + + +class TorchVisionVideoLoader(VideoLoader): + + @classmethod + def load_file(self, + filepath: Path, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + **kwargs) -> tuple[npt.NDArray, dict[str, Any]]: + import torchvision + + video, audio, info = torchvision.io.read_video( + filepath, + pts_unit='sec', + output_format='THWC', + ) + total_frames_num = video.size(0) + original_fps = info['video_fps'] + duration = total_frames_num / original_fps if original_fps > 0 else 0 + + num_frames_to_sample, frame_idx = self.smart_nframes(total_frames_num, num_frames, fps, duration) + + video = video[frame_idx] + metadata = { + 
'total_num_frames': total_frames_num, + 'fps': original_fps, + 'duration': duration, + 'video_backend': 'torchvision', + 'frames_indices': frame_idx, + } + return video, metadata + + @classmethod + def load_bytes(self, + data: bytes, + num_frames: int = -1, + fps: int = -1, + max_duration: int = 300, + **kwargs) -> tuple[npt.NDArray, dict[str, Any]]: + tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') + try: + tmp_file.write(data) + tmp_file.close() + return self.load_file(Path(tmp_file.name), + num_frames=num_frames, + fps=fps, + max_duration=max_duration, + **kwargs) + finally: + # always cleanup, even if load_file crashes + try: + os.unlink(tmp_file.name) + except OSError: + pass # file might not exist if write failed diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py index 521cbc7985..f282e44e67 100644 --- a/lmdeploy/vl/model/base.py +++ b/lmdeploy/vl/model/base.py @@ -165,45 +165,34 @@ def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, chat_ raise NotImplementedError() @staticmethod - def collect_images(messages): - """Gather all images along with their respective parameters from the - messages and compile them into a single list. Each image is converted - to RGB color space. + def collect_multimodal_items(messages): + """Gather all multimodal items along with their respective parameters + from the messages and compile them into a single list. 
Args: - messages (List[Tuple[Image, Dict]]): a list of images with their - corresponding parameters - """ # noqa - images = [] + messages (List[Dict]): a list of message + Returns: + List[Tuple[Modality, Any, Dict]]: a list of (modality, data, params) for each multimodal item + """ + multimodal_items = [] for message in messages: content = message['content'] - if not isinstance(content, List): + if not isinstance(content, list): continue - images.extend([(x['image'], { - k: v - for k, v in x.items() if k not in {'type', 'image'} - }) for x in content if x['type'] == 'image']) - return images - @staticmethod - def collect_time_series(messages): - """Gather all time series data along with their respective parameters - from the messages and compile them into a single list. + for x in content: + if not isinstance(x, dict): + continue - Args: - messages (List[Tuple[np.ndarray, Dict]]): a list of time - series data with their corresponding parameters - """ # noqa - time_series = [] - for message in messages: - content = message['content'] - if not isinstance(content, List): - continue - time_series.extend([(x['time_series'], { - k: v - for k, v in x.items() if k not in {'type', 'time_series'} - }) for x in content if x['type'] == 'time_series']) - return time_series + modality = x.get('type') + if modality is None or modality == 'text': + continue + + data = x.get('data') + params = {k: v for k, v in x.items() if k not in ['type', 'data']} + multimodal_items.append((modality, data, params)) + + return multimodal_items @staticmethod def IMAGE_TOKEN_included(messages): diff --git a/lmdeploy/vl/model/cogvlm.py b/lmdeploy/vl/model/cogvlm.py index ca5e41a96f..8b7da1b57e 100644 --- a/lmdeploy/vl/model/cogvlm.py +++ b/lmdeploy/vl/model/cogvlm.py @@ -41,9 +41,9 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refer to the spec of `super().preprocess`""" - images = self.collect_images(messages) + images = 
self.collect_multimodal_items(messages) outputs = [] - for image, _ in images: + for modality, image, _ in images: image = image.convert('RGB') pixel_values = self.image_transform(image) outputs.append( diff --git a/lmdeploy/vl/model/deepseek.py b/lmdeploy/vl/model/deepseek.py index 0a1f6c12e9..91c2d2ef45 100644 --- a/lmdeploy/vl/model/deepseek.py +++ b/lmdeploy/vl/model/deepseek.py @@ -88,9 +88,9 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refers to the spec of `super.preprocess()""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, _ in images: + for modality, image, _ in images: image = image.convert('RGB') pixel_values = self.image_processor([image], return_tensors='pt').pixel_values outputs.append( diff --git a/lmdeploy/vl/model/deepseek_vl2.py b/lmdeploy/vl/model/deepseek_vl2.py index a2e4b034ca..823e162d07 100644 --- a/lmdeploy/vl/model/deepseek_vl2.py +++ b/lmdeploy/vl/model/deepseek_vl2.py @@ -69,7 +69,7 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refers to the spec of `super.preprocess()""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) # convert to upstream api formats images = [img_parameter[0] for img_parameter in images] diff --git a/lmdeploy/vl/model/gemma3_vl.py b/lmdeploy/vl/model/gemma3_vl.py index c2879a6b83..4d246c752a 100644 --- a/lmdeploy/vl/model/gemma3_vl.py +++ b/lmdeploy/vl/model/gemma3_vl.py @@ -72,8 +72,8 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: 'add_special_tokens': False }, ) - images = self.collect_images(messages) - images = [image.convert('RGB') for image, _ in images] + images = self.collect_multimodal_items(messages) + images = [image.convert('RGB') for modality, image, _ in images] num_image = len(images) images = make_nested_list_of_images(images) image_inputs = self.processor.image_processor(images, 
**output_kwargs['images_kwargs']) diff --git a/lmdeploy/vl/model/glm4_1v.py b/lmdeploy/vl/model/glm4_1v.py index 3b4b2ab937..a5bb0ff81f 100644 --- a/lmdeploy/vl/model/glm4_1v.py +++ b/lmdeploy/vl/model/glm4_1v.py @@ -35,10 +35,10 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refer to `super().preprocess()` for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) optional_keys = {'resized_height', 'resized_width', 'min_pixels', 'max_pixels'} outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') item = dict(type='image', image=image) diff --git a/lmdeploy/vl/model/interns1_pro.py b/lmdeploy/vl/model/interns1_pro.py index e11efbb32a..ce9f6a481f 100644 --- a/lmdeploy/vl/model/interns1_pro.py +++ b/lmdeploy/vl/model/interns1_pro.py @@ -6,6 +6,7 @@ from transformers import AutoProcessor from lmdeploy.utils import get_logger +from lmdeploy.vl.constants import Modality from lmdeploy.vl.model.base import VISION_MODELS, VisionModel logger = get_logger('lmdeploy') @@ -31,15 +32,23 @@ class InternS1ProVisionModel(VisionModel): def build_preprocessor(self): check_transformers() self.processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=True) - tokenizer = self.processor.tokenizer + + # image tokens self.image_token = self.processor.image_token - self.image_token_id = tokenizer.encode(self.image_token)[-1] - self.mm_processor_kwargs = None + self.image_token_id = self.processor.image_token_id + + # video tokens + self.video_token = self.processor.video_token + self.video_token_id = self.processor.video_token_id - # Time Series tokens + # time series tokens self.ts_token = getattr(self.processor, 'ts_token', None) self.ts_token_id = getattr(self.processor, 'ts_token_id', None) + # vision start and end tokens + self.vision_start_token = self.processor.vision_start_token + self.vision_end_token = 
self.processor.vision_end_token + def get_processor_args(self, mm_processor_kwargs: Optional[Dict[str, Any]] = None): min_pixels = self.processor.image_processor.size['shortest_edge'] max_pixels = self.processor.image_processor.size['longest_edge'] @@ -82,7 +91,64 @@ def check_time_series_input(self, messages): for message in messages) self.has_time_series_input = has_time_series_input - def time_series_processor(self, ts_input, sr): + def _preprocess_image(self, + data: List[Any], + params: Dict[str, Any], + mm_processor_kwargs: Dict[str, Any] | None = None) -> List[Dict]: + + image = data.convert('RGB') + min_pixels, max_pixels = self.get_processor_args(mm_processor_kwargs) + + result = self.processor.image_processor(images=image, + size={ + 'shortest_edge': min_pixels, + 'longest_edge': max_pixels + }, + return_tensors='pt') + merge_length = self.processor.image_processor.merge_size**2 + image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length + result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id)) + return result + + def _preprocess_video(self, + data: List[Any], + params: Dict[str, Any], + mm_processor_kwargs: Dict[str, Any] | None = None) -> List[Dict]: + + # TODO: zhouxinyu, apply transformers smart_resize using per-request kwargs + metadata = params['video_metadata'] + video_kwargs = dict(return_metadata=True, + do_resize=True, + do_sample_frames=False, + video_metadata=metadata, + return_tensors='pt') + result = self.processor.video_processor(videos=data, **video_kwargs) + video_grid_thw = result['video_grid_thw'] + + merge_length = self.processor.video_processor.merge_size**2 + if metadata.get('fps') is None: + logger.warning_once('Qwen3VL: fps not found, defaulting to 24.') + metadata['fps'] = metadata['fps'] or 24 + + # if timestamps are not provided, calculate them + curr_timestamp = self.processor._calculate_timestamps( + metadata['frames_indices'], + metadata['fps'], + 
self.processor.video_processor.merge_size, + ) + + frame_seqlen = video_grid_thw[0][1:].prod() // merge_length + result.update(curr_timestamp=curr_timestamp, frame_seqlen=frame_seqlen, video_token_id=self.video_token_id) + return result + + def _preprocess_time_series(self, + data: List[Any], + params: Dict[str, Any], + mm_processor_kwargs: Dict[str, Any] | None = None) -> List[Dict]: + + ts_input = data + sr = params.get('sampling_rate') if params is not None else None + if not isinstance(ts_input, np.ndarray): ts_input = np.array(ts_input, dtype=np.float32) @@ -108,43 +174,34 @@ def time_series_processor(self, ts_input, sr): stride = np.floor(160 / ((1 + np.exp(-sr / 100))**6)) patch_size = stride * 2 embed_length = (np.ceil((ts_len - patch_size) / stride) + 1) - num_ts_tokens = int((embed_length // 2 + 1) // 2) + ts_tokens = int((embed_length // 2 + 1) // 2) - return dict(ts_values=[ts_input], ts_sr=[sr], ts_lens=[ts_len], num_ts_tokens=[num_ts_tokens]) + return dict(ts_values=[ts_input], + ts_sr=[sr], + ts_lens=[ts_len], + ts_tokens=[ts_tokens], + ts_token_id=self.ts_token_id) - def preprocess(self, messages: List[Dict], mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict]: + def preprocess(self, messages: List[Dict], mm_processor_kwargs: Dict[str, Any] | None = None) -> List[Dict]: """Refer to `super().preprocess()` for spec.""" - - self.check_time_series_input(messages) - - if self.has_time_series_input: - time_series = self.collect_time_series(messages) - outputs = [] - for ts_input, params in time_series: - sr = params.get('sampling_rate') if params is not None else None - time_series_inputs = self.time_series_processor(ts_input, sr) - time_series_inputs.update({'ts_token_id': self.ts_token_id}) - outputs.append(time_series_inputs) - else: - min_pixels, max_pixels = self.get_processor_args(mm_processor_kwargs) - - images = self.collect_images(messages) - outputs = [] - for image, params in images: - image = image.convert('RGB') - - result = 
self.processor.image_processor(images=image, - videos=None, - size={ - 'shortest_edge': min_pixels, - 'longest_edge': max_pixels - }, - return_tensors='pt') - merge_length = self.processor.image_processor.merge_size**2 - image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length - result.update(dict(image_size=image.size, image_tokens=image_tokens, - image_token_id=self.image_token_id)) - outputs.append(result) + outputs = [] + self.contains_video_input = False + self.contains_ts_input = False + + mm_items = self.collect_multimodal_items(messages) + for modality, data, params in mm_items: + result = {} + if modality == Modality.IMAGE: + result = self._preprocess_image(data, params, mm_processor_kwargs) + elif modality == Modality.VIDEO: + self.contains_video_input = True + result = self._preprocess_video(data, params, mm_processor_kwargs) + elif modality == Modality.TIME_SERIES: + self.contains_ts_input = True + result = self._preprocess_time_series(data, params, mm_processor_kwargs) + + result.update(modality=modality) + outputs.append(result) messages.append(dict(role='preprocess', content=outputs)) return messages @@ -161,11 +218,6 @@ def proc_messages(self, IMAGE_TOKEN = '' messages = [x for x in messages if x['role'] not in ['preprocess', 'forward']] - if self.has_time_series_input: - prompt_messages = messages - prompt = chat_template.messages2prompt(prompt_messages, sequence_start, tools=tools, **chat_template_kwargs) - return prompt, self.ts_token - if VisionModel.IMAGE_TOKEN_included(messages): # backward compatibility for message in messages: @@ -179,21 +231,61 @@ def proc_messages(self, prompt_messages.append(dict(role='user', content=prompt)) else: prompt_messages = messages + + # time series input requires enabling_thinking = False + if self.contains_ts_input: + chat_template_kwargs['enable_thinking'] = False + prompt = chat_template.messages2prompt(prompt_messages, sequence_start, tools=tools, **chat_template_kwargs) - return prompt, 
self.image_token - - def ts_to_pytorch_aux(self, messages, prompt, TS_TOKEN, tokenizer, sequence_start): - """Auxiliary function to pack the preprocessing results in a format - compatible with what is required by pytorch engine. - - Args: - messages(List[Dict]): the output of `preprocess` - prompt(str): the prompt after applying chat template - TS_TOKEN(str): a placeholder where time series tokens will be - inserted - tokenizer: the tokenizer model - sequence_start: starting flag of a sequence - """ + return prompt, None + + def to_pytorch_aux_video(self, messages, prompt, VIDEO_TOKEN, tokenizer, sequence_start): + """Pack the video input to the compatible format with pytorch + engine.""" + + # collect all preprocessing result from messages + preps = [x['content'] for x in messages if x['role'] == 'preprocess'] + assert len(preps) == 1 + preps = preps[0] + + # split prompt into segments and validate data + segs = prompt.split(self.vision_start_token + self.video_token + self.vision_end_token) + assert len(segs) == len(preps) + 1, (f'the number of {self.video_token} is not equal ' + f'to input videos, {len(segs) - 1} vs {len(preps)}') + + # calculate the video token offset for each video + input_ids = [] + for i, seg in enumerate(segs): + if i > 0 and i <= len(preps): + preps[i - 1].update(offset=len(input_ids)) + frame_seqlen = preps[i - 1]['frame_seqlen'] + assert self.video_token_id == preps[i - 1]['video_token_id'] + + video_grid_thw = preps[i - 1]['video_grid_thw'] + curr_timestamp = preps[i - 1]['curr_timestamp'] + + # update prompt with timestamp index tokens and video pad tokens + video_placeholder = '' + for frame_idx in range(video_grid_thw[0][0]): + curr_time = curr_timestamp[frame_idx] + video_placeholder += f'<{curr_time:.1f} seconds>' + video_placeholder += (self.vision_start_token + '<|placeholder|>' * frame_seqlen + + self.vision_end_token) + + video_placeholder = video_placeholder.replace('<|placeholder|>', self.video_token) + video_token_ids = 
tokenizer.encode(video_placeholder) + input_ids.extend(video_token_ids) + + preps[i - 1].update(video_tokens=len(video_token_ids)) + + token_ids = tokenizer.encode(seg, add_bos=((i == 0) and sequence_start)) + input_ids.extend(token_ids) + + return dict(prompt=prompt, input_ids=input_ids, multimodal=preps) + + def to_pytorch_aux_ts(self, messages, prompt, TS_TOKEN, tokenizer, sequence_start): + """Pack the time series input to the compatible format with pytorch + engine.""" # collect all preprocessing result from messages preps = [x['content'] for x in messages if x['role'] == 'preprocess'] assert len(preps) == 1 @@ -208,13 +300,12 @@ def ts_to_pytorch_aux(self, messages, prompt, TS_TOKEN, tokenizer, sequence_star for i, seg in enumerate(segs): if i > 0 and i <= len(preps): preps[i - 1].update(offset=len(input_ids)) - ts_tokens = preps[i - 1]['num_ts_tokens'] + ts_tokens = preps[i - 1]['ts_tokens'] - # NOTE: zhouxinyu, better to be valid type in the processor ts_tokens = ts_tokens[0] ts_array = np.array(preps[i - 1]['ts_values']) - preps[i - 1].update(num_ts_tokens=ts_tokens) + preps[i - 1].update(ts_tokens=ts_tokens) preps[i - 1].update(ts_values=torch.from_numpy(ts_array).to(dtype=torch.bfloat16)) preps[i - 1].update(ts_lens=torch.tensor(preps[i - 1]['ts_lens'])) preps[i - 1].update(ts_sr=torch.tensor(preps[i - 1]['ts_sr'])) @@ -235,22 +326,18 @@ def to_pytorch(self, chat_template_kwargs: Optional[Dict] = None, **kwargs): """Return to the information needed by pytorch engine.""" - if self.has_time_series_input: - # time series input requires enabling_thinking = False - chat_template_kwargs = {'enable_thinking': False} - prompt, TS_TOKEN = self.proc_messages(messages, - chat_template, - sequence_start, - tools=tools, - chat_template_kwargs=chat_template_kwargs) - return self.ts_to_pytorch_aux(messages, prompt, TS_TOKEN, tokenizer, sequence_start) + prompt, _ = self.proc_messages(messages, + chat_template, + sequence_start, + tools=tools, + 
chat_template_kwargs=chat_template_kwargs) + + if self.contains_video_input: + return self.to_pytorch_aux_video(messages, prompt, self.video_token, tokenizer, sequence_start) + elif self.contains_ts_input: + return self.to_pytorch_aux_ts(messages, prompt, self.ts_token, tokenizer, sequence_start) else: - prompt, IMAGE_TOKEN = self.proc_messages(messages, - chat_template, - sequence_start, - tools=tools, - chat_template_kwargs=chat_template_kwargs) - return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start) + return self.to_pytorch_aux(messages, prompt, self.image_token, tokenizer, sequence_start) def build_model(self): # TODO: implement for turbomind diff --git a/lmdeploy/vl/model/internvl.py b/lmdeploy/vl/model/internvl.py index 9036866818..7f49dd9799 100644 --- a/lmdeploy/vl/model/internvl.py +++ b/lmdeploy/vl/model/internvl.py @@ -192,9 +192,9 @@ def _forward(self, inputs, max_batch_size): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refers to `super.preprocess() for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') pixel_values = self.processor(image, params) image_tokens = (pixel_values.shape[0] * self.image_tokens_per_patch) diff --git a/lmdeploy/vl/model/internvl3_hf.py b/lmdeploy/vl/model/internvl3_hf.py index 85eb40bbdc..4601b01930 100644 --- a/lmdeploy/vl/model/internvl3_hf.py +++ b/lmdeploy/vl/model/internvl3_hf.py @@ -94,8 +94,8 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: 'add_special_tokens': False }, ) - images = self.collect_images(messages) - images = [image.convert('RGB') for image, _ in images] + images = self.collect_multimodal_items(messages) + images = [image.convert('RGB') for modality, image, _ in images] num_image = len(images) images = make_flat_list_of_images(images) image_inputs = 
self.processor.image_processor(images, **output_kwargs['images_kwargs']) diff --git a/lmdeploy/vl/model/llama4.py b/lmdeploy/vl/model/llama4.py index e0752d7b99..1661d9fc63 100644 --- a/lmdeploy/vl/model/llama4.py +++ b/lmdeploy/vl/model/llama4.py @@ -60,13 +60,13 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refers to `super.preprocess() for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] processor = self.processor patch_size = processor.patch_size downsample_ratio = processor.downsample_ratio images_kwargs = self.images_kwargs - for image, params in images: + for modality, image, params in images: image_inputs = processor.image_processor(images=[image], **images_kwargs) pixel_values = image_inputs['pixel_values'] image_height, image_width = image_inputs['pixel_values'][0].shape[-2:] diff --git a/lmdeploy/vl/model/llava.py b/lmdeploy/vl/model/llava.py index 91da486643..06ef05d6c8 100644 --- a/lmdeploy/vl/model/llava.py +++ b/lmdeploy/vl/model/llava.py @@ -297,9 +297,9 @@ def encode_images(self, images: torch.Tensor) -> torch.Tensor: def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refer to `super().preprocess() for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') pixel_values = process_images([image], self.image_processor, self.config) outputs.append( diff --git a/lmdeploy/vl/model/llava_hf.py b/lmdeploy/vl/model/llava_hf.py index c66f68be68..cf470d09ba 100644 --- a/lmdeploy/vl/model/llava_hf.py +++ b/lmdeploy/vl/model/llava_hf.py @@ -57,9 +57,9 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refers to `super.preprocess() for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for 
image, params in images: + for modality, image, params in images: image = image.convert('RGB') pixel_values = self.processor(image, return_tensors='pt', input_data_format='channels_last').pixel_values outputs.append( diff --git a/lmdeploy/vl/model/llava_next.py b/lmdeploy/vl/model/llava_next.py index b705f237b8..08ffbe44c4 100644 --- a/lmdeploy/vl/model/llava_next.py +++ b/lmdeploy/vl/model/llava_next.py @@ -66,9 +66,9 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refers to the spec of `super.preprocess()""" from transformers.models.llava_next.modeling_llava_next import image_size_to_num_patches - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') result = self.processor(image, return_tensors='pt', input_data_format='channels_last') # ! infer image_num_patches from image_sizes diff --git a/lmdeploy/vl/model/mllama.py b/lmdeploy/vl/model/mllama.py index ab0949fe03..e9d20684de 100644 --- a/lmdeploy/vl/model/mllama.py +++ b/lmdeploy/vl/model/mllama.py @@ -26,9 +26,9 @@ def build_preprocessor(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refer to the spec of `super().preprocess`""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') results = self.processor.image_processor(images=image, return_tensors='pt') results.update(image_size=image.size, image_tokens=1, image_token_id=self.image_token_id) diff --git a/lmdeploy/vl/model/phi3_vision.py b/lmdeploy/vl/model/phi3_vision.py index 683220c29c..8167ae883d 100644 --- a/lmdeploy/vl/model/phi3_vision.py +++ b/lmdeploy/vl/model/phi3_vision.py @@ -31,9 +31,9 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refers to 
`super.preprocess() for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, params in images: + for modality, image, params in images: result = self.processor.image_processor([image], return_tensors='pt') image_tokens = result['num_img_tokens'] result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id)) diff --git a/lmdeploy/vl/model/qwen.py b/lmdeploy/vl/model/qwen.py index db54fb5f57..d959c0ffbf 100644 --- a/lmdeploy/vl/model/qwen.py +++ b/lmdeploy/vl/model/qwen.py @@ -71,9 +71,9 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refers to `super.preprocess() for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') pixel_values = self.image_transform(image) outputs.append( diff --git a/lmdeploy/vl/model/qwen2.py b/lmdeploy/vl/model/qwen2.py index 52ade4980c..a802242d6c 100644 --- a/lmdeploy/vl/model/qwen2.py +++ b/lmdeploy/vl/model/qwen2.py @@ -37,10 +37,10 @@ def preprocess(self, messages: list[dict]) -> list[dict]: """Refer to `super().preprocess()` for spec.""" from qwen_vl_utils import process_vision_info - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) optional_keys = {'resized_height', 'resized_width', 'min_pixels', 'max_pixels'} outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') item = dict(type='image', image=image) diff --git a/lmdeploy/vl/model/qwen3.py b/lmdeploy/vl/model/qwen3.py index 265bf9f729..404f84d781 100644 --- a/lmdeploy/vl/model/qwen3.py +++ b/lmdeploy/vl/model/qwen3.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from typing import Any, Dict, List, Optional +from typing import Any, Dict, List import torch from transformers import AutoProcessor from lmdeploy.utils import get_logger +from lmdeploy.vl.constants import Modality from lmdeploy.vl.model.base import VISION_MODELS, VisionModel logger = get_logger('lmdeploy') @@ -27,12 +28,20 @@ class Qwen3VLModel(VisionModel): def build_preprocessor(self): check_transformers() self.processor = AutoProcessor.from_pretrained(self.model_path) - tokenizer = self.processor.tokenizer + + # image tokens self.image_token = self.processor.image_token - self.image_token_id = tokenizer.encode(self.image_token)[-1] - self.mm_processor_kwargs = None + self.image_token_id = self.processor.image_token_id + + # video tokens + self.video_token = self.processor.video_token + self.video_token_id = self.processor.video_token_id + + # vision start and end tokens + self.vision_start_token = self.processor.vision_start_token + self.vision_end_token = self.processor.vision_end_token - def get_processor_args(self, mm_processor_kwargs: Optional[Dict[str, Any]] = None): + def get_processor_args(self, mm_processor_kwargs: Dict[str, Any] | None = None): min_pixels = self.processor.image_processor.size['shortest_edge'] max_pixels = self.processor.image_processor.size['longest_edge'] @@ -68,26 +77,73 @@ def get_processor_args(self, mm_processor_kwargs: Optional[Dict[str, Any]] = Non return min_pixels, max_pixels - def preprocess(self, messages: List[Dict], mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict]: - """Refer to `super().preprocess()` for spec.""" + def _preprocess_image(self, + data: List[Any], + params: Dict[str, Any], + mm_processor_kwargs: Dict[str, Any] | None = None) -> List[Dict]: + image = data.convert('RGB') min_pixels, max_pixels = self.get_processor_args(mm_processor_kwargs) - images = self.collect_images(messages) + result = self.processor.image_processor(images=image, + size={ + 'shortest_edge': min_pixels, + 
'longest_edge': max_pixels + }, + return_tensors='pt') + merge_length = self.processor.image_processor.merge_size**2 + image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length + result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id)) + return result + + def _preprocess_video(self, + data: List[Any], + params: Dict[str, Any], + mm_processor_kwargs: Dict[str, Any] | None = None) -> List[Dict]: + + # TODO: zhouxinyu, apply transformers smart_resize using per-request kwargs + metadata = params['video_metadata'] + video_kwargs = dict(return_metadata=True, + do_resize=True, + do_sample_frames=False, + video_metadata=metadata, + return_tensors='pt') + result = self.processor.video_processor(videos=data, **video_kwargs) + video_grid_thw = result['video_grid_thw'] + + merge_length = self.processor.video_processor.merge_size**2 + if metadata.get('fps') is None: + logger.warning_once('Qwen3VL: fps not found, defaulting to 24.') + metadata['fps'] = metadata['fps'] or 24 + + # if timestamps are not provided, calculate them + curr_timestamp = self.processor._calculate_timestamps( + metadata['frames_indices'], + metadata['fps'], + self.processor.video_processor.merge_size, + ) + + frame_seqlen = video_grid_thw[0][1:].prod() // merge_length + result.update(curr_timestamp=curr_timestamp, frame_seqlen=frame_seqlen, video_token_id=self.video_token_id) + return result + + def preprocess(self, messages: List[Dict], mm_processor_kwargs: Dict[str, Any] | None = None) -> List[Dict]: + """Refer to `super().preprocess()` for spec.""" outputs = [] - for image, params in images: - image = image.convert('RGB') - - result = self.processor.image_processor(images=image, - size={ - 'shortest_edge': min_pixels, - 'longest_edge': max_pixels - }, - return_tensors='pt') - merge_length = self.processor.image_processor.merge_size**2 - image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length - 
result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id)) + self.contains_video_input = False + + mm_items = self.collect_multimodal_items(messages) + for modality, data, params in mm_items: + result = {} + if modality == Modality.IMAGE: + result = self._preprocess_image(data, params, mm_processor_kwargs) + elif modality == Modality.VIDEO: + self.contains_video_input = True + result = self._preprocess_video(data, params, mm_processor_kwargs) + + result.update(modality=modality) outputs.append(result) + messages.append(dict(role='preprocess', content=outputs)) return messages @@ -111,18 +167,66 @@ def proc_messages(self, messages, chat_template, sequence_start, chat_template_k else: prompt_messages = messages prompt = chat_template.messages2prompt(prompt_messages, sequence_start, **chat_template_kwargs) - return prompt, self.image_token + return prompt, None + + def to_pytorch_aux_video(self, messages, prompt, VIDEO_TOKEN, tokenizer, sequence_start): + """Pack the video input to the compatible format with pytorch + engine.""" + + # collect all preprocessing result from messages + preps = [x['content'] for x in messages if x['role'] == 'preprocess'] + assert len(preps) == 1 + preps = preps[0] + + # split prompt into segments and validate data + segs = prompt.split(self.vision_start_token + self.video_token + self.vision_end_token) + assert len(segs) == len(preps) + 1, (f'the number of {self.video_token} is not equal ' + f'to input videos, {len(segs) - 1} vs {len(preps)}') + + # calculate the video token offset for each video + input_ids = [] + for i, seg in enumerate(segs): + if i > 0 and i <= len(preps): + preps[i - 1].update(offset=len(input_ids)) + frame_seqlen = preps[i - 1]['frame_seqlen'] + assert self.video_token_id == preps[i - 1]['video_token_id'] + + video_grid_thw = preps[i - 1]['video_grid_thw'] + curr_timestamp = preps[i - 1]['curr_timestamp'] + + # update prompt with timestamp index tokens and video pad 
tokens + video_placeholder = '' + for frame_idx in range(video_grid_thw[0][0]): + curr_time = curr_timestamp[frame_idx] + video_placeholder += f'<{curr_time:.1f} seconds>' + video_placeholder += (self.vision_start_token + '<|placeholder|>' * frame_seqlen + + self.vision_end_token) + + video_placeholder = video_placeholder.replace('<|placeholder|>', self.video_token) + video_token_ids = tokenizer.encode(video_placeholder) + input_ids.extend(video_token_ids) + + preps[i - 1].update(video_tokens=len(video_token_ids)) + + token_ids = tokenizer.encode(seg, add_bos=((i == 0) and sequence_start)) + input_ids.extend(token_ids) + + return dict(prompt=prompt, input_ids=input_ids, multimodal=preps) def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, - chat_template_kwargs: Optional[Dict] = None, + chat_template_kwargs: Dict | None = None, **kwargs): """Return to the information needed by pytorch engine.""" - prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start, chat_template_kwargs) - return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start) + prompt, _ = self.proc_messages(messages, chat_template, sequence_start, chat_template_kwargs) + + if self.contains_video_input: + return self.to_pytorch_aux_video(messages, prompt, self.video_token, tokenizer, sequence_start) + else: + return self.to_pytorch_aux(messages, prompt, self.image_token, tokenizer, sequence_start) def build_model(self): # TODO: implement for turbomind @@ -138,7 +242,7 @@ def to_turbomind(self, chat_template, tokenizer, sequence_start, - chat_template_kwargs: Optional[Dict] = None, + chat_template_kwargs: Dict | None = None, **kwargs): # TODO: implement for turbomind pass diff --git a/lmdeploy/vl/model/qwen3_5.py b/lmdeploy/vl/model/qwen3_5.py index c75b87bed0..f030de5e5e 100644 --- a/lmdeploy/vl/model/qwen3_5.py +++ b/lmdeploy/vl/model/qwen3_5.py @@ -25,8 +25,17 @@ class Qwen3_5Model(Qwen3VLModel): def build_preprocessor(self): 
check_transformers() + self.processor = AutoProcessor.from_pretrained(self.model_path) - tokenizer = self.processor.tokenizer + + # image tokens self.image_token = self.processor.image_token - self.image_token_id = tokenizer.encode(self.image_token)[-1] - self.mm_processor_kwargs = None + self.image_token_id = self.processor.image_token_id + + # video tokens + self.video_token = self.processor.video_token + self.video_token_id = self.processor.video_token_id + + # vision start and end tokens + self.vision_start_token = self.processor.vision_start_token + self.vision_end_token = self.processor.vision_end_token diff --git a/lmdeploy/vl/model/xcomposer2.py b/lmdeploy/vl/model/xcomposer2.py index cff3b808f5..5cc4eac5f4 100644 --- a/lmdeploy/vl/model/xcomposer2.py +++ b/lmdeploy/vl/model/xcomposer2.py @@ -206,9 +206,9 @@ def _preprocess_4khd_7b(self, image: Image, params: Dict) -> Dict: def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refer to `super().preprocess() for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') pixel_values, n_token = self.preprocess_func(image, params) outputs.append( diff --git a/lmdeploy/vl/model/yi.py b/lmdeploy/vl/model/yi.py index 02dd1c83e5..317f8afac4 100644 --- a/lmdeploy/vl/model/yi.py +++ b/lmdeploy/vl/model/yi.py @@ -119,9 +119,9 @@ def build_model(self): def preprocess(self, messages: List[Dict]) -> List[Dict]: """Refer to `super().preprocess() for spec.""" - images = self.collect_images(messages) + images = self.collect_multimodal_items(messages) outputs = [] - for image, params in images: + for modality, image, params in images: image = image.convert('RGB') pixel_values = process_images([image], self.image_processor, self.config) outputs.append( diff --git a/lmdeploy/vl/time_series_utils.py b/lmdeploy/vl/time_series_utils.py deleted file mode 100644 index 
5651f65697..0000000000 --- a/lmdeploy/vl/time_series_utils.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import csv -import os -from io import BytesIO, StringIO - -import numpy as np -import pybase64 -import requests - -from lmdeploy.utils import get_logger - -logger = get_logger('lmdeploy') - -FETCH_TIMEOUT = int(os.environ.get('LMDEPLOY_FETCH_TIMEOUT', 10)) -HEADERS = { - 'User-Agent': - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' - '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' -} - - -def encode_time_series_base64(data: str | np.ndarray) -> str: - """Encode time series data to base64. - - Supports: HTTP URL, local path, or numpy array. - """ - buffered = BytesIO() - - try: - if isinstance(data, str): - if data.startswith('http'): - response = requests.get(data, headers=HEADERS, timeout=FETCH_TIMEOUT) - response.raise_for_status() - ts_array = _load_bytes(response.content, data) - elif data.startswith('file://'): - path = data.removeprefix('file://') - ts_array = _load_path(path) - elif os.path.exists(data): - ts_array = _load_path(data) - else: - raise ValueError(f'Path does not exist: {data}') - elif isinstance(data, np.ndarray): - ts_array = data - else: - raise TypeError(f'Expected str or np.ndarray, got {type(data)}') - - np.save(buffered, ts_array) - - except Exception as error: - data_info = str(data)[:100] + ' ...' 
if isinstance(data, str) and len(data) > 100 else str(data) - logger.error(f'{error}, data={data_info}') - np.save(buffered, np.zeros((6000, 3), dtype=np.float32)) # dummy - - return pybase64.b64encode(buffered.getvalue()).decode('utf-8') - - -def load_time_series_from_base64(ts_base64: bytes | str) -> np.ndarray: - """Load time series from base64 format.""" - if isinstance(ts_base64, str): - ts_base64 = ts_base64.encode('utf-8') - return np.load(BytesIO(pybase64.b64decode(ts_base64)), allow_pickle=False) - - -def load_time_series(data_source: str | np.ndarray) -> np.ndarray: - """Load time series from URL, local path, base64 data URL, or numpy - array.""" - try: - if isinstance(data_source, np.ndarray): - return data_source - - if data_source.startswith('http'): - response = requests.get(data_source, headers=HEADERS, timeout=FETCH_TIMEOUT) - response.raise_for_status() - return _load_bytes(response.content, data_source) - - if data_source.startswith('data:time_series'): - return load_time_series_from_base64(data_source.split(',')[1]) - - if data_source.startswith('file://'): - path = data_source.removeprefix('file://') - return _load_path(path) - - if os.path.exists(data_source): - return _load_path(data_source) - - raise ValueError(f'Invalid data source: {data_source}') - except Exception as error: - data_info = str(data_source)[:100] + ' ...' if isinstance(data_source, - str) and len(data_source) > 100 else str(data_source) - logger.error(f'{error}, data_source={data_info}') - return np.zeros((6000, 3), dtype=np.float32) # dummy - - -def _load_bytes(content: bytes, hint: str = '') -> np.ndarray: - """Auto-detect format from bytes. - - Try: npy -> csv -> audio. 
- """ - hint = hint.lower() - - # Format hints from URL/path - if '.npy' in hint: - return np.load(BytesIO(content)) - if '.csv' in hint: - return _load_csv(content) - if any(ext in hint for ext in ['.wav', '.mp3', '.flac']): - return _load_audio(content) - - # Fallback: try all formats - loaders = [lambda: np.load(BytesIO(content)), lambda: _load_csv(content), lambda: _load_audio(content)] - for loader in loaders: - try: - return loader() - except Exception: - continue - raise ValueError(f'Cannot detect format from bytes: {hint[:50]}') - - -def _load_path(path: str) -> np.ndarray: - """Load from local file path based on extension.""" - ext = os.path.splitext(path)[-1].lower() - - if ext == '.npy': - return np.load(path) - if ext == '.csv': - return _load_csv(path) - if ext in ['.wav', '.mp3', '.flac']: - return _load_audio(path) - - raise ValueError(f'Unsupported format: {ext}') - - -def _load_csv(source: bytes | str) -> np.ndarray: - """Load CSV from bytes or file path.""" - # Read content as text - if isinstance(source, bytes): - text = source.decode('utf-8') - else: - with open(source, 'r', newline='') as f: - text = f.read() - - # Parse CSV - f = StringIO(text) - reader = csv.reader(f) - rows = list(reader) - - return np.array(rows, dtype=np.float32) - - -def _load_audio(source: bytes | str) -> np.ndarray: - """Load audio from bytes or file path.""" - try: - import soundfile as sf - except ImportError: - raise ImportError('Please install soundfile to process audio files.') - - if isinstance(source, bytes): - source = BytesIO(source) - ts, sr = sf.read(source) - return ts diff --git a/lmdeploy/vl/utils.py b/lmdeploy/vl/utils.py index a089d06ad7..2ebb998b1d 100644 --- a/lmdeploy/vl/utils.py +++ b/lmdeploy/vl/utils.py @@ -1,83 +1,54 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import os -from io import BytesIO -from typing import Union +from typing import Any, Dict, Tuple -import pybase64 -import requests -from PIL import Image, ImageFile +import numpy.typing as npt +from PIL import Image -from lmdeploy.utils import get_logger +from .media.connection import load_from_url +from .media.image import ImageMediaIO +from .media.time_series import TimeSeriesMediaIO +from .media.video import VideoMediaIO -logger = get_logger('lmdeploy') +def load_image(image_url: str, **kwargs) -> Image.Image: + """Fetch and decode an image from a URL, path, or base64 string.""" + image_io = ImageMediaIO(**kwargs) + return load_from_url(image_url, image_io) -def encode_image_base64(image: Union[str, Image.Image]) -> str: - """Encode raw data to base64 format.""" - buffered = BytesIO() - FETCH_TIMEOUT = int(os.environ.get('LMDEPLOY_FETCH_TIMEOUT', 10)) - headers = { - 'User-Agent': - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' - '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3' - } - try: - if isinstance(image, str): - url_or_path = image - if url_or_path.startswith('http'): - response = requests.get(url_or_path, headers=headers, timeout=FETCH_TIMEOUT) - response.raise_for_status() - buffered.write(response.content) - elif os.path.exists(url_or_path): - with open(url_or_path, 'rb') as image_file: - buffered.write(image_file.read()) - elif isinstance(image, Image.Image): - image.save(buffered, format='PNG') - except Exception as error: - if isinstance(image, str) and len(image) > 100: - image = image[:100] + ' ...' 
-        logger.error(f'{error}, image={image}')
-        # use dummy image
-        image = Image.new('RGB', (32, 32))
-        image.save(buffered, format='PNG')
-    res = pybase64.b64encode(buffered.getvalue()).decode('utf-8')
-    return res
+def load_video(video_url: str, **kwargs) -> Tuple[npt.NDArray, Dict[str, Any]]:
+    """Fetch and decode video frames from a URL, path, or base64 string."""
+    image_io = ImageMediaIO()
+    video_io = VideoMediaIO(image_io=image_io, **kwargs)
+    return load_from_url(video_url, video_io)
-def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
-    """Load image from base64 format."""
-    return Image.open(BytesIO(pybase64.b64decode(image)))
+def load_time_series(ts_url: str, **kwargs) -> npt.NDArray:
+    """Fetch and decode time-series from a URL, path, or base64 string."""
+    ts_io = TimeSeriesMediaIO(**kwargs)
+    return load_from_url(ts_url, ts_io)
-def load_image(image_url: Union[str, Image.Image]) -> Image.Image:
-    """Load image from url, local path or openai GPT4V."""
-    FETCH_TIMEOUT = int(os.environ.get('LMDEPLOY_FETCH_TIMEOUT', 10))
-    headers = {
-        'User-Agent':
-        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
-        '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
-    }
-    try:
-        ImageFile.LOAD_TRUNCATED_IMAGES = True
-        if isinstance(image_url, Image.Image):
-            img = image_url
-        elif image_url.startswith('http'):
-            response = requests.get(image_url, headers=headers, timeout=FETCH_TIMEOUT)
-            response.raise_for_status()
-            img = Image.open(BytesIO(response.content))
-        elif image_url.startswith('data:image'):
-            img = load_image_from_base64(image_url.split(',')[1])
-        else:
-            # Load image from local path
-            img = Image.open(image_url)
-        # check image valid
-        img = img.convert('RGB')
-    except Exception as error:
-        if isinstance(image_url, str) and len(image_url) > 100:
-            image_url = image_url[:100] + ' ...'
- logger.error(f'{error}, image_url={image_url}') - # use dummy image - img = Image.new('RGB', (32, 32)) +def encode_image_base64(image: str | Image.Image, format: str = 'PNG', **kwargs) -> str: + """Encode image (path or PIL image) to a base64 string.""" + if isinstance(image, str): + image = load_image(image, **kwargs) + image_io = ImageMediaIO(**kwargs) + return image_io.encode_base64(image, image_format=format) - return img + +def encode_video_base64(video: str | npt.NDArray, format: str = 'JPEG', **kwargs) -> str: + """Encode video (path or frames) to a base64 string.""" + if isinstance(video, str): + video, _ = load_video(video, **kwargs) + image_io = ImageMediaIO() + video_io = VideoMediaIO(image_io=image_io, **kwargs) + return video_io.encode_base64(video, video_format=format) + + +def encode_time_series_base64(data: str | npt.NDArray, **kwargs) -> str: + """Encode time-series (path or numpy array) to a base64 string.""" + if isinstance(data, str): + data = load_time_series(data, **kwargs) + ts_io = TimeSeriesMediaIO(**kwargs) + return ts_io.encode_base64(data) diff --git a/requirements/runtime_cuda.txt b/requirements/runtime_cuda.txt index 3f5c687a03..f8929b6601 100644 --- a/requirements/runtime_cuda.txt +++ b/requirements/runtime_cuda.txt @@ -9,6 +9,7 @@ mmengine-lite numpy openai openai_harmony +opencv-python-headless partial_json_parser peft<=0.14.0 pillow diff --git a/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py b/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py index 46bf84c887..a80acbdef8 100644 --- a/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py +++ b/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py @@ -2,8 +2,8 @@ import pytest +from lmdeploy.vl import load_image from lmdeploy.vl.model.qwen3 import Qwen3VLModel -from lmdeploy.vl.utils import load_image QWEN3VL_MODELS = [ 'Qwen/Qwen3-VL-4B-Instruct', @@ -35,7 +35,7 @@ def sample_messages(): }, { 'type': 'image', - 'image': pil_image + 'data': pil_image }, ] }] diff --git 
a/tests/test_lmdeploy/test_vl/test_vl_encode.py b/tests/test_lmdeploy/test_vl/test_vl_encode.py index eaa29e37a9..607b4822a5 100644 --- a/tests/test_lmdeploy/test_vl/test_vl_encode.py +++ b/tests/test_lmdeploy/test_vl/test_vl_encode.py @@ -1,39 +1,123 @@ -# yapf: disable -from lmdeploy.vl.utils import encode_image_base64, load_image, load_image_from_base64 +import math -# yapf: enable +import numpy as np +from lmdeploy.vl import (encode_image_base64, encode_time_series_base64, encode_video_base64, load_image, + load_time_series, load_video) -def test_encode_image_base64(): - url = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' # noqa E501 - im1 = load_image(url) - base64 = encode_image_base64(url) - im2 = load_image_from_base64(base64) - assert im1 == im2.convert('RGB') +def test_image_encode_decode(): + url = 'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg' -def test_load_truncated_image(): + img1 = load_image(url) + # use PNG for lossless pixel-perfect comparison + b64 = encode_image_base64(url, format='PNG') + img2 = load_image(f'data:image/png;base64,{b64}') + + assert img1.size == img2.size + assert img1.mode == img2.mode + assert img1.tobytes() == img2.tobytes() + + +def test_video_encode_decode(): + # url = 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4' + url = 'https://raw.githubusercontent.com/CUHKSZzxy/Online-Data/main/clip_3_removed.mp4' + + # num_frames=4 to keep test fast + vid1, meta1 = load_video(url, num_frames=4) + b64 = encode_video_base64(url, num_frames=4, format='JPEG') + vid2, meta2 = load_video(f'data:video/jpeg;base64,{b64}') + + gt_meta = { + 'total_num_frames': 498, + 'fps': 29.97002997002997, + 'duration': 16.616600000000002, + 'video_backend': 'opencv', + 'frames_indices': [0, 165, 331, 497] + } + + assert vid1.shape == vid2.shape + assert np.mean(np.abs(vid1.astype(float) - vid2.astype(float))) < 2.0 # JPEG is lossy + assert 
meta1['total_num_frames'] == gt_meta['total_num_frames'] + assert meta1['frames_indices'] == gt_meta['frames_indices'] + + +def test_time_series_encode_decode(): + # url = "https://huggingface.co/internlm/Intern-S1-Pro/raw/main/0092638_seism.npy" + url = 'https://raw.githubusercontent.com/CUHKSZzxy/Online-Data/main/0092638_seism.npy' + + ts1 = load_time_series(url) + b64 = encode_time_series_base64(url) + ts2 = load_time_series(f'data:time_series/npy;base64,{b64}') + + assert ts1.shape == ts2.shape + assert np.allclose(ts1, ts2) + + +def test_image_modes(): + import numpy as np + from PIL import Image + + grayscale_img = Image.fromarray(np.zeros((100, 100), dtype=np.uint8)).convert('L') + b64 = encode_image_base64(grayscale_img) # should convert L -> RGB internally + + img_out = load_image(f'data:image/png;base64,{b64}') + assert img_out.mode == 'RGB' + + +def test_truncated_image(): url = 'https://github.com/irexyc/lmdeploy/releases/download/v0.0.1/tr.jpeg' im = load_image(url) assert im.width == 1638 assert im.height == 2048 -def test_load_invalid_url(): - url = ('https://raw.githubusercontent.com/open-mmlab/' - 'mmdeploy/main/tests/data/tiger.jpeg') - # invalid - im1 = load_image(url[:-1]) - assert im1.width == 32 - assert im1.height == 32 - # valid - im2 = load_image(url) - assert im2.height == 182 - assert im2.width == 278 - - -def test_load_invalid_base64(): - base64 = 'data:image/jpeg;base64,xxx' - im = load_image(base64) - assert im.width == 32 - assert im.height == 32 +def test_single_frame_video(): + url = 'https://raw.githubusercontent.com/CUHKSZzxy/Online-Data/main/clip_3_removed.mp4' + vid, meta = load_video(url, num_frames=1) + assert vid.shape[0] == 1 + + b64 = encode_video_base64(vid) + assert isinstance(b64, str) + assert ',' not in b64 # should only be one JPEG block, no commas + + +def test_video_sampling_params(): + url = 'https://raw.githubusercontent.com/CUHKSZzxy/Online-Data/main/clip_3_removed.mp4' + + # 1. 
test num_frames constraint + num_frames = 5 + vid, meta = load_video(url, num_frames=num_frames) + assert vid.shape[0] == num_frames + assert len(meta['frames_indices']) == num_frames + + # 2. test fps constraint (original fps is ~29.97, duration ~16.6s) + fps = 1 + vid, meta = load_video(url, fps=fps) + expected_frames = max(1, int(math.floor(meta['duration'] * fps))) + assert vid.shape[0] == expected_frames + + # 3. test both constraints (should take the minimum) + # 10 fps x 16.6s ~= 166 frames > 10 frames, so will be limited by num_frames + num_frames = 10 + fps = 10 + vid, meta = load_video(url, num_frames=num_frames, fps=fps) + assert vid.shape[0] == num_frames + + # 1 fps x 16.6s ~= 16 frames < 100 frames, so will be limited by fps + num_frames = 100 + fps = 1 + vid, meta = load_video(url, num_frames=num_frames, fps=fps) + expected_frames = max(1, int(math.floor(meta['duration'] * fps))) + assert vid.shape[0] == expected_frames + + +def test_invalid_inputs(): + # non-existent local path + import pytest + with pytest.raises(Exception): + load_image('/non_existent/path/image.jpg') + with pytest.raises(Exception): + load_video('/non_existent/path/video.mp4') + with pytest.raises(Exception): + load_time_series('/non_existent/path/data.npy')