autotest/tools/pipeline/mllm_case.py (3 changes: 1 addition & 2 deletions)

@@ -5,9 +5,8 @@
 from PIL import Image
 
 from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.vl import encode_image_base64, load_image
 from lmdeploy.vl.constants import IMAGE_TOKEN
-from lmdeploy.vl.utils import encode_image_base64
 
 gen_config = GenerationConfig(max_new_tokens=500, min_new_tokens=10)
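The net effect of this test-file change is that `encode_image_base64` is now imported from the public surface `lmdeploy.vl`, together with `load_image`, instead of from the internal `lmdeploy.vl.utils` module. A minimal sketch of the consolidated imports in use (the URL and the data-URL shape are illustrative, not taken from this test):

```python
from lmdeploy.vl import encode_image_base64, load_image

# Load an image from a URL or path, then base64-encode it so it can be
# embedded in an OpenAI-style image_url content item.
image = load_image('https://example.com/tiger.jpeg')  # placeholder URL
image_url = f'data:image/jpeg;base64,{encode_image_base64(image)}'
```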
docs/en/multi_modal/internvl.md (2 changes: 1 addition & 1 deletion)

@@ -116,7 +116,7 @@ import numpy as np
 from lmdeploy import pipeline, GenerationConfig
 from decord import VideoReader, cpu
 from lmdeploy.vl.constants import IMAGE_TOKEN
-from lmdeploy.vl.utils import encode_image_base64
+from lmdeploy.vl import encode_image_base64
 from PIL import Image
 pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO')

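For orientation, the changed line sits inside the InternVL video example of these docs. A condensed sketch of that pattern is shown below; the frame loading here is simplified (the real example samples frames from a video with decord), so treat file names and the prompt layout as placeholders:

```python
from lmdeploy import pipeline, GenerationConfig
from lmdeploy.vl import encode_image_base64
from lmdeploy.vl.constants import IMAGE_TOKEN
from PIL import Image

pipe = pipeline('OpenGVLab/InternVL2-8B')

# Assume `frames` holds PIL images sampled from a video (placeholder files here).
frames = [Image.open(f'frame_{i}.jpg') for i in range(2)]
question = f'{IMAGE_TOKEN}\n' * len(frames) + 'Describe this video in detail.'
content = [dict(type='text', text=question)]
for frame in frames:
    b64 = encode_image_base64(frame)
    content.append(dict(type='image_url', image_url=dict(url=f'data:image/jpeg;base64,{b64}')))
out = pipe([dict(role='user', content=content)], gen_config=GenerationConfig(top_k=1))
print(out.text)
```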
docs/en/multi_modal/minicpmv.md (2 changes: 1 addition & 1 deletion)

@@ -97,7 +97,7 @@ print(out.text)

 ```python
 from lmdeploy import pipeline, GenerationConfig
-from lmdeploy.vl.utils import encode_image_base64
+from lmdeploy.vl import encode_image_base64
 import torch
 from PIL import Image
 from transformers import AutoModel, AutoTokenizer
docs/en/multi_modal/qwen2_5_vl.md (2 changes: 1 addition & 1 deletion)

@@ -99,7 +99,7 @@ import numpy as np
 from lmdeploy import pipeline, GenerationConfig
 from decord import VideoReader, cpu
 from lmdeploy.vl.constants import IMAGE_TOKEN
-from lmdeploy.vl.utils import encode_image_base64
+from lmdeploy.vl import encode_image_base64
 from PIL import Image
 pipe = pipeline('Qwen/Qwen2.5-VL-7B-Instruct', log_level='INFO')

docs/zh_cn/multi_modal/internvl.md (2 changes: 1 addition & 1 deletion)

@@ -116,7 +116,7 @@ import numpy as np
 from lmdeploy import pipeline, GenerationConfig
 from decord import VideoReader, cpu
 from lmdeploy.vl.constants import IMAGE_TOKEN
-from lmdeploy.vl.utils import encode_image_base64
+from lmdeploy.vl import encode_image_base64
 from PIL import Image
 pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO')

docs/zh_cn/multi_modal/minicpmv.md (2 changes: 1 addition & 1 deletion)

@@ -97,7 +97,7 @@ print(out.text)

 ```python
 from lmdeploy import pipeline, GenerationConfig
-from lmdeploy.vl.utils import encode_image_base64
+from lmdeploy.vl import encode_image_base64
 import torch
 from PIL import Image
 from transformers import AutoModel, AutoTokenizer
docs/zh_cn/multi_modal/qwen2_5_vl.md (2 changes: 1 addition & 1 deletion)

@@ -99,7 +99,7 @@ import numpy as np
 from lmdeploy import pipeline, GenerationConfig
 from decord import VideoReader, cpu
 from lmdeploy.vl.constants import IMAGE_TOKEN
-from lmdeploy.vl.utils import encode_image_base64
+from lmdeploy.vl import encode_image_base64
 from PIL import Image
 pipe = pipeline('Qwen/Qwen2.5-VL-7B-Instruct', log_level='INFO')

lmdeploy/pytorch/model_inputs.py (6 changes: 3 additions & 3 deletions)

@@ -11,7 +11,7 @@
 import lmdeploy.pytorch.distributed as dist
 from lmdeploy.pytorch.backends import get_backend
 from lmdeploy.pytorch.config import CacheConfig, DLLMConfig, ModelConfig, QuantizationConfig
-from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.utils import CtxMgrBase, singleton
 
 if TYPE_CHECKING:
@@ -66,7 +66,7 @@ class VisionModelInputs:
     input_embeddings: List[List[torch.Tensor]] = None
     input_embedding_ranges: List[torch.LongTensor] = None
     input_embedding_indexing: torch.BoolTensor = None
-    input_multimodals: List[MultiModalTensor] = None
+    input_multimodals: List[MultiModalData] = None
 
     def to_device(self, device: str, non_blocking: bool = False):
         """To device."""
@@ -255,7 +255,7 @@ class StepContext:
     local_adapter_ids: torch.LongTensor = None
     input_embeddings: torch.Tensor = None
     input_embedding_indexing: torch.Tensor = None
-    input_multimodals: List[MultiModalTensor] = None
+    input_multimodals: List[MultiModalData] = None
     vision_inputs: VisionModelInputs = None
     attn_metadata: Any = None
     kv_quant_policy: Literal[0, 4, 8] = 0
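The MultiModalTensor to MultiModalData rename recurs in every model file below. As a reading aid, here is a rough reconstruction of the container's shape inferred purely from the call sites in this diff; the real definition in `lmdeploy.pytorch.multimodal.data_type` may differ:

```python
from dataclasses import dataclass, field
from typing import Any, Dict, Optional

import torch


# Hypothetical reconstruction, inferred from call sites; not the actual class.
@dataclass
class MultiModalData:
    data: torch.Tensor                # pixel values, video frames, or time-series values
    start: int                        # index of the first placeholder token in input_ids
    end: int                          # one past the last placeholder token (start + num_pad)
    modality: Optional[Any] = None    # e.g. Modality.IMAGE / VIDEO / TIME_SERIES
    meta: Dict[str, Any] = field(default_factory=dict)  # token ids, grid_thw, ts_lens, ...
```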
lmdeploy/pytorch/models/chatglm2.py (10 changes: 5 additions & 5 deletions)

@@ -9,7 +9,7 @@

 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding,
                                  build_rotary_params)
 from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_down_linear, build_gateup_linear, build_o_proj,
@@ -866,10 +866,10 @@ def preprocess_input(self,
             if isinstance(num_pad, torch.Tensor):
                 num_pad = num_pad.item()
 
-            mm_data = MultiModalTensor(data=pixel_values,
-                                       start=offset,
-                                       end=offset + num_pad,
-                                       meta=dict(image_token_id=image_token_id))
+            mm_data = MultiModalData(data=pixel_values,
+                                     start=offset,
+                                     end=offset + num_pad,
+                                     meta=dict(image_token_id=image_token_id))
             input_imgs.append(mm_data)
 
         result = PreprocessInputResult(
lmdeploy/pytorch/models/cogvlm.py (10 changes: 5 additions & 5 deletions)

@@ -12,7 +12,7 @@
 from lmdeploy.pytorch.distributed import get_tp_world_rank
 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding
 from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_qkv_proj,
                                         build_rowwise_linear)
@@ -901,10 +901,10 @@ def preprocess_input(self, input_ids: List[int], input_multimodals=None, **kwarg
             if isinstance(num_pad, torch.Tensor):
                 num_pad = num_pad.item()
 
-            mm_data = MultiModalTensor(data=pixel_values,
-                                       start=offset,
-                                       end=offset + num_pad,
-                                       meta=dict(image_token_id=image_token_id))
+            mm_data = MultiModalData(data=pixel_values,
+                                     start=offset,
+                                     end=offset + num_pad,
+                                     meta=dict(image_token_id=image_token_id))
             input_imgs.append(mm_data)
 
         result = PreprocessInputResult(
lmdeploy/pytorch/models/deepseek_vl2.py (16 changes: 8 additions & 8 deletions)

@@ -11,7 +11,7 @@

 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .deepseek_v2 import DeepseekV2ForCausalLM
@@ -440,13 +440,13 @@ def preprocess_input(self,
             if isinstance(num_pad, torch.Tensor):
                 num_pad = num_pad.item()
 
-            mm_data = MultiModalTensor(data=pixel_values,
-                                       start=offset,
-                                       end=offset + num_pad,
-                                       meta=dict(
-                                           image_token_id=image_token_id,
-                                           images_spatial_crop=images_spatial_crop,
-                                       ))
+            mm_data = MultiModalData(data=pixel_values,
+                                     start=offset,
+                                     end=offset + num_pad,
+                                     meta=dict(
+                                         image_token_id=image_token_id,
+                                         images_spatial_crop=images_spatial_crop,
+                                     ))
 
             input_imgs.append(mm_data)

lmdeploy/pytorch/models/gemma3_vl.py (10 changes: 5 additions & 5 deletions)

@@ -8,7 +8,7 @@

 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .patch import build_model_from_hf_config
@@ -108,10 +108,10 @@ def preprocess_input(self,
             if isinstance(num_pad, torch.Tensor):
                 num_pad = num_pad.item()
 
-            mm_data = MultiModalTensor(data=pixel_values,
-                                       start=offset,
-                                       end=offset + num_pad,
-                                       meta=dict(image_token_id=image_token_id))
+            mm_data = MultiModalData(data=pixel_values,
+                                     start=offset,
+                                     end=offset + num_pad,
+                                     meta=dict(image_token_id=image_token_id))
             input_imgs.append(mm_data)
 
         result = PreprocessInputResult(
lmdeploy/pytorch/models/glm4_1v.py (10 changes: 5 additions & 5 deletions)

@@ -11,7 +11,7 @@

 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, FlashAttention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config
 from lmdeploy.pytorch.nn.linear import build_merged_colwise_linear, build_qkv_proj, build_rowwise_linear
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
@@ -865,10 +865,10 @@ def preprocess_input(self,
             if isinstance(num_pad, torch.Tensor):
                 num_pad = num_pad.item()
 
-            mm_data = MultiModalTensor(data=pixel_values,
-                                       start=start,
-                                       end=start + num_pad,
-                                       meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id))
+            mm_data = MultiModalData(data=pixel_values,
+                                     start=start,
+                                     end=start + num_pad,
+                                     meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id))
             input_imgs.append(mm_data)
 
         result = PreprocessInputResult(
lmdeploy/pytorch/models/interns1_pro.py (136 changes: 88 additions & 48 deletions)

@@ -7,8 +7,9 @@

 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
+from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
+from lmdeploy.vl.constants import Modality
 
 from .interns1_pro_ts import InternS1ProTimeSeriesModel
 from .patch import add_prefix, get_build_model_context
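The new Modality import is the pivot of this refactor. Its definition is not part of this diff; a hypothetical stand-in consistent with the members exercised below:

```python
from enum import Enum, auto


# Hypothetical stand-in for lmdeploy.vl.constants.Modality; only these three
# members are actually used by the code in this PR.
class Modality(Enum):
    IMAGE = auto()
    VIDEO = auto()
    TIME_SERIES = auto()
```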
@@ -173,25 +174,28 @@ def prepare_inputs_for_generation(
         ts_sr = None
         ts_mask = None
         if context.input_multimodals is not None:
-            mm_data = [input_mm.get('image', []) for input_mm in context.input_multimodals]
+            mm_inputs = [input_mm.get('mm_data', []) for input_mm in context.input_multimodals]
             # flatten batch
-            mm_data = [data for im_data in mm_data for data in im_data]
+            mm_inputs = [item for sublist in mm_inputs for item in sublist]
 
-            if len(mm_data) > 0:
-                is_time_series = mm_data[0].meta.get('ts_token_id', False)
+            if len(mm_inputs) > 0:
+                modality = mm_inputs[0].modality
+                image_token_id = mm_inputs[0].meta.get('image_token_id')
+                video_token_id = mm_inputs[0].meta.get('video_token_id')
+                ts_token_id = mm_inputs[0].meta.get('ts_token_id')
 
-                if is_time_series:
-                    ts_values = mm_data
-                    ts_token_id = ts_values[0].meta['ts_token_id']
-                    ts_lens = ts_values[0].meta['ts_lens']
-                    ts_sr = ts_values[0].meta['ts_sr']
+                if modality == Modality.TIME_SERIES:
+                    ts_values = torch.cat([inp.data for inp in mm_inputs])
                     ts_mask = input_ids == ts_token_id
-                    ts_values = torch.cat([data.data for data in ts_values])
+
+                    ts_lens = mm_inputs[0].meta['ts_lens']
+                    ts_sr = mm_inputs[0].meta['ts_sr']
                 else:
-                    pixel_values = torch.cat([data.data for data in mm_data])
-                    image_token_id = mm_data[0].meta['image_token_id']
-                    image_mask = input_ids == image_token_id
-                    grid_thw = torch.cat([data.meta['grid_thw'] for data in mm_data]).cpu()
+                    pixel_values = torch.cat([inp.data for inp in mm_inputs])
+                    mm_token_id = image_token_id if modality == Modality.IMAGE else video_token_id
+                    image_mask = (input_ids == mm_token_id)
+
+                    grid_thw = torch.cat([data.meta['grid_thw'] for data in mm_inputs]).cpu()
                     vis_pos_emb = self.visual.rot_pos_emb(grid_thw)
                     pos_embeds = self.visual.fast_pos_embed_interpolate(grid_thw)
                     vis_cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2],
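An aside on how masks like image_mask and ts_mask are typically consumed downstream (a generic sketch, not code from this file): the boolean mask marks the placeholder-token positions so projected multimodal features can be scattered into the text embedding sequence.

```python
import torch


def merge_multimodal_embeddings(inputs_embeds: torch.Tensor, mm_features: torch.Tensor,
                                mm_mask: torch.Tensor) -> torch.Tensor:
    """Generic sketch: scatter multimodal features into placeholder positions.

    inputs_embeds: (seq_len, hidden) text embeddings
    mm_features:   (num_placeholder_tokens, hidden) projected features
    mm_mask:       (seq_len,) bool, True at placeholder tokens
    """
    out = inputs_embeds.clone()
    out[mm_mask] = mm_features.to(inputs_embeds.dtype)
    return out
```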
@@ -365,6 +369,63 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype) -> None:
         self.config = config
         self.dtype = dtype
 
+    def _make_image_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData:
+        """Make image MultiModalData."""
+        pixel_values = input_mm['pixel_values'].to(self.dtype)
+        image_grid_thw = input_mm['image_grid_thw']
+        offset = input_mm['offset']
+        start = offset
+        image_token_id = input_mm['image_token_id']
+        num_pad = input_mm['image_tokens']
+        if isinstance(num_pad, torch.Tensor):
+            num_pad = num_pad.item()
+
+        mm_data = MultiModalData(modality=Modality.IMAGE,
+                                 data=pixel_values,
+                                 start=start,
+                                 end=start + num_pad,
+                                 meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id))
+        return mm_data
+
+    def _make_video_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData:
+        """Make video MultiModalData."""
+        pixel_values_videos = input_mm['pixel_values_videos'].to(self.dtype)
+        video_grid_thw = input_mm['video_grid_thw']
+        offset = input_mm['offset']
+        start = offset
+        video_token_id = input_mm['video_token_id']
+        num_pad = input_mm['video_tokens']
+        if isinstance(num_pad, torch.Tensor):
+            num_pad = num_pad.item()
+
+        mm_data = MultiModalData(modality=Modality.VIDEO,
+                                 data=pixel_values_videos,
+                                 start=start,
+                                 end=start + num_pad,
+                                 meta=dict(
+                                     grid_thw=video_grid_thw,
+                                     video_token_id=video_token_id,
+                                 ))
+        return mm_data
+
+    def _make_time_series_mm_data(self, input_mm: Dict[str, Any]) -> MultiModalData:
+        """Make time series MultiModalData."""
+        ts_values = input_mm['ts_values'].to(self.dtype)
+        offset = input_mm['offset']
+        ts_token_id = input_mm['ts_token_id']
+        ts_lens = input_mm['ts_lens']
+        ts_sr = input_mm['ts_sr']
+        num_pad = input_mm['ts_tokens']
+        if isinstance(num_pad, torch.Tensor):
+            num_pad = num_pad.item()
+
+        mm_data = MultiModalData(modality=Modality.TIME_SERIES,
+                                 data=ts_values,
+                                 start=offset,
+                                 end=offset + num_pad,
+                                 meta=dict(ts_lens=ts_lens, ts_sr=ts_sr, ts_token_id=ts_token_id))
+        return mm_data
+
     def preprocess_input(self,
                          input_ids: List[int],
                          input_multimodals: List[Dict[str, Any]] = None,
@@ -373,38 +434,17 @@ def preprocess_input(self,
         if input_multimodals is None or len(input_multimodals) == 0:
             return input_ids, input_multimodals
 
-        input_imgs = []
+        input_mm_data = []
         for input_mm in input_multimodals:
-            if 'ts_values' in input_mm:
-                ts_values = input_mm['ts_values'].to(self.dtype)
-                offset = input_mm['offset']
-                ts_token_id = input_mm['ts_token_id']
-                ts_lens = input_mm['ts_lens']
-                ts_sr = input_mm['ts_sr']
-                num_pad = input_mm['num_ts_tokens']
-
-                mm_data = MultiModalTensor(data=ts_values,
-                                           start=offset,
-                                           end=offset + num_pad,
-                                           meta=dict(ts_token_id=ts_token_id, ts_lens=ts_lens, ts_sr=ts_sr))
-            else:
-                pixel_values = input_mm['pixel_values'].to(self.dtype)
-                image_grid_thw = input_mm['image_grid_thw']
-                offset = input_mm['offset']
-                start = offset
-                image_token_id = input_mm['image_token_id']
-                num_pad = input_mm['image_tokens']
-                if isinstance(num_pad, torch.Tensor):
-                    num_pad = num_pad.item()
-
-                mm_data = MultiModalTensor(data=pixel_values,
-                                           start=start,
-                                           end=start + num_pad,
-                                           meta=dict(grid_thw=image_grid_thw, image_token_id=image_token_id))
-            input_imgs.append(mm_data)
 
-        result = PreprocessInputResult(
-            input_ids=input_ids,
-            input_multimodals=dict(image=input_imgs),
-        )
+            modality = input_mm.get('modality')
+            if modality == Modality.IMAGE:
[Review comment from a Collaborator]
Extracting the code in each branch into its own function would make it more readable, for example:

    if modality == IMAGE:
        mm_data = self.make_image_mm(...)
    elif modality == VIDEO:
        mm_data = self.make_video_mm(...)
    ...
[End of review comment]
+                mm_data = self._make_image_mm_data(input_mm)
+            elif modality == Modality.VIDEO:
+                mm_data = self._make_video_mm_data(input_mm)
+            elif modality == Modality.TIME_SERIES:
+                mm_data = self._make_time_series_mm_data(input_mm)
+            input_mm_data.append(mm_data)
 
+        result = PreprocessInputResult(input_ids=input_ids, input_multimodals=dict(mm_data=input_mm_data))
 
         return result
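A sketch of how the refactored processor might be driven, assuming an upstream preprocessor emits dicts shaped like the ones the three helpers read; every concrete value below (shapes, token id, offset) is illustrative, and `processor` and `input_ids` are assumed to be in scope:

```python
import torch

# Illustrative image entry with the keys _make_image_mm_data expects.
input_mm = dict(
    modality=Modality.IMAGE,
    pixel_values=torch.rand(256, 1176),          # placeholder shape
    image_grid_thw=torch.tensor([[1, 16, 16]]),  # placeholder grid
    offset=5,                                    # first image-token position
    image_token_id=151655,                       # placeholder token id
    image_tokens=64,                             # number of padded image tokens
)
result = processor.preprocess_input(input_ids, [input_mm])
# result.input_multimodals == {'mm_data': [MultiModalData(modality=Modality.IMAGE, ...)]}
```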
lmdeploy/pytorch/models/interns1_pro_ts.py (2 changes: 1 addition & 1 deletion)

@@ -154,7 +154,7 @@ def __init__(self, d_model, max_len=20000, dtype: torch.dtype = None, device: to
         div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model))
         pe[:, 0::2] = torch.sin(position * div_term)
         pe[:, 1::2] = torch.cos(position * div_term)
-        # TODO: zhouxinyu, hf forces float32 during init, but becomes bf16 during forward
+        # hf forces float32 during init, but becomes bf16 during forward
         pe = pe.unsqueeze(0).transpose(0, 1).to(dtype=dtype, device=device)  # (max_len, 1, d_model)
         self.register_buffer('pe', pe, persistent=True)

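For reference, the buffer built in this __init__ is the standard sinusoidal positional encoding. With position p, feature-pair index i, and width d = d_model, the code's div_term equals 10000^{-2i/d}, so the table it fills is

$$\mathrm{PE}(p,\,2i) = \sin\left(p \cdot 10000^{-2i/d}\right), \qquad \mathrm{PE}(p,\,2i+1) = \cos\left(p \cdot 10000^{-2i/d}\right)$$

computed in float32 for accuracy and only then cast to the runtime dtype, which is exactly what the rewritten comment is pointing at.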