mllm_mapper.py
from data_juicer.ops.base_op import OPERATORS, Mapper
from data_juicer.ops.op_fusion import LOADED_IMAGES
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_image
from data_juicer.utils.model_utils import get_model, prepare_model

OP_NAME = 'mllm_mapper'

torch = LazyLoader('torch', 'torch')
transformers = LazyLoader('transformers', 'transformers')

# keep torch single-threaded in each worker process
torch.set_num_threads(1)

@LOADED_IMAGES.register_module(OP_NAME)
@OPERATORS.register_module(OP_NAME)
class MllmMapper(Mapper):
    """Mapper to use MLLMs for visual question answering tasks.

    Recommended model list: [
        llava-hf/llava-v1.6-vicuna-7b-hf,
        Qwen/Qwen2-VL-7B-Instruct,
    ]
    """

    _accelerator = 'cuda'

    def __init__(self,
                 hf_model: str = 'llava-hf/llava-v1.6-vicuna-7b-hf',
                 max_new_tokens=256,
                 temperature=0.2,
                 top_p=None,
                 num_beams=1,
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param hf_model: Hugging Face model id.
        :param max_new_tokens: the maximum number of new tokens
            generated by the model.
        :param temperature: used to control the randomness of \
            generated text. The higher the temperature, the more \
            random and creative the generated text will be.
        :param top_p: randomly select the next word from the group \
            of words whose cumulative probability reaches p.
        :param num_beams: the larger the beam search size, the higher \
            the quality of the generated text.
        :param args: extra args
        :param kwargs: extra args
        """
        kwargs.setdefault('mem_required', '32GB')
        kwargs.setdefault('num_proc', 1)
        super().__init__(*args, **kwargs)

        self.hf_model = hf_model
        self.model_key = prepare_model(
            model_type='huggingface', pretrained_model_name_or_path=hf_model)
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature
        self.top_p = top_p
        self.num_beams = num_beams

    def process_single(self, sample=None, rank=None):
        # there is no image in this sample
        if self.image_key not in sample or not sample[self.image_key]:
            return sample

        # load images, avoiding loading the same image twice
        loaded_image_keys = sample[self.image_key]
        images = {}
        for loaded_image_key in loaded_image_keys:
            if loaded_image_key not in images:
                images[loaded_image_key] = load_image(loaded_image_key)

        model, processor = get_model(model_key=self.model_key,
                                     rank=rank,
                                     use_cuda=self.use_cuda())

        # build a single-turn chat prompt pairing the sample text with
        # one image placeholder
        conversation = [
            {
                'role': 'user',
                'content': [
                    {
                        'type': 'text',
                        'text': sample[self.text_key]
                    },
                    {
                        'type': 'image'
                    },
                ],
            },
        ]
        prompt = processor.apply_chat_template(conversation,
                                               add_generation_prompt=True)

        # generate one answer per distinct image; the answers replace the
        # original text of the sample
        sample[self.text_key] = []
        for image_key in images:
            inputs = processor(images=images[image_key],
                               text=prompt,
                               return_tensors='pt').to(model.device)
            response = model.generate(**inputs,
                                      max_new_tokens=self.max_new_tokens,
                                      temperature=self.temperature,
                                      top_p=self.top_p,
                                      num_beams=self.num_beams)
            output = processor.decode(response.cpu()[0],
                                      skip_special_tokens=True)
            sample[self.text_key].append(output)

        return sample
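

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original operator): shows how this mapper
# might be applied to a single Data-Juicer-style sample dict. The question
# text and image path below are hypothetical placeholders, and 'text' /
# 'images' are assumed to match the mapper's default text_key / image_key.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    op = MllmMapper(hf_model='llava-hf/llava-v1.6-vicuna-7b-hf',
                    max_new_tokens=64)
    sample = {
        'text': 'What objects are visible in this image?',
        'images': ['path/to/example.jpg'],  # hypothetical image path
    }
    result = op.process_single(sample)
    # result['text'] now holds one generated answer per distinct image
    print(result['text'])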