From d2c760626622f259437dc49268140085b4cba9bd Mon Sep 17 00:00:00 2001 From: aamita Date: Thu, 25 Jul 2024 04:35:59 +0000 Subject: [PATCH 01/15] VILA added --- README.md | 4 +- vlmeval/config.py | 16 ++++-- vlmeval/vlm/__init__.py | 1 + vlmeval/vlm/vila.py | 110 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 125 insertions(+), 6 deletions(-) create mode 100644 vlmeval/vlm/vila.py diff --git a/README.md b/README.md index 9a19ccd8..2075ae2c 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ English | [简体中文](/docs/zh-CN/README_zh-CN.md) | [日本語](/docs/ja/REA ## 🆕 News +- **[2024-07-24]** We have supported [**VILA**](https://github.com/NVlabs/VILA/) 🔥🔥🔥, evaluation results coming soon🔥🔥🔥 - **[2024-07-23]** We have supported [**Video-LLaVA**](https://github.com/PKU-YuanGroup/Video-LLaVA) 🔥🔥🔥, the first Video-LLM to be supported by our repository! Using [**this fork version**](https://github.com/FangXinyu-0913/Video-LLaVA) to install Video-LLaVA (**More Recommended**) or install [**transformers**](https://huggingface.co/docs/transformers/v4.42.0/en/model_doc/video_llava) to use it! - **[2024-07-23]** We have supported [**Video-MME**](https://video-mme.github.io/), thanks to [**Yuhan Dai**](https://github.com/dirkiedai)🔥🔥🔥 - **[2024-07-22]** We have supported [**MT-VQA**](https://github.com/bytedance/MTVQA), thanks to Jinghui Lu 🔥🔥🔥 @@ -99,7 +100,7 @@ VLMEvalKit will use a **judge LLM** to extract answer from the output if you set | [**DeepSeek-VL**](https://github.com/deepseek-ai/DeepSeek-VL/tree/main)🎞️ | [**LLaVA-NeXT**](https://llava-vl.github.io/blog/2024-01-30-llava-next/)🚅 | [**Bunny-Llama3**](https://huggingface.co/BAAI/Bunny-v1_1-Llama-3-8B-V)🚅 | [**XVERSE-V-13B**](https://github.com/xverse-ai/XVERSE-V-13B/blob/main/vxverse/models/vxverse.py) | | [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448) 🚅 | [**360VL-70B**](https://huggingface.co/qihoo360/360VL-70B) 🚅 | [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)🚅 | [**WeMM**](https://github.com/scenarios/WeMM)🚅 | | [**GLM-4v-9B**](https://huggingface.co/THUDM/glm-4v-9b) 🚅 | [**Cambrian-[8B/13B/34B]**](https://cambrian-mllm.github.io/) | [**LLaVA-Next-[Interleave-7B/LLaMA-3/Qwen-32B]**](https://huggingface.co/lmms-lab/llava-next-interleave-qwen-7b) 🎞️ | [**Chameleon-[7B/30B]**](https://huggingface.co/facebook/chameleon-7b)🚅🎞️ | -| [**Video-LLaVA-7B-[HF]**](https://github.com/PKU-YuanGroup/Video-LLaVA) 🎬 | | | | +| [**Video-LLaVA-7B-[HF]**](https://github.com/PKU-YuanGroup/Video-LLaVA) 🎬 | [**VILA1.5-[8B/13B/40B]**](https://github.com/NVlabs/VILA/) | | | 🎞️: Support multiple images as inputs. @@ -112,6 +113,7 @@ VLMEvalKit will use a **judge LLM** to extract answer from the output if you set Note that some VLMs may not be able to run under certain transformer versions, we recommend the following settings to evaluate each VLM: - **Please use** `transformers==4.33.0` **for**: `Qwen series`, `Monkey series`, `InternLM-XComposer Series`, `mPLUG-Owl2`, `OpenFlamingo v2`, `IDEFICS series`, `VisualGLM`, `MMAlaya`, `ShareCaptioner`, `MiniGPT-4 series`, `InstructBLIP series`, `PandaGPT`, `VXVERSE`, `GLM-4v-9B`. +- **Please use** `transformers==4.36.2` **for**: `VILA Series` - **Please use** `transformers==4.37.0` **for**: `LLaVA series`, `ShareGPT4V series`, `TransCore-M`, `LLaVA (XTuner)`, `CogVLM Series`, `EMU2 Series`, `Yi-VL Series`, `MiniCPM-[V1/V2]`, `OmniLMM-12B`, `DeepSeek-VL series`, `InternVL series`, `Cambrian Series`. 
- **Please use** `transformers==4.40.0` **for**: `IDEFICS2`, `Bunny-Llama3`, `MiniCPM-Llama3-V2.5`, `LLaVA-Next series`, `360VL-70B`, `Phi-3-Vision`, `WeMM`. - **Please use** `transformers==latest` **for**: `PaliGemma-3B`, `Chameleon series`, `Video-LLaVA-7B-HF`. diff --git a/vlmeval/config.py b/vlmeval/config.py index e0fdfccc..7a1127b7 100644 --- a/vlmeval/config.py +++ b/vlmeval/config.py @@ -50,9 +50,9 @@ 'QwenVLPlus': partial(QwenVLAPI, model='qwen-vl-plus', temperature=0, retry=10), 'QwenVLMax': partial(QwenVLAPI, model='qwen-vl-max', temperature=0, retry=10), # Reka Series - 'RekaEdge': partial(Reka, model='reka-edge-20240208'), - 'RekaFlash': partial(Reka, model='reka-flash-20240226'), - 'RekaCore': partial(Reka, model='reka-core-20240415'), + 'RekaEdge': partial(Reka, model='reka-edge-20240208'), + 'RekaFlash': partial(Reka, model='reka-flash-20240226'), + 'RekaCore': partial(Reka, model='reka-core-20240415'), # Step1V Series 'Step1V': partial(GPT4V, model='step-1v-8k', api_base="https://api.stepfun.com/v1/chat/completions", temperature=0, retry=10), 'Step1V-0701': partial(GPT4V, model='step-1v-beta0701', api_base="https://api.stepfun.com/v1/chat/completions", temperature=0, retry=10), @@ -183,14 +183,20 @@ 'chameleon_30b': partial(Chameleon, model_path='facebook/chameleon-30b'), } +vila_series = { + 'vila_8b': partial(VILA, model_path='Efficient-Large-Model/Llama-3-VILA1.5-8b'), + 'vila_13b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-13b'), + 'vila_40b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-40b'), +} + supported_VLM = {} model_groups = [ - ungrouped, api_models, + ungrouped, api_models, xtuner_series, qwen_series, llava_series, internvl_series, yivl_series, xcomposer_series, minigpt4_series, idefics_series, instructblip_series, deepseekvl_series, minicpm_series, cogvlm_series, wemm_series, - cambrian_series, chameleon_series, video_models, + cambrian_series, chameleon_series, video_models,vila_series ] for grp in model_groups: diff --git a/vlmeval/vlm/__init__.py b/vlmeval/vlm/__init__.py index 77ea57a7..d751aacc 100644 --- a/vlmeval/vlm/__init__.py +++ b/vlmeval/vlm/__init__.py @@ -33,3 +33,4 @@ from .cambrian import Cambrian from .chameleon import Chameleon from .video_llm.video_llava import VideoLLaVA, VideoLLaVA_HF +from .vila import VILA diff --git a/vlmeval/vlm/vila.py b/vlmeval/vlm/vila.py new file mode 100644 index 00000000..24be5b7d --- /dev/null +++ b/vlmeval/vlm/vila.py @@ -0,0 +1,110 @@ +import torch +from PIL import Image +from abc import abstractproperty +import sys +import os.path as osp +from .base import BaseModel +from ..smp import * +from ..dataset import DATASET_TYPE +import copy + + +class VILA(BaseModel): + INSTALL_REQ = True + INTERLEAVE = True + + def __init__(self, + model_path='Efficient-Large-Model/Llama-3-VILA1.5-8b', + **kwargs): + try: + from llava.model.builder import load_pretrained_model + from llava.mm_utils import get_model_name_from_path + from llava.mm_utils import process_images, tokenizer_image_token, KeywordsStoppingCriteria + from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN # noqa E501 + from llava.conversation import conv_templates, SeparatorStyle + except: + warnings.warn('Please install VILA before using VILA') + warnings.warn('Please install VILA from https://github.com/NVlabs/VILA') + warnings.warn('Please install VLMEvalKit after installing VILA') + warnings.warn('VILA is supported only with transformers==4.36.2') + sys.exit(-1) + + 
warnings.warn('Please install the latest version of VILA from GitHub before you evaluate the VILA model.') + assert osp.exists(model_path) or len(model_path.split('/')) == 2 + + model_name = get_model_name_from_path(model_path) + + try: + self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model( + model_path=model_path, + model_base=None, + model_name=model_name, + device='cpu', + device_map='cpu' + ) + except Exception as e: + warnings.warn(f'Error loading VILA model: {e}') + exit(-1) + + self.model = self.model.cuda() + if '8b' in model_path: + self.conv_mode = 'llama_3' + elif '13b' in model_path: + self.conv_mode = 'vicuna_v1' + elif '40b' in model_path: + self.conv_mode = 'hermes-2' + + kwargs_default = dict(do_sample=False, temperature=0, max_new_tokens=512, top_p=None, num_beams=1, use_cache=True) # noqa E501 + + kwargs_default.update(kwargs) + self.kwargs = kwargs_default + warnings.warn(f'Using the following kwargs for generation config: {self.kwargs}') + + self.conv_templates = conv_templates + self.process_images = process_images + self.tokenizer_image_token = tokenizer_image_token + self. DEFAULT_IMAGE_TOKEN = DEFAULT_IMAGE_TOKEN + self.SeparatorStyle = SeparatorStyle + self.IMAGE_TOKEN_INDEX = IMAGE_TOKEN_INDEX + self.KeywordsStoppingCriteria = KeywordsStoppingCriteria + + def use_custom_prompt(self, dataset): + assert dataset is not None + # TODO see if custom prompt needed + return False + + def generate_inner(self, message, dataset=None): + + content, images = '', [] + + for msg in message: + if msg['type'] == 'text': + content += msg['value'] + elif msg['type'] == 'image': + image = Image.open(msg['value']).convert('RGB') + images.append(image) + content += (self.DEFAULT_IMAGE_TOKEN + '\n') + + image_tensor = self.process_images( + images, self.image_processor, + self.model.config).to(self.model.device, dtype=torch.float16) + + # Support interleave text and image + conv = self.conv_templates[self.conv_mode].copy() + conv.append_message(conv.roles[0], content) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + input_ids = self.tokenizer_image_token(prompt, self.tokenizer, self.IMAGE_TOKEN_INDEX, + return_tensors='pt').unsqueeze(0).cuda() + + stop_str = conv.sep if conv.sep_style != self.SeparatorStyle.TWO else conv.sep2 + keywords = [stop_str] + stopping_criteria = self.KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids) + + with torch.inference_mode(): + output_ids = self.model.generate( + input_ids, images=image_tensor, stopping_criteria=[stopping_criteria], **self.kwargs) + + output = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() + return output From 58b2f24fb2a9df5ffbc5dbb020fa0b8bcb1700c8 Mon Sep 17 00:00:00 2001 From: Junming Yang <60545459+junming-yang@users.noreply.github.com> Date: Thu, 25 Jul 2024 16:25:22 +0800 Subject: [PATCH 02/15] Update README.md --- README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.md b/README.md index f7aa0c98..8f67385c 100644 --- a/README.md +++ b/README.md @@ -35,10 +35,6 @@ English | [简体中文](/docs/zh-CN/README_zh-CN.md) | [日本語](/docs/ja/REA - **[2024-07-19]** We have supported [**LLaVA-Next-Interleave-7B**](https://huggingface.co/lmms-lab/llava-next-interleave-qwen-7b) and [**LLaVA-Next-LLaMA-3**](https://huggingface.co/lmms-lab/llama3-llava-next-8b) 🔥🔥🔥 - **[2024-07-18]** We have supported [**BLINK**](https://zeyofu.github.io/blink/), thanks to [**zeyofu**](https://zeyofu.github.io)🔥🔥🔥 - **[2024-07-18]** We released the 
first version of [**VLMEvalKit Technical Report**](https://www.arxiv.org/abs/2407.11691). We greatly welcome contributions from the community and are glad to share the corresponding credit: All Contributions will be acknowledged in the report and Contributors with 3 or more major contributions (implementing an MLLM, benchmark, or major feature) can share the authorship of the tech report 🔥🔥🔥 -- **[2024-07-12]** We have supported [**MMLongBench-Doc**](https://mayubo2333.github.io/MMLongBench-Doc/), a benchmark for long-context document understanding, thanks to [**mayubo2333**](https://github.com/mayubo2333) 🔥🔥🔥 -- **[2024-07-12]** We have supported [**VCR**](https://github.com/tianyu-z/vcr), a benchmark for visual caption restoration evaluation, thanks to [**tianyu-z**](https://github.com/tianyu-z) and [**sheryc**](https://github.com/sheryc) 🔥🔥🔥 -- **[2024-07-08]** We have supported [**InternLM-XComposer-2.5**](https://github.com/InternLM/InternLM-XComposer), thanks to [**LightDXY**](https://github.com/LightDXY) 🔥🔥🔥 -- **[2024-07-08]** We have supported [**InternVL2**](https://huggingface.co/OpenGVLab/InternVL2-26B), thanks to [**czczup**](https://github.com/czczup) 🔥🔥🔥 ## 📊 Datasets, Models, and Evaluation Results From 004614bb242f3cac5b88404a1fc29d8229aaaa1a Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Thu, 25 Jul 2024 17:28:27 +0000 Subject: [PATCH 03/15] resolve config merge conflict --- vlmeval/config.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vlmeval/config.py b/vlmeval/config.py index 116c4651..5b3ffeb2 100644 --- a/vlmeval/config.py +++ b/vlmeval/config.py @@ -187,10 +187,7 @@ 'vila_8b': partial(VILA, model_path='Efficient-Large-Model/Llama-3-VILA1.5-8b'), 'vila_13b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-13b'), 'vila_40b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-40b'), -<<<<<<< HEAD -======= } ->>>>>>> upstream/main ovis_series = { 'Ovis1.5-Llama3-8B': partial(Ovis, model_path='AIDC-AI/Ovis1.5-Llama3-8B') From d3149dcc302d861abcab8837951ea808dd97f844 Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Thu, 25 Jul 2024 19:21:30 +0000 Subject: [PATCH 04/15] Fix error on Idefics for longer prompt --- vlmeval/vlm/idefics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vlmeval/vlm/idefics.py b/vlmeval/vlm/idefics.py index 0c35d483..99363ed7 100644 --- a/vlmeval/vlm/idefics.py +++ b/vlmeval/vlm/idefics.py @@ -20,7 +20,7 @@ def __init__(self, model_pth='HuggingFaceM4/idefics-9b-instruct', **kwargs): model_pth, torch_dtype=torch.bfloat16, device_map='auto' ) self.processor = AutoProcessor.from_pretrained(model_pth) - kwargs_default = {'max_length': 512} + kwargs_default = {'max_new_tokens': 512} kwargs_default.update(kwargs) self.kwargs = kwargs_default self.file_root = osp.dirname(__file__) From 30523a49c56314444b8545f04bdd3c9a5bf59074 Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Sat, 27 Jul 2024 02:36:00 +0000 Subject: [PATCH 05/15] Fix naming convention to make consistent with Idefics2 and better readability --- vlmeval/vlm/idefics.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vlmeval/vlm/idefics.py b/vlmeval/vlm/idefics.py index 99363ed7..178333c5 100644 --- a/vlmeval/vlm/idefics.py +++ b/vlmeval/vlm/idefics.py @@ -12,14 +12,14 @@ class IDEFICS(BaseModel): INSTALL_REQ = False INTERLEAVE = True - def __init__(self, model_pth='HuggingFaceM4/idefics-9b-instruct', **kwargs): - assert osp.exists(model_pth) or splitlen(model_pth) == 2 + def __init__(self, 
model_path='HuggingFaceM4/idefics-9b-instruct', **kwargs): + assert osp.exists(model_path) or splitlen(model_path) == 2 from transformers import IdeficsForVisionText2Text, AutoProcessor self.model = IdeficsForVisionText2Text.from_pretrained( - model_pth, torch_dtype=torch.bfloat16, device_map='auto' + model_path, torch_dtype=torch.bfloat16, device_map='auto' ) - self.processor = AutoProcessor.from_pretrained(model_pth) + self.processor = AutoProcessor.from_pretrained(model_path) kwargs_default = {'max_new_tokens': 512} kwargs_default.update(kwargs) self.kwargs = kwargs_default @@ -31,7 +31,7 @@ def __init__(self, model_pth='HuggingFaceM4/idefics-9b-instruct', **kwargs): def generate_inner(self, message, dataset=None): prompts = ( ['Users:'] - + [x['value'] if x['type'] == 'text' else Image.open(x['value']) for x in message] + + [msg['value'] if msg['type'] == 'text' else Image.open(msg['value']) for msg in message] + ['', '\nAssistant: '] ) inputs = self.processor( From 072f20c556cc11b7404a09087eeb50520feec360 Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Sat, 27 Jul 2024 15:45:01 +0000 Subject: [PATCH 06/15] update config for idefics --- vlmeval/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vlmeval/config.py b/vlmeval/config.py index a8fb015b..588c352c 100644 --- a/vlmeval/config.py +++ b/vlmeval/config.py @@ -143,8 +143,8 @@ } idefics_series = { - 'idefics_9b_instruct': partial(IDEFICS, model_pth='HuggingFaceM4/idefics-9b-instruct'), - 'idefics_80b_instruct': partial(IDEFICS, model_pth='HuggingFaceM4/idefics-80b-instruct'), + 'idefics_9b_instruct': partial(IDEFICS, model_path='HuggingFaceM4/idefics-9b-instruct'), + 'idefics_80b_instruct': partial(IDEFICS, model_path='HuggingFaceM4/idefics-80b-instruct'), 'idefics2_8b': partial(IDEFICS2, model_path='HuggingFaceM4/idefics2-8b'), } From 63561bb5999524b56b1830c747391299401737da Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Sat, 27 Jul 2024 15:48:14 +0000 Subject: [PATCH 07/15] Make LLava consistent as well --- vlmeval/config.py | 18 +++++++++--------- vlmeval/vlm/llava/llava.py | 38 +++++++++++++++++++------------------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/vlmeval/config.py b/vlmeval/config.py index 588c352c..10b87575 100644 --- a/vlmeval/config.py +++ b/vlmeval/config.py @@ -90,15 +90,15 @@ } llava_series = { - 'llava_v1.5_7b': partial(LLaVA, model_pth='liuhaotian/llava-v1.5-7b'), - 'llava_v1.5_13b': partial(LLaVA, model_pth='liuhaotian/llava-v1.5-13b'), - 'llava_v1_7b': partial(LLaVA, model_pth=LLAVA_V1_7B_MODEL_PTH), - 'sharegpt4v_7b': partial(LLaVA, model_pth='Lin-Chen/ShareGPT4V-7B'), - 'sharegpt4v_13b': partial(LLaVA, model_pth='Lin-Chen/ShareGPT4V-13B'), - 'llava_next_vicuna_7b': partial(LLaVA_Next, model_pth='llava-hf/llava-v1.6-vicuna-7b-hf'), - 'llava_next_vicuna_13b': partial(LLaVA_Next, model_pth='llava-hf/llava-v1.6-vicuna-13b-hf'), - 'llava_next_mistral_7b': partial(LLaVA_Next, model_pth='llava-hf/llava-v1.6-mistral-7b-hf'), - 'llava_next_yi_34b': partial(LLaVA_Next, model_pth='llava-hf/llava-v1.6-34b-hf'), + 'llava_v1.5_7b': partial(LLaVA, model_path='liuhaotian/llava-v1.5-7b'), + 'llava_v1.5_13b': partial(LLaVA, model_path='liuhaotian/llava-v1.5-13b'), + 'llava_v1_7b': partial(LLaVA, model_path=LLAVA_V1_7B_MODEL_PTH), + 'sharegpt4v_7b': partial(LLaVA, model_path='Lin-Chen/ShareGPT4V-7B'), + 'sharegpt4v_13b': partial(LLaVA, model_path='Lin-Chen/ShareGPT4V-13B'), + 'llava_next_vicuna_7b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-vicuna-7b-hf'), + 
'llava_next_vicuna_13b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-vicuna-13b-hf'), + 'llava_next_mistral_7b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-mistral-7b-hf'), + 'llava_next_yi_34b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-34b-hf'), 'llava_next_llama3': partial(LLaVA_Next2, model_path='lmms-lab/llama3-llava-next-8b'), 'llava_next_qwen_32b': partial(LLaVA_Next2, model_path='lmms-lab/llava-next-qwen-32b'), 'llava_next_interleave_7b': partial(LLaVA_Next2, model_path='lmms-lab/llava-next-interleave-qwen-7b'), diff --git a/vlmeval/vlm/llava/llava.py b/vlmeval/vlm/llava/llava.py index 242e970c..5918cfbc 100644 --- a/vlmeval/vlm/llava/llava.py +++ b/vlmeval/vlm/llava/llava.py @@ -15,7 +15,7 @@ class LLaVA(BaseModel): INTERLEAVE = True def __init__(self, - model_pth='liuhaotian/llava_v1.5_7b', + model_path='liuhaotian/llava_v1.5_7b', **kwargs): try: from llava.model.builder import load_pretrained_model @@ -25,30 +25,30 @@ def __init__(self, sys.exit(-1) warnings.warn('Please install the latest version of llava from github before you evaluate the LLaVA model. ') - assert osp.exists(model_pth) or splitlen(model_pth) == 2 + assert osp.exists(model_path) or splitlen(model_path) == 2 self.system_prompt = ( 'A chat between a curious human and an artificial intelligence assistant. ' "The assistant gives helpful, detailed, and polite answers to the human's questions. " ) self.stop_str = '' - if model_pth == 'Lin-Chen/ShareGPT4V-7B': + if model_path == 'Lin-Chen/ShareGPT4V-7B': model_name = 'llava-v1.5-7b' - elif model_pth == 'Lin-Chen/ShareGPT4V-13B': + elif model_path == 'Lin-Chen/ShareGPT4V-13B': model_name = 'llava-v1.5-13b' else: - model_name = get_model_name_from_path(model_pth) + model_name = get_model_name_from_path(model_path) try: self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model( - model_path=model_pth, + model_path=model_path, model_base=None, model_name=model_name, device='cpu', device_map='cpu' ) except: - if 'ShareGPT4V' in model_pth: + if 'ShareGPT4V' in model_path: import llava warnings.warn( 'Please manually remove the encoder type check in ' @@ -174,14 +174,14 @@ class LLaVA_Next(BaseModel): INSTALL_REQ = False INTERLEAVE = False - def __init__(self, model_pth='llava-hf/llava-v1.6-vicuna-7b-hf', **kwargs): + def __init__(self, model_path='llava-hf/llava-v1.6-vicuna-7b-hf', **kwargs): import transformers from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration - self.model_pth = model_pth - if '34b' in model_pth.lower(): - self.processor = LlavaNextProcessor.from_pretrained(self.model_pth, use_fast=False) + self.model_path = model_path + if '34b' in model_path.lower(): + self.processor = LlavaNextProcessor.from_pretrained(self.model_path, use_fast=False) else: - self.processor = LlavaNextProcessor.from_pretrained(self.model_pth) + self.processor = LlavaNextProcessor.from_pretrained(self.model_path) flash_attn_flag = False try: import flash_attn @@ -191,10 +191,10 @@ def __init__(self, model_pth='llava-hf/llava-v1.6-vicuna-7b-hf', **kwargs): if flash_attn_flag: model = LlavaNextForConditionalGeneration.from_pretrained( - self.model_pth, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_flash_attention_2=True) + self.model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_flash_attention_2=True) else: model = LlavaNextForConditionalGeneration.from_pretrained( - self.model_pth, torch_dtype=torch.float16, low_cpu_mem_usage=True) + self.model_path, torch_dtype=torch.float16, 
low_cpu_mem_usage=True) model = model.eval() self.model = model.cuda() @@ -204,22 +204,22 @@ def __init__(self, model_pth='llava-hf/llava-v1.6-vicuna-7b-hf', **kwargs): warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') def apply_prompt_template(self, prompt): - model_pth = self.model_pth.lower() - if 'mistral' in model_pth: + model_path = self.model_path.lower() + if 'mistral' in model_path: template = '[INST] PLACEHOLDER [/INST]' - elif 'vicuna' in model_pth: + elif 'vicuna' in model_path: template = ( 'A chat between a curious human and an artificial intelligence assistant. ' "The assistant gives helpful, detailed, and polite answers to the human's questions. " 'USER: PLACEHOLDER ASSISTANT:' ) - elif '34b' in model_pth: + elif '34b' in model_path: template = ( '<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\nPLACEHOLDER<|im_end|>' '<|im_start|>assistant\n' ) else: - raise NotImplementedError(f'Prompt template for {model_pth} not implemented.') + raise NotImplementedError(f'Prompt template for {model_path} not implemented.') prompt = template.replace('PLACEHOLDER', f'\n{prompt}') return prompt From 2ca4d1dbdaa33aa74bec4930dfc372d5c984939f Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Mon, 29 Jul 2024 23:43:11 +0000 Subject: [PATCH 08/15] Add VILA 1.5 3B --- vlmeval/vlm/vila.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vlmeval/vlm/vila.py b/vlmeval/vlm/vila.py index 24be5b7d..965dfb19 100644 --- a/vlmeval/vlm/vila.py +++ b/vlmeval/vlm/vila.py @@ -47,6 +47,8 @@ def __init__(self, exit(-1) self.model = self.model.cuda() + if '3b' in model_path: + self.conv_mode = 'vicuna_v1' if '8b' in model_path: self.conv_mode = 'llama_3' elif '13b' in model_path: From da9eeddbece8cb116c9f0309447e971f5f056dac Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Mon, 29 Jul 2024 23:45:21 +0000 Subject: [PATCH 09/15] Add VILA 1.5 3B --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 293482e6..742da9d1 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,7 @@ VLMEvalKit will use a **judge LLM** to extract answer from the output if you set | [**DeepSeek-VL**](https://github.com/deepseek-ai/DeepSeek-VL/tree/main)🎞️ | [**LLaVA-NeXT**](https://llava-vl.github.io/blog/2024-01-30-llava-next/)🚅 | [**Bunny-Llama3**](https://huggingface.co/BAAI/Bunny-v1_1-Llama-3-8B-V)🚅 | [**XVERSE-V-13B**](https://github.com/xverse-ai/XVERSE-V-13B/blob/main/vxverse/models/vxverse.py) | | [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448) 🚅 | [**360VL-70B**](https://huggingface.co/qihoo360/360VL-70B) 🚅 | [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)🚅 | [**WeMM**](https://github.com/scenarios/WeMM)🚅 | | [**GLM-4v-9B**](https://huggingface.co/THUDM/glm-4v-9b) 🚅 | [**Cambrian-[8B/13B/34B]**](https://cambrian-mllm.github.io/) | [**LLaVA-Next-[Interleave-7B/LLaMA-3/Qwen-32B]**](https://huggingface.co/lmms-lab/llava-next-interleave-qwen-7b) 🎞️ | [**Chameleon-[7B/30B]**](https://huggingface.co/facebook/chameleon-7b)🚅🎞️ | -| [**Video-LLaVA-7B-[HF]**](https://github.com/PKU-YuanGroup/Video-LLaVA) 🎬 | [**VILA1.5-[8B/13B/40B]**](https://github.com/NVlabs/VILA/)🎞️ | [**Ovis1.5-Llama3-8B**](https://github.com/AIDC-AI/Ovis) 🚅🎞 | | +| [**Video-LLaVA-7B-[HF]**](https://github.com/PKU-YuanGroup/Video-LLaVA) 🎬 | [**VILA1.5-[3B/8B/13B/40B]**](https://github.com/NVlabs/VILA/)🎞️ | [**Ovis1.5-Llama3-8B**](https://github.com/AIDC-AI/Ovis) 🚅🎞 | | 🎞️: Support multiple images as 
inputs. From 28e33dc41be8200b875b7dcd5e59a9fa4edba051 Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Mon, 29 Jul 2024 23:45:47 +0000 Subject: [PATCH 10/15] fix naming convention to be similar to the HF models --- vlmeval/config.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vlmeval/config.py b/vlmeval/config.py index fd7b8d51..bbbc8392 100644 --- a/vlmeval/config.py +++ b/vlmeval/config.py @@ -185,9 +185,10 @@ } vila_series = { - 'vila_8b': partial(VILA, model_path='Efficient-Large-Model/Llama-3-VILA1.5-8b'), - 'vila_13b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-13b'), - 'vila_40b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-40b'), + 'VILA1.5-3b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-3b'), + 'Llama-3-VILA1.5-8b': partial(VILA, model_path='Efficient-Large-Model/Llama-3-VILA1.5-8b'), + 'VILA1.5-13b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-13b'), + 'VILA1.5-40b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-40b'), } ovis_series = { From 7f18c91e2a74eefe2ecc150b05628aac76bed6e8 Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Tue, 30 Jul 2024 22:13:30 +0000 Subject: [PATCH 11/15] Multi-Turn added for Phi3-Vision and tested with MMDU --- vlmeval/vlm/phi3_vision.py | 48 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/vlmeval/vlm/phi3_vision.py b/vlmeval/vlm/phi3_vision.py index 5c1a3ac3..fe0e4d53 100644 --- a/vlmeval/vlm/phi3_vision.py +++ b/vlmeval/vlm/phi3_vision.py @@ -51,3 +51,51 @@ def generate_inner(self, message, dataset=None): clean_up_tokenization_spaces=False )[0] return response + + def chat_inner(self, message, dataset=None): + + messages = [] + image_cnt = 1 + image_list = [] + for msg in message: + content = '' + # If message is just text in the conversation + if len(msg['content']) == 1 and msg['content'][0]['type'] == 'text': + msg_new = {'role': msg['role'], 'content': msg['content'][0]['value']} + messages.append(msg_new) + continue + + # If both image & text is present + for x in msg['content']: + if x['type'] == 'text': + content += x['value'] + elif x['type'] == 'image': + image = Image.open(x['value']).convert('RGB') + content += f'<|image_{image_cnt}|>\n' + image_list.append(image) + image_cnt += 1 + msg_new = {'role': msg['role'], 'content': content} + messages.append(msg_new) + + prompt = self.processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + inputs = self.processor(prompt, image_list, return_tensors='pt').to('cuda') + + generation_args = { + 'max_new_tokens': 500, + 'temperature': 0.0, + 'do_sample': False, + } + generation_args.update(self.kwargs) + + generate_ids = self.model.generate( + **inputs, + eos_token_id=self.processor.tokenizer.eos_token_id, + **generation_args + ) + generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] + response = self.processor.batch_decode( + generate_ids, + skip_special_tokens=True, + clean_up_tokenization_spaces=False + )[0] + return response From baaea5e26538e408a952cb584544d737603a6708 Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Sun, 11 Aug 2024 00:29:07 +0000 Subject: [PATCH 12/15] Add multi turn for Intern VL --- vlmeval/vlm/internvl_chat.py | 127 ++++++++++++++++++++++++++++++++++- 1 file changed, 126 insertions(+), 1 deletion(-) diff --git a/vlmeval/vlm/internvl_chat.py b/vlmeval/vlm/internvl_chat.py index 57edbdea..def9c151 100644 --- a/vlmeval/vlm/internvl_chat.py +++ b/vlmeval/vlm/internvl_chat.py @@ -12,6 +12,7 @@ import 
transformers from torchvision.transforms.functional import InterpolationMode +import re IMAGENET_MEAN = (0.485, 0.456, 0.406) @@ -137,6 +138,19 @@ def __init__(self, model_path='OpenGVLab/InternVL-Chat-V1-5', load_in_8bit=False self.model_path = model_path self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False) + # Regular expression to match the pattern 'Image' followed by a number, e.g. Image1 + self.pattern = r'Image(\d+)' + # Replacement pattern to insert a hyphen between 'Image' and the number, e.g. Image-1 + self.replacement = r'Image-\1' + + # Convert InternVL2 response to dataset format + # e.g. Image1 -> Image-1 + + # Regular expression to match the pattern 'Image-' followed by a number + self.reverse_pattern = r'Image-(\d+)' + # Replacement pattern to remove the hyphen (Image-1 -> Image1) + self.reverse_replacement = r'Image\1' + if listinstr(['InternVL2-Llama3-76B'], model_path): device_map = split_model(model_path.split('/')[-1]) self.model = AutoModel.from_pretrained( @@ -163,7 +177,12 @@ def __init__(self, model_path='OpenGVLab/InternVL-Chat-V1-5', load_in_8bit=False warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') def use_custom_prompt(self, dataset): - return True + + if dataset is not None and listinstr(['MMDU'], dataset): + # For Multi-Turn we don't have custom prompt + return False + else: + return True def build_multi_choice_prompt(self, line, dataset=None): question = line['question'] @@ -350,3 +369,109 @@ def generate_inner(self, message, dataset=None): return self.generate_v2(message, dataset) else: raise ValueError(f'Unsupported version: {self.version}') + + def build_history(self, message): + # Global Variables + image_path = [] + image_cnt = 0 + + def concat_tilist(tilist): + nonlocal image_cnt # Declare image_cnt as nonlocal to modify it + prompt = '' + for item in tilist: + # Substitute the pattern in the text + if item['type'] == 'text': + prompt += re.sub(self.pattern, self.replacement, item['value']) + elif item['type'] == 'image': + image_cnt += 1 + prompt += '\n' + image_path.append(item['value']) + return prompt + + # Only previous messages + assert len(message) % 2 == 0 + history = [] + for i in range(len(message) // 2): + m1, m2 = message[2 * i], message[2 * i + 1] + assert m1['role'] == 'user' and m2['role'] == 'assistant' + history.append((concat_tilist(m1['content']), concat_tilist(m2['content']))) + + return history, image_path, image_cnt + + def chat_inner_v2(self, message, dataset=None): + + image_cnt = 0 + if len(message) > 1: + history, image_path, image_cnt = self.build_history(message[:-1]) + else: + history, image_path, image_cnt = None, [], 1 + current_msg = message[-1] + question = '' + + # If message is just text in the conversation + if len(current_msg['content']) == 1 and current_msg['content'][0]['type'] == 'text': + question = current_msg['content'][0]['value'] + question = re.sub(self.pattern, self.replacement, question) # Fix pattern as per InternVL + else: + for msg in current_msg['content']: + if msg['type'] == 'text': + question += re.sub(self.pattern, self.replacement, msg['value']) + elif msg['type'] == 'image': + image_cnt += 1 + question += '\n' + image_path.append(msg['value']) + + if image_cnt > 1: + num_patches_list = [] + pixel_values_list = [] + for image_idx, file_name in enumerate(image_path): + upscale_flag = image_idx == 0 and dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset) + curr_pixel_values = load_image( + file_name, 
max_num=self.max_num, upscale=upscale_flag).cuda().to(torch.bfloat16) + num_patches_list.append(curr_pixel_values.size(0)) + pixel_values_list.append(curr_pixel_values) + pixel_values = torch.cat(pixel_values_list, dim=0) + elif image_cnt == 1: + upscale_flag = listinstr(['MMMU_DEV_VAL'], dataset) + pixel_values = load_image( + image_path, max_num=self.max_num, upscale=upscale_flag).cuda().to(torch.bfloat16) + num_patches_list = [pixel_values.size(0)] + else: + pixel_values = None + num_patches_list = [] + + # response, history = self.model.chat( + # self.tokenizer, + # pixel_values=pixel_values, + # num_patches_list=num_patches_list, + # question=question, + # generation_config=self.kwargs, + # history=history, return_history=True + # ) + response, history = self.model.chat( + self.tokenizer, + pixel_values=pixel_values, + num_patches_list=num_patches_list, + question=question, + generation_config=self.kwargs, + history=history, + return_history=True + ) + + response = re.sub(self.reverse_pattern, self.reverse_replacement, response) + + return response + + def chat_inner(self, message, dataset=None): + self.set_max_num(dataset) + + if self.version in ['V1.1', 'V1.2']: + raise ValueError(f'Unsupported version for Multi-Turn: {self.version}') + elif self.version == 'V1.5': + raise ValueError(f'Unsupported version for Multi-Turn: {self.version}') + elif self.version == 'V2.0': + kwargs_default = dict(do_sample=False, max_new_tokens=512, top_p=None, num_beams=1) + self.kwargs = kwargs_default + return self.chat_inner_v2(message, dataset) + else: + raise ValueError(f'Unsupported version for Multi-Turn: {self.version}') From 61f1df0c3d88c50353b033ba00225b2f79374991 Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Sun, 11 Aug 2024 00:29:46 +0000 Subject: [PATCH 13/15] fix formatting --- vlmeval/vlm/internvl_chat.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vlmeval/vlm/internvl_chat.py b/vlmeval/vlm/internvl_chat.py index def9c151..339e489c 100644 --- a/vlmeval/vlm/internvl_chat.py +++ b/vlmeval/vlm/internvl_chat.py @@ -440,14 +440,6 @@ def chat_inner_v2(self, message, dataset=None): pixel_values = None num_patches_list = [] - # response, history = self.model.chat( - # self.tokenizer, - # pixel_values=pixel_values, - # num_patches_list=num_patches_list, - # question=question, - # generation_config=self.kwargs, - # history=history, return_history=True - # ) response, history = self.model.chat( self.tokenizer, pixel_values=pixel_values, From 75c23dfb0ef0397dccf21c7ca06b5a7837a4900b Mon Sep 17 00:00:00 2001 From: Amit Agarwal Date: Mon, 12 Aug 2024 02:01:16 +0000 Subject: [PATCH 14/15] Add Idefics3 Config --- README.md | 10 +++++----- vlmeval/config.py | 4 ++++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d37a04a8..c3f7eecd 100644 --- a/README.md +++ b/README.md @@ -90,17 +90,17 @@ VLMEvalKit will use a **judge LLM** to extract answer from the output if you set **Supported PyTorch / HF Models** -| [**IDEFICS-[9B/80B/v2-8B]-Instruct**](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct)🎞️🚅 | [**InstructBLIP-[7B/13B]**](https://github.com/salesforce/LAVIS/blob/main/projects/instructblip/README.md) | [**LLaVA-[v1-7B/v1.5-7B/v1.5-13B]**](https://github.com/haotian-liu/LLaVA) | [**MiniGPT-4-[v1-7B/v1-13B/v2-7B]**](https://github.com/Vision-CAIR/MiniGPT-4) | +| [**IDEFICS-[9B/80B/v2-8B/v3-8B]-Instruct**](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct)🚅🎞️ | 
[**InstructBLIP-[7B/13B]**](https://github.com/salesforce/LAVIS/blob/main/projects/instructblip/README.md) | [**LLaVA-[v1-7B/v1.5-7B/v1.5-13B]**](https://github.com/haotian-liu/LLaVA) | [**MiniGPT-4-[v1-7B/v1-13B/v2-7B]**](https://github.com/Vision-CAIR/MiniGPT-4) | | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | -| [**mPLUG-Owl2**](https://github.com/X-PLUG/mPLUG-Owl/tree/main/mPLUG-Owl2)🎞️ | [**OpenFlamingo-v2**](https://github.com/mlfoundations/open_flamingo)🎞️ | [**PandaGPT-13B**](https://github.com/yxuansu/PandaGPT) | [**Qwen-VL**](https://huggingface.co/Qwen/Qwen-VL)🎞️🚅, [**Qwen-VL-Chat**](https://huggingface.co/Qwen/Qwen-VL-Chat)🎞️**🚅** | +| [**mPLUG-Owl2**](https://github.com/X-PLUG/mPLUG-Owl/tree/main/mPLUG-Owl2)🎞️ | [**OpenFlamingo-v2**](https://github.com/mlfoundations/open_flamingo)🎞️ | [**PandaGPT-13B**](https://github.com/yxuansu/PandaGPT) | [**Qwen-VL**](https://huggingface.co/Qwen/Qwen-VL)🚅🎞️ , [**Qwen-VL-Chat**](https://huggingface.co/Qwen/Qwen-VL-Chat)🚅🎞️ | | [**VisualGLM-6B**](https://huggingface.co/THUDM/visualglm-6b)🚅 | [**InternLM-XComposer-[1/2]**](https://huggingface.co/internlm/internlm-xcomposer-7b)🚅 | [**ShareGPT4V-[7B/13B]**](https://sharegpt4v.github.io)🚅 | [**TransCore-M**](https://github.com/PCIResearch/TransCore-M) | | [**LLaVA (XTuner)**](https://huggingface.co/xtuner/llava-internlm-7b)🚅 | [**CogVLM-[Chat/Llama3]**](https://huggingface.co/THUDM/cogvlm-chat-hf)🚅 | [**ShareCaptioner**](https://huggingface.co/spaces/Lin-Chen/Share-Captioner)🚅 | [**CogVLM-Grounding-Generalist**](https://huggingface.co/THUDM/cogvlm-grounding-generalist-hf)🚅 | | [**Monkey**](https://github.com/Yuliang-Liu/Monkey)🚅, [**Monkey-Chat**](https://github.com/Yuliang-Liu/Monkey)🚅 | [**EMU2-Chat**](https://github.com/baaivision/Emu)🚅🎞️ | [**Yi-VL-[6B/34B]**](https://huggingface.co/01-ai/Yi-VL-6B) | [**MMAlaya**](https://huggingface.co/DataCanvas/MMAlaya)🚅 | | [**InternLM-XComposer-2.5**](https://github.com/InternLM/InternLM-XComposer)🚅🎞️ | [**MiniCPM-[V1/V2/V2.5/V2.6]**](https://github.com/OpenBMB/MiniCPM-V)🚅🎞️ | [**OmniLMM-12B**](https://huggingface.co/openbmb/OmniLMM-12B) | [**InternVL-Chat-[V1-1/V1-2/V1-5/V2]**](https://github.com/OpenGVLab/InternVL)🚅🎞️,
[**Mini-InternVL-Chat-[2B/4B]-V1-5**](https://github.com/OpenGVLab/InternVL)🚅🎞️ | | [**DeepSeek-VL**](https://github.com/deepseek-ai/DeepSeek-VL/tree/main)🎞️ | [**LLaVA-NeXT**](https://llava-vl.github.io/blog/2024-01-30-llava-next/)🚅🎞️ | [**Bunny-Llama3**](https://huggingface.co/BAAI/Bunny-v1_1-Llama-3-8B-V)🚅 | [**XVERSE-V-13B**](https://github.com/xverse-ai/XVERSE-V-13B/blob/main/vxverse/models/vxverse.py) | -| [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448) 🚅 | [**360VL-70B**](https://huggingface.co/qihoo360/360VL-70B) 🚅 | [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)🚅 | [**WeMM**](https://github.com/scenarios/WeMM)🚅 | +| [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448) 🚅 | [**360VL-70B**](https://huggingface.co/qihoo360/360VL-70B) 🚅 | [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)🚅🎞️ | [**WeMM**](https://github.com/scenarios/WeMM)🚅 | | [**GLM-4v-9B**](https://huggingface.co/THUDM/glm-4v-9b) 🚅 | [**Cambrian-[8B/13B/34B]**](https://cambrian-mllm.github.io/) | [**LLaVA-Next-[Qwen-32B]**](https://huggingface.co/lmms-lab/llava-next-qwen-32b) 🎞️ | [**Chameleon-[7B/30B]**](https://huggingface.co/facebook/chameleon-7b)🚅🎞️ | -| [**Video-LLaVA-7B-[HF]**](https://github.com/PKU-YuanGroup/Video-LLaVA) 🎬 | [**VILA1.5-[3B/8B/13B/40B]**](https://github.com/NVlabs/VILA/)🎞️ | [**Ovis1.5-[Llama3-8B/Gemma2-9B]**](https://github.com/AIDC-AI/Ovis) 🚅🎞️ | [**Mantis-8B-[siglip-llama3/clip-llama3/Idefics2/Fuyu]**](https://huggingface.co/TIGER-Lab/Mantis-8B-Idefics2) 🎞️ | +| [**Video-LLaVA-7B-[HF]**](https://github.com/PKU-YuanGroup/Video-LLaVA) 🎬 | [**VILA1.5-[3B/8B/13B/40B]**](https://github.com/NVlabs/VILA/)🎞️🎬 | [**Ovis1.5-[Llama3-8B/Gemma2-9B]**](https://github.com/AIDC-AI/Ovis) 🚅🎞️ | [**Mantis-8B-[siglip-llama3/clip-llama3/Idefics2/Fuyu]**](https://huggingface.co/TIGER-Lab/Mantis-8B-Idefics2) 🎞️ | 🎞️: Support multiple images as inputs. @@ -115,7 +115,7 @@ Note that some VLMs may not be able to run under certain transformer versions, w - **Please use** `transformers==4.33.0` **for**: `Qwen series`, `Monkey series`, `InternLM-XComposer Series`, `mPLUG-Owl2`, `OpenFlamingo v2`, `IDEFICS series`, `VisualGLM`, `MMAlaya`, `ShareCaptioner`, `MiniGPT-4 series`, `InstructBLIP series`, `PandaGPT`, `VXVERSE`, `GLM-4v-9B`. - **Please use** `transformers==4.37.0` **for**: `LLaVA series`, `ShareGPT4V series`, `TransCore-M`, `LLaVA (XTuner)`, `CogVLM Series`, `EMU2 Series`, `Yi-VL Series`, `MiniCPM-[V1/V2]`, `OmniLMM-12B`, `DeepSeek-VL series`, `InternVL series`, `Cambrian Series`, `VILA Series`. - **Please use** `transformers==4.40.0` **for**: `IDEFICS2`, `Bunny-Llama3`, `MiniCPM-Llama3-V2.5`, `360VL-70B`, `Phi-3-Vision`, `WeMM`. -- **Please use** `transformers==latest` **for**: `LLaVA-Next series`, `PaliGemma-3B`, `Chameleon series`, `Video-LLaVA-7B-HF`, `Ovis series`, `Mantis series`, `MiniCPM-V2.6`. +- **Please use** `transformers==latest` **for**: `LLaVA-Next series`, `PaliGemma-3B`, `Chameleon series`, `Video-LLaVA-7B-HF`, `Ovis series`, `Mantis series`, `MiniCPM-V2.6`,`Idefics-3`. 
```python
# Demo
diff --git a/vlmeval/config.py b/vlmeval/config.py
index 9f046e31..ccb5fac0 100644
--- a/vlmeval/config.py
+++ b/vlmeval/config.py
@@ -155,6 +155,10 @@
    'idefics_9b_instruct': partial(IDEFICS, model_path='HuggingFaceM4/idefics-9b-instruct'),
    'idefics_80b_instruct': partial(IDEFICS, model_path='HuggingFaceM4/idefics-80b-instruct'),
    'idefics2_8b': partial(IDEFICS2, model_path='HuggingFaceM4/idefics2-8b'),
+
+    # Idefics3 follows Idefics2 Pattern
+    'Idefics3-8B-Llama3': partial(IDEFICS2, model_path='HuggingFaceM4/Idefics3-8B-Llama3'),
+
 }

 instructblip_series = {

From 2474225265c86f43d15a5b3aec9ae5b2aa4b8165 Mon Sep 17 00:00:00 2001
From: Amit Agarwal
Date: Mon, 12 Aug 2024 02:12:27 +0000
Subject: [PATCH 15/15] Warning message to build from source

---
 vlmeval/vlm/idefics.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vlmeval/vlm/idefics.py b/vlmeval/vlm/idefics.py
index 96931d06..dddb5c17 100644
--- a/vlmeval/vlm/idefics.py
+++ b/vlmeval/vlm/idefics.py
@@ -64,6 +64,9 @@ class IDEFICS2(BaseModel):
    def __init__(self, model_path='HuggingFaceM4/idefics2-8b', **kwargs):
        assert model_path is not None
        self.model_path = model_path
+        if 'idefics3' in self.model_path.lower():
+            warnings.warn('Install transformers from source: PR https://github.com/open-compass/VLMEvalKit/pull/379')
+            warnings.warn('Reference: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3')
        self.processor = AutoProcessor.from_pretrained(model_path)
        self.model = AutoModelForVision2Seq.from_pretrained(
            model_path,
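
Usage sketch (not part of the patches above): the `vila_series` and `Idefics3-8B-Llama3` entries registered in `vlmeval/config.py` are `functools.partial` wrappers, so exercising a newly added model reduces to a registry lookup plus the interleaved message format that `generate_inner` consumes in `vlmeval/vlm/vila.py`. The image path below and the public `generate()` wrapper around `generate_inner()` are illustrative assumptions, not something these diffs define.

```python
# Minimal sketch, assuming VLMEvalKit and the model's own dependencies are installed
# (e.g. the NVlabs/VILA repo and transformers==4.36.2 for the VILA entries).
from vlmeval.config import supported_VLM

# Each registry entry is a partial over the wrapper class; calling it
# instantiates the model with the pinned model_path.
model = supported_VLM['VILA1.5-13b']()        # or 'Idefics3-8B-Llama3', etc.

# Interleaved message format handled by generate_inner() in vila.py:
# a list of dicts, each carrying either an image path or a text segment.
message = [
    {'type': 'image', 'value': 'assets/apple.jpg'},        # hypothetical image path
    {'type': 'text', 'value': 'What is shown in this image?'},
]

# generate_inner(message, dataset=None) is defined in the patch; the framework
# normally reaches it through a thin generate() wrapper on BaseModel.
print(model.generate_inner(message))
```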
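The multi-turn `chat_inner` methods added for Phi-3-Vision and InternVL2 (patches 11 and 12) both walk a conversation given as `{'role': ..., 'content': [...]}` turns, where each `content` list reuses the same `type`/`value` items as above. Below is a minimal sketch of that structure with placeholder paths and text; the final user turn is the one being answered, and InternVL2 rebuilds everything before it as history via `build_history`, which expects an even number of preceding turns.

```python
# Hedged sketch of the conversation layout consumed by the new chat_inner()
# implementations (phi3_vision.py and internvl_chat.py in this series).
conversation = [
    {   # first user turn: one image plus a question (placeholder content)
        'role': 'user',
        'content': [
            {'type': 'image', 'value': 'assets/demo_1.jpg'},
            {'type': 'text', 'value': 'Describe Image1 briefly.'},
        ],
    },
    {   # assistant reply from the previous turn, kept so history can be rebuilt
        'role': 'assistant',
        'content': [{'type': 'text', 'value': 'Image1 shows a red apple on a wooden table.'}],
    },
    {   # current user turn: text-only follow-up
        'role': 'user',
        'content': [{'type': 'text', 'value': 'What color is the fruit in Image1?'}],
    },
]

# response = model.chat_inner(conversation, dataset='MMDU')
```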
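Patch 12 also normalizes image references between the dataset convention (`Image1`) and InternVL2's convention (`Image-1`) with a pair of regular expressions. A standalone round-trip using the exact patterns from the diff:

```python
import re

# Patterns copied from the InternVL2 chat_inner patch in this series.
pattern, replacement = r'Image(\d+)', r'Image-\1'                    # dataset -> model
reverse_pattern, reverse_replacement = r'Image-(\d+)', r'Image\1'    # model -> dataset

question = 'Compare Image1 and Image2.'
to_model = re.sub(pattern, replacement, question)
assert to_model == 'Compare Image-1 and Image-2.'

response = 'Image-2 is brighter than Image-1.'
to_dataset = re.sub(reverse_pattern, reverse_replacement, response)
assert to_dataset == 'Image2 is brighter than Image1.'
```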