From dfe8241014b090fe9cb320235221a11c2b9578c4 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Fri, 20 Sep 2024 16:06:20 +0800
Subject: [PATCH] fix llavaqwen2 model conversion

---
 .../models/llava/convert_llava_weights_to_hf.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/llava/convert_llava_weights_to_hf.py b/src/transformers/models/llava/convert_llava_weights_to_hf.py
index 9841b7cb3d1929..b8d936e8cc4473 100644
--- a/src/transformers/models/llava/convert_llava_weights_to_hf.py
+++ b/src/transformers/models/llava/convert_llava_weights_to_hf.py
@@ -76,7 +76,9 @@ def load_original_state_dict(model_id):
     if "lm_head.weight" not in original_state_dict:
         original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone()
 
-    del original_state_dict["model.image_newline"]  # not used in the original implementation because "merge_type=flat"
+    if "model.image_newline" in original_state_dict:
+        # not used in the original implementation because "merge_type=flat"
+        del original_state_dict["model.image_newline"]
     return original_state_dict
 
 
@@ -107,7 +109,7 @@ def convert_llava_llama_to_hf(text_model_id, vision_model_id, output_hub_path, o
     image_processor = AutoImageProcessor.from_pretrained(vision_model_id)
     processor = LlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
 
-    if "Qwen" in text_model_id:
+    if "siglip" in vision_model_id:
         vision_config = SiglipVisionConfig(
             hidden_size=1152,
             image_size=384,
@@ -128,8 +130,9 @@ def convert_llava_llama_to_hf(text_model_id, vision_model_id, output_hub_path, o
     # llms-lab interleeave models do not use any selection startegy except for last hidden state
     if "Qwen" in text_model_id:
         config.image_token_index = 151646
-        config.vision_feature_select_strategy = "full"
-        config.vision_feature_layer = -1
+        if "siglip" in vision_model_id:
+            config.vision_feature_select_strategy = "full"
+            config.vision_feature_layer = -1
     else:
         config.pad_token_id = 32001
         config.image_token_index = 32000