Blip2 #2

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open · wants to merge 11 commits into base: main
53 changes: 52 additions & 1 deletion GRiT/demo.py
@@ -9,6 +9,7 @@
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.utils.logger import setup_logger
from transformers import Blip2Processor, Blip2ForConditionalGeneration

sys.path.insert(0, 'third_party/CenterNet2/projects/CenterNet2/')
from centernet.config import add_centernet_config
@@ -80,8 +81,57 @@ def get_parser():
nargs=argparse.REMAINDER,
)
return parser

def extract_features_with_blip2(image):
    """Generate a short BLIP-2 caption for an image given as a BGR numpy array."""
    processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
    model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")

    # read_image() loads BGR; BLIP-2 expects RGB, so flip the channel order.
    inputs = processor(images=image[:, :, ::-1].copy(), return_tensors="pt")
    outputs = model.generate(**inputs)
    text_description = processor.decode(outputs[0], skip_special_tokens=True)

    return text_description

def process_image(cfg, img_path, output_dir):
"""
Processes a single image using GRiT and saves results in JSON format.
Args:
cfg (CfgNode): Detectron2 configuration.
img_path (str): Path to the image file.
output_dir (str): Directory to save the JSON file (optional).
Returns:
None
"""

demo = VisualizationDemo(cfg)
img = read_image(img_path, format="BGR")
text_description = extract_features_with_blip2(img)
start_time = time.time()
predictions, visualized_output, bbox = demo.run_on_image(img)

if output_dir:
json_file = {}
predict_object = bbox.pred_object_descriptions.data
predict_box = bbox.pred_boxes

for (name, box) in zip(predict_object, predict_box):
if name not in json_file:
json_file[name] = [box.tolist()]
else:
                json_file[name].append(box.tolist())

out_filename = os.path.join(output_dir, os.path.splitext(os.path.basename(img_path))[0] + ".json")
with open(out_filename, "w") as outfile:
json.dump(json_file, outfile)
else:
cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
if cv2.waitKey(0) == 27:
cv2.destroyAllWindows()  

if __name__ == "__main__":
mp.set_start_method("spawn", force=True)
args = get_parser().parse_args()
@@ -97,6 +147,7 @@ def get_parser():
for path in tqdm.tqdm(os.listdir(args.input[0]), disable=not args.output):
img = read_image(os.path.join(args.input[0], path), format="BGR")
start_time = time.time()
            process_image(cfg, os.path.join(args.input[0], path), args.output)
predictions, visualized_output, bbox = demo.run_on_image(img)

if args.output:
@@ -123,4 +174,4 @@ def get_parser():
cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
if cv2.waitKey(0) == 27:
break # esc to quit
break # esc to quit
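
For reference, a minimal standalone sketch of the BLIP-2 captioning call that extract_features_with_blip2 wraps, assuming the same Salesforce/blip2-flan-t5-xl checkpoint and a placeholder example.jpg; the processor and model are loaded once up front purely for illustration:

from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")

image = Image.open("example.jpg").convert("RGB")  # placeholder image path
inputs = processor(images=image, return_tensors="pt")
generated_ids = model.generate(**inputs)
caption = processor.decode(generated_ids[0], skip_special_tokens=True)
print(caption)
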
121 changes: 121 additions & 0 deletions image_retrieval/blip2.py
@@ -0,0 +1,121 @@
from helper_function import *
import pandas as pd
import faiss
import numpy as np
import torch
import clip
import argparse
from transformers import Blip2Processor, Blip2Model, AutoTokenizer
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type = str)
parser.add_argument("--image_path", type=str, help="path to images")
parser.add_argument("--text_relation_path", type=str, help="path to relation files")
parser.add_argument("--dense_caption_path", type=str, help="path to densecaption")
parser.add_argument("--model", type=str)
args = parser.parse_args()

device = "cuda:0"
data = pd.read_pickle(args.dataset) ### Flickr30k or MSCOCO test set
processor = Blip2Processor.from_pretrained(args.model)
model = Blip2Model.from_pretrained(args.model)
tokenizer = AutoTokenizer.from_pretrained(args.model)
#model, preprocess = clip.load("RN50", device='cpu')
model.cuda(device).eval()
IMAGE_PATH = args.image_path + "{}.jpg"
TEXT_JSON_PATH = args.text_relation_path + "/{}.json"
DENSE_CAPTION_PAYTH = args.dense_caption_path + "/{}.json"

def subimage_score_embedding(image, text):
if text:
# image = preprocess(image)
# text_input = clip.tokenize(text).cuda(device)
# image_input = torch.tensor(np.stack([image])).cuda(device)
# with torch.no_grad():
# image_embed = model.encode_image(image_input).float()
# text_embed = model.encode_text(text_input).float()
# score = text_embed @ image_embed.T
# return image_embed, score
        # Encode the sub-image and its text on the GPU the model lives on.
        image_inputs = processor(images=image, return_tensors="pt").to(device)
        image_input = model.get_image_features(**image_inputs)
        text_inputs = tokenizer(text, padding=True, return_tensors="pt").to(device)
        text_input = model.get_text_features(**text_inputs)
with torch.no_grad():
original_image_embed = image_input.float()
original_text_embed = text_input.float()
#original_image_embed = model.encode_image(image_input).float()
#original_text_embed = model.encode_text(text_input).float()
score = original_text_embed @ original_image_embed.T
return original_image_embed, score

else:
return None, None

def comclip_one_pair(row_id, caption, image_id):
    # Load the full image with the repo's read_image helper (IMAGE_PATH supplies the .jpg template).
    image = read_image(image_id, IMAGE_PATH)
    image_inputs = processor(images=image, return_tensors="pt").to(device)
    image_input = model.get_image_features(**image_inputs)
    text_inputs = tokenizer(caption, padding=True, return_tensors="pt").to(device)
    text_input = model.get_text_features(**text_inputs)
    with torch.no_grad():
        original_image_embed = image_input.float()
        original_text_embed = text_input.float()
text_json = get_sentence_json(row_id, TEXT_JSON_PATH)
object_images, key_map = create_sub_image_obj(row_id, image_id, IMAGE_PATH, TEXT_JSON_PATH, DENSE_CAPTION_PAYTH)
relation_images, relation_words = create_relation_object(object_images, text_json, image_id, key_map, IMAGE_PATH)
if relation_images and relation_words:
for relation_image, word in zip(relation_images, relation_words):
if word in object_images:
object_images[word+"_dup"] = relation_image
else:
object_images[word] = relation_image

##subimages
# Create image embeddings array
image_embeds = []
image_scores = []
for key, sub_image in object_images.items():
if "_dup" in key:
key = key.replace("_dup", "")
image_embed, image_score = subimage_score_embedding(sub_image, key)
if image_embed is not None and image_score is not None:
            # Collect the sub-image embedding and its text-match score.
            image_embeds.append(image_embed.detach().cpu().numpy())
            image_scores.append(image_score)
    image_embeds = np.concatenate(image_embeds, axis=0)
    image_embed_dim = image_embeds.shape[1]
index = faiss.IndexFlatL2(image_embed_dim)
    # Add all sub-image embeddings to the index
index.add(image_embeds)
    # Regularize the scores, then fuse the weighted sub-image embeddings into
    # the full-image embedding.
    similarity = normalize_tensor_list(image_scores)
    sub_image_embeds = torch.from_numpy(image_embeds).to(original_image_embed.device)
    for score, sub_embed in zip(similarity, sub_image_embeds):
        original_image_embed += score * sub_embed
image_features = original_image_embed / original_image_embed.norm(dim=-1, keepdim=True).float()
text_features = original_text_embed /original_text_embed.norm(dim=-1, keepdim=True).float()
similarity = text_features.detach().cpu().numpy() @ image_features.detach().cpu().numpy().T
return similarity

def get_score(row_id):
result = {}
row = data.iloc[row_id]
candidates = row.clip_top_ten_pick
for candidate in candidates:
result[candidate[0]] = comclip_one_pair(row_id, row.sentence, candidate[0]).item()
result = dict(sorted(result.items(), key=lambda x: x[1], reverse=True))
return result

if __name__ == "__main__":
comclip_score = {}
for idx, row in data.iterrows():
try:
comclip_score[idx] = get_score(idx)
except Exception as e:
print(e)
top_1 = 0
top_5 = 0
for idx, value in comclip_score.items():
candidates = list(value.keys())
candidates = [int(i) for i in candidates]
if candidates[0] == int(idx):
top_1 += 1
if int(idx) in candidates[:5]:
top_5 += 1
print("Top 1 score: {}. Top 5 score: {}".format(top_1/ 1000, top_5/ 1000))
28 changes: 21 additions & 7 deletions image_retrieval/clip_baseline.py
@@ -3,25 +3,39 @@
import torch
import clip
import argparse
from PIL import Image
from transformers import Blip2Processor, Blip2Model, AutoTokenizer
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type = str, help="csv file for the flickr30k")
parser.add_argument("--model", type=str, help="RN50, ViT/B-32, ViT/L-14")
parser.add_argument("--model", type=str, help="Salesforce/blip2-opt-2.7b")
parser.add_argument("--image_path", type=str, help="path to the image")
args = parser.parse_args()

device = "cuda:0"
data = pd.read_pickle(args.dataset)
model, preprocess = clip.load(args.model, device='cpu')
processor = Blip2Processor.from_pretrained(args.model)
model = Blip2Model.from_pretrained(args.model)
tokenizer = AutoTokenizer.from_pretrained(args.model)
#model, preprocess = clip.load(args.model, device='cpu')
model.cuda(device).eval()
IMAGE_PATH = args.image_path+ "/{}.jpg"

def clip_compute_one_pair(caption, image_id):

    # IMAGE_PATH supplies the image directory and .jpg suffix for the image id.
    image = Image.open(IMAGE_PATH.format(image_id)).convert("RGB")
    image_inputs = processor(images=image, return_tensors="pt").to(device)
    image_input = model.get_image_features(**image_inputs)

image = preprocess(read_image(image_id, IMAGE_PATH))
text_input = clip.tokenize(caption).cuda(device)
image_input = torch.tensor(np.stack([image])).cuda(device)
    text_inputs = tokenizer(caption, padding=True, return_tensors="pt").to(device)
    text_input = model.get_text_features(**text_inputs)
#text_input = clip.tokenize(caption).cuda(device)
#image_input = torch.tensor(np.stack([image])).cuda(device)
with torch.no_grad():
original_image_embed = model.encode_image(image_input).float()
original_text_embed = model.encode_text(text_input).float()
original_image_embed = image_input.float()
original_text_embed = text_input.float()
#original_image_embed = model.encode_image(image_input).float()
#original_text_embed = model.encode_text(text_input).float()
image_features = original_image_embed / original_image_embed.norm(dim=-1, keepdim=True).float()
text_features = original_text_embed /original_text_embed.norm(dim=-1, keepdim=True).float()
similarity = text_features.detach().cpu().numpy() @ image_features.detach().cpu().numpy().T
@@ -57,4 +71,4 @@ def get_score(row_id):
top_10 += 1
print("Top 1 {}. Top 5 {}".format(top_1/ 1000, top_5/ 1000))



33 changes: 24 additions & 9 deletions image_retrieval/comclip.py
@@ -3,6 +3,7 @@
import torch
import clip
import argparse
from transformers import Blip2Processor, Blip2Model, AutoTokenizer
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type = str)
parser.add_argument("--image_path", type=str, help="path to images")
@@ -13,22 +14,36 @@

device = "cuda:0"
data = pd.read_pickle(args.dataset) ### Flickr30k or MSCOCO test set
model, preprocess = clip.load("RN50", device='cpu')
processor = Blip2Processor.from_pretrained(args.model)
model = Blip2Model.from_pretrained(args.model)
tokenizer = AutoTokenizer.from_pretrained(args.model)
#model, preprocess = clip.load("RN50", device='cpu')
model.cuda(device).eval()
IMAGE_PATH = args.image_path + "{}.jpg"
TEXT_JSON_PATH = args.text_relation_path + "/{}.json"
DENSE_CAPTION_PAYTH = args.dense_caption_path + "/{}.json"

def subimage_score_embedding(image, text):
if text:
image = preprocess(image)
text_input = clip.tokenize(text).cuda(device)
image_input = torch.tensor(np.stack([image])).cuda(device)
# image = preprocess(image)
# text_input = clip.tokenize(text).cuda(device)
# image_input = torch.tensor(np.stack([image])).cuda(device)
# with torch.no_grad():
# image_embed = model.encode_image(image_input).float()
# text_embed = model.encode_text(text_input).float()
# score = text_embed @ image_embed.T
# return image_embed, score
        image_inputs = processor(images=image, return_tensors="pt").to(device)
        image_input = model.get_image_features(**image_inputs)
        text_inputs = tokenizer(text, padding=True, return_tensors="pt").to(device)
        text_input = model.get_text_features(**text_inputs)
with torch.no_grad():
image_embed = model.encode_image(image_input).float()
text_embed = model.encode_text(text_input).float()
score = text_embed @ image_embed.T
return image_embed, score
original_image_embed = image_input.float()
original_text_embed = text_input.float()
#original_image_embed = model.encode_image(image_input).float()
#original_text_embed = model.encode_text(text_input).float()
score = original_text_embed @ original_image_embed.T
return original_image_embed, score

else:
return None, None

@@ -93,4 +108,4 @@ def get_score(row_id):
top_1 += 1
if int(idx) in candidates[:5]:
top_5 += 1
print("Top 1 score: {}. Top 5 score: {}".format(top_1/ 1000, top_5/ 1000))
print("Top 1 score: {}. Top 5 score: {}".format(top_1/ 1000, top_5/ 1000))
3 changes: 2 additions & 1 deletion image_retrieval/comclip.sh
@@ -11,4 +11,5 @@ mkdir image_retrieval/relation_json
python image_retrieval/parse_relation.py --relation_json_path image_retrieval/relation_json --data_path $4
mkdir image_retrieval/matched_relation
python image_retrieval/match_relation.py --densecaption_path $2 --dataset_path $4 --openai $5
python image_retrieval/comclip.py --dataset $4 --image_path $1 --text_relation_path image_retrieval/relation_json --densecaption_path $2 --model $6
#python image_retrieval/comclip.py --dataset $4 --image_path $1 --text_relation_path image_retrieval/relation_json --densecaption_path $2 --model $6
python image_retrieval/blip2.py --dataset $4 --image_path $1 --text_relation_path image_retrieval/relation_json --dense_caption_path $2 --model $6