diff --git a/GRiT/demo.py b/GRiT/demo.py
index 83e6459..7741e90 100644
--- a/GRiT/demo.py
+++ b/GRiT/demo.py
@@ -9,6 +9,7 @@
 from detectron2.config import get_cfg
 from detectron2.data.detection_utils import read_image
 from detectron2.utils.logger import setup_logger
+from transformers import Blip2Processor, Blip2ForConditionalGeneration
 
 sys.path.insert(0, 'third_party/CenterNet2/projects/CenterNet2/')
 from centernet.config import add_centernet_config
@@ -80,8 +81,57 @@ def get_parser():
         nargs=argparse.REMAINDER,
     )
     return parser
+
+
+def extract_features_with_blip2(image):
+    # Caption the image with BLIP-2 (the model is reloaded on every call).
+    processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
+    model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")
+    inputs = processor(images=image, return_tensors="pt")
+    outputs = model.generate(**inputs)
+    text_description = processor.decode(outputs[0], skip_special_tokens=True)
+
+    return text_description
+
+
+def process_image(cfg, img_path, output_dir):
+    """
+    Processes a single image using GRiT and saves the results in JSON format.
+
+    Args:
+        cfg (CfgNode): Detectron2 configuration.
+        img_path (str): Path to the image file.
+        output_dir (str): Directory to save the JSON file (optional).
+
+    Returns:
+        None
+    """
+    demo = VisualizationDemo(cfg)
+    img = read_image(img_path, format="BGR")
+    # read_image returns BGR; BLIP-2 expects RGB
+    text_description = extract_features_with_blip2(img[:, :, ::-1])
+    start_time = time.time()
+    predictions, visualized_output, bbox = demo.run_on_image(img)
+
+    if output_dir:
+        # Map each predicted object description to its list of boxes
+        json_file = {}
+        predict_object = bbox.pred_object_descriptions.data
+        predict_box = bbox.pred_boxes
+        for (name, box) in zip(predict_object, predict_box):
+            if name not in json_file:
+                json_file[name] = [box.tolist()]
+            else:
+                json_file[name].append(box.tolist())
+
+        out_filename = os.path.join(output_dir, os.path.splitext(os.path.basename(img_path))[0] + ".json")
+        with open(out_filename, "w") as outfile:
+            json.dump(json_file, outfile)
+    else:
+        cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
+        cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
+        if cv2.waitKey(0) == 27:
+            cv2.destroyAllWindows()
+
+
 if __name__ == "__main__":
     mp.set_start_method("spawn", force=True)
     args = get_parser().parse_args()
@@ -97,6 +147,7 @@ def get_parser():
         for path in tqdm.tqdm(os.listdir(args.input[0]), disable=not args.output):
             img = read_image(os.path.join(args.input[0], path), format="BGR")
             start_time = time.time()
+            process_image(cfg, os.path.join(args.input[0], path), args.output)
             predictions, visualized_output, bbox = demo.run_on_image(img)
 
             if args.output:
@@ -123,4 +174,4 @@ def get_parser():
                 cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
                 cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
                 if cv2.waitKey(0) == 27:
-                    break # esc to quit
\ No newline at end of file
+                    break # esc to quit
diff --git a/image_retrieval/blip2.py b/image_retrieval/blip2.py
new file mode 100644
index 0000000..73ae2d7
--- /dev/null
+++ b/image_retrieval/blip2.py
@@ -0,0 +1,121 @@
+from helper_function import *
+import pandas as pd
+import faiss
+import numpy as np
+import torch
+import clip
+import argparse
+from transformers import Blip2Processor, Blip2Model, AutoTokenizer
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--dataset", type=str)
+parser.add_argument("--image_path", type=str, help="path to images")
+parser.add_argument("--text_relation_path", type=str, help="path to relation files")
+parser.add_argument("--densecaption_path", type=str, help="path to densecaption")
+parser.add_argument("--model", type=str)
+args = parser.parse_args()
+
+device = "cuda:0"
+data = pd.read_pickle(args.dataset)  ### Flickr30k or MSCOCO test set
+preprocess = Blip2Processor.from_pretrained(args.model)
+model = Blip2Model.from_pretrained(args.model)
+tokenizer = AutoTokenizer.from_pretrained(args.model)
+#model, preprocess = clip.load("RN50", device='cpu')
+model.cuda(device).eval()
+IMAGE_PATH = args.image_path + "{}.jpg"
+TEXT_JSON_PATH = args.text_relation_path + "/{}.json"
+DENSE_CAPTION_PATH = args.densecaption_path + "/{}.json"
+
+def subimage_score_embedding(image, text):
+    if text:
+        # image = preprocess(image)
+        # text_input = clip.tokenize(text).cuda(device)
+        # image_input = torch.tensor(np.stack([image])).cuda(device)
+        # with torch.no_grad():
+        #     image_embed = model.encode_image(image_input).float()
+        #     text_embed = model.encode_text(text_input).float()
+        # score = text_embed @ image_embed.T
+        # return image_embed, score
+        image_inputs = preprocess(images=image, return_tensors="pt").to(device)
+        image_input = model.get_image_features(**image_inputs)
+        text_inputs = tokenizer(text, padding=True, return_tensors="pt").to(device)
+        text_input = model.get_text_features(**text_inputs)
+        with torch.no_grad():
+            original_image_embed = image_input.float()
+            original_text_embed = text_input.float()
+            #original_image_embed = model.encode_image(image_input).float()
+            #original_text_embed = model.encode_text(text_input).float()
+            score = original_text_embed @ original_image_embed.T
+        return original_image_embed, score
+
+    else:
+        return None, None
+
+def comclip_one_pair(row_id, caption, image_id):
+    # Whole-image / full-caption embeddings
+    # (assumes get_image_features / get_text_features return embedding tensors)
+    image = read_image(image_id, IMAGE_PATH)
+    image_inputs = preprocess(images=image, return_tensors="pt").to(device)
+    image_input = model.get_image_features(**image_inputs)
+    text_inputs = tokenizer(caption, padding=True, return_tensors="pt").to(device)
+    text_input = model.get_text_features(**text_inputs)
+    with torch.no_grad():
+        original_image_embed = image_input.float()
+        original_text_embed = text_input.float()
+    text_json = get_sentence_json(row_id, TEXT_JSON_PATH)
+    object_images, key_map = create_sub_image_obj(row_id, image_id, IMAGE_PATH, TEXT_JSON_PATH, DENSE_CAPTION_PATH)
+    relation_images, relation_words = create_relation_object(object_images, text_json, image_id, key_map, IMAGE_PATH)
+    if relation_images and relation_words:
+        for relation_image, word in zip(relation_images, relation_words):
+            if word in object_images:
+                object_images[word + "_dup"] = relation_image
+            else:
+                object_images[word] = relation_image
+
+    ## subimages
+    # Create the sub-image embedding array
+    image_embeds = []
+    image_scores = []
+    for key, sub_image in object_images.items():
+        if "_dup" in key:
+            key = key.replace("_dup", "")
+        image_embed, image_score = subimage_score_embedding(sub_image, key)
+        if image_embed is not None and image_score is not None:
+            # Append the sub-image features to image_embeds
+            image_embeds.append(image_embed.cpu().numpy())
+            image_scores.append(image_score)
+    image_embeds = np.concatenate(image_embeds, axis=0).astype("float32")
+    image_embed_dim = image_embeds.shape[1]
+    index = faiss.IndexFlatL2(image_embed_dim)
+    # Add all the embeddings to the index
+    index.add(image_embeds)
+    # regularize the scores
+    similarity = normalize_tensor_list(image_scores)
+    # Re-weight the whole-image embedding with each indexed sub-image embedding
+    for i, score in enumerate(similarity):
+        sub_embed = torch.from_numpy(index.reconstruct(i)).to(original_image_embed.device)
+        original_image_embed += score * sub_embed
+    image_features = original_image_embed / original_image_embed.norm(dim=-1, keepdim=True).float()
+    text_features = original_text_embed / original_text_embed.norm(dim=-1, keepdim=True).float()
+    similarity = text_features.detach().cpu().numpy() @ image_features.detach().cpu().numpy().T
+    return similarity
+
+def get_score(row_id):
+    result = {}
+    row = data.iloc[row_id]
+    candidates = row.clip_top_ten_pick
+    for candidate in candidates:
+        result[candidate[0]] = comclip_one_pair(row_id, row.sentence, candidate[0]).item()
+    result = dict(sorted(result.items(), key=lambda x: x[1], reverse=True))
+    return result
+
+if __name__ == "__main__":
+    comclip_score = {}
+    for idx, row in data.iterrows():
+        try:
+            comclip_score[idx] = get_score(idx)
+        except Exception as e:
+            print(e)
+    top_1 = 0
+    top_5 = 0
+    for idx, value in comclip_score.items():
+        candidates = list(value.keys())
+        candidates = [int(i) for i in candidates]
+        if candidates[0] == int(idx):
+            top_1 += 1
+        if int(idx) in candidates[:5]:
+            top_5 += 1
+    print("Top 1 score: {}. Top 5 score: {}".format(top_1 / 1000, top_5 / 1000))
diff --git a/image_retrieval/clip_baseline.py b/image_retrieval/clip_baseline.py
index 996b74f..0883533 100644
--- a/image_retrieval/clip_baseline.py
+++ b/image_retrieval/clip_baseline.py
@@ -3,25 +3,39 @@
 import torch
 import clip
 import argparse
+from PIL import Image
+from transformers import Blip2Processor, Blip2Model, AutoTokenizer
 parser = argparse.ArgumentParser()
 parser.add_argument("--dataset", type = str, help="csv file for the flickr30k")
-parser.add_argument("--model", type=str, help="RN50, ViT/B-32, ViT/L-14")
+parser.add_argument("--model", type=str, help="Salesforce/blip2-opt-2.7b")
 parser.add_argument("--image_path", type=str, help="path to the image")
 args = parser.parse_args()
 device = "cuda:0"
 data = pd.read_pickle(args.dataset)
-model, preprocess = clip.load(args.model, device='cpu')
+preprocess = Blip2Processor.from_pretrained(args.model)
+model = Blip2Model.from_pretrained(args.model)
+tokenizer = AutoTokenizer.from_pretrained(args.model)
+#model, preprocess = clip.load(args.model, device='cpu')
 model.cuda(device).eval()
 IMAGE_PATH = args.image_path+ "/{}.jpg"
 
 def clip_compute_one_pair(caption, image_id):
+
+    image = Image.open(IMAGE_PATH.format(image_id))
+    image_inputs = preprocess(images=image, return_tensors="pt").to(device)
+    image_input = model.get_image_features(**image_inputs)
+
     image = preprocess(read_image(image_id, IMAGE_PATH))
-    text_input = clip.tokenize(caption).cuda(device)
-    image_input = torch.tensor(np.stack([image])).cuda(device)
+    text_inputs = tokenizer(caption, padding=True, return_tensors="pt").to(device)
+    text_input = model.get_text_features(**text_inputs)
+    #text_input = clip.tokenize(caption).cuda(device)
+    #image_input = torch.tensor(np.stack([image])).cuda(device)
     with torch.no_grad():
-        original_image_embed = model.encode_image(image_input).float()
-        original_text_embed = model.encode_text(text_input).float()
+        original_image_embed = image_input.float()
+        original_text_embed = text_input.float()
+        #original_image_embed = model.encode_image(image_input).float()
+        #original_text_embed = model.encode_text(text_input).float()
     image_features = original_image_embed / original_image_embed.norm(dim=-1, keepdim=True).float()
     text_features = original_text_embed /original_text_embed.norm(dim=-1, keepdim=True).float()
     similarity = text_features.detach().cpu().numpy() @ image_features.detach().cpu().numpy().T
@@ -57,4 +71,4 @@ def get_score(row_id):
             top_10 += 1
     print("Top 1 {}. Top 5 {}".format(top_1/ 1000, top_5/ 1000))
-
\ No newline at end of file
+
diff --git a/image_retrieval/comclip.py b/image_retrieval/comclip.py
index 57fa4c8..056958a 100644
--- a/image_retrieval/comclip.py
+++ b/image_retrieval/comclip.py
@@ -3,6 +3,7 @@
 import torch
 import clip
 import argparse
+from transformers import Blip2Processor, Blip2Model, AutoTokenizer
 parser = argparse.ArgumentParser()
 parser.add_argument("--dataset", type = str)
 parser.add_argument("--image_path", type=str, help="path to images")
@@ -13,7 +14,9 @@
 device = "cuda:0"
 data = pd.read_pickle(args.dataset) ### Flickr30k or MSCOCO test set
 
-model, preprocess = clip.load("RN50", device='cpu')
+preprocess = Blip2Processor.from_pretrained(args.model)
+model = Blip2Model.from_pretrained(args.model)
+tokenizer = AutoTokenizer.from_pretrained(args.model)
+#model, preprocess = clip.load("RN50", device='cpu')
 model.cuda(device).eval()
 IMAGE_PATH = args.image_path + "{}.jpg"
 TEXT_JSON_PATH = args.text_relation_path + "/{}.json"
@@ -21,14 +24,26 @@
 def subimage_score_embedding(image, text):
     if text:
-        image = preprocess(image)
-        text_input = clip.tokenize(text).cuda(device)
-        image_input = torch.tensor(np.stack([image])).cuda(device)
+        # image = preprocess(image)
+        # text_input = clip.tokenize(text).cuda(device)
+        # image_input = torch.tensor(np.stack([image])).cuda(device)
+        # with torch.no_grad():
+        #     image_embed = model.encode_image(image_input).float()
+        #     text_embed = model.encode_text(text_input).float()
+        # score = text_embed @ image_embed.T
+        # return image_embed, score
+        image_inputs = preprocess(images=image, return_tensors="pt").to(device)
+        image_input = model.get_image_features(**image_inputs)
+        text_inputs = tokenizer(text, padding=True, return_tensors="pt").to(device)
+        text_input = model.get_text_features(**text_inputs)
         with torch.no_grad():
-            image_embed = model.encode_image(image_input).float()
-            text_embed = model.encode_text(text_input).float()
-            score = text_embed @ image_embed.T
-            return image_embed, score
+            original_image_embed = image_input.float()
+            original_text_embed = text_input.float()
+            #original_image_embed = model.encode_image(image_input).float()
+            #original_text_embed = model.encode_text(text_input).float()
+            score = original_text_embed @ original_image_embed.T
+            return original_image_embed, score
+
     else:
         return None, None
 
@@ -93,4 +108,4 @@ def get_score(row_id):
             top_1 += 1
         if int(idx) in candidates[:5]:
             top_5 += 1
-    print("Top 1 score: {}. Top 5 score: {}".format(top_1/ 1000, top_5/ 1000))
\ No newline at end of file
+    print("Top 1 score: {}. Top 5 score: {}".format(top_1/ 1000, top_5/ 1000))
diff --git a/image_retrieval/comclip.sh b/image_retrieval/comclip.sh
index b3ac335..5695cb9 100644
--- a/image_retrieval/comclip.sh
+++ b/image_retrieval/comclip.sh
@@ -11,4 +11,5 @@ mkdir image_retrieval/relation_json
 python image_retrieval/parse_relation.py --relation_json_path image_retrieval/relation_json --data_path $4
 mkdir image_retrieval/matched_relation
 python image_retrieval/match_relation.py --densecaption_path $2 --dataset_path $4 --openai $5
-python image_retrieval/comclip.py --dataset $4 --image_path $1 --text_relation_path image_retrieval/relation_json --densecaption_path $2 --model $6
+#python image_retrieval/comclip.py --dataset $4 --image_path $1 --text_relation_path image_retrieval/relation_json --densecaption_path $2 --model $6
+python image_retrieval/blip2.py --dataset $4 --image_path $1 --text_relation_path image_retrieval/relation_json --densecaption_path $2 --model $6
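
Note on the BLIP-2 call used above: the GRiT/demo.py hunk reduces to a single captioning round-trip through transformers. The snippet below is a minimal standalone sketch of that call for reference only, not part of the patch; the checkpoint name matches the patch, while the image path ("example.jpg") and the max_new_tokens cap are illustrative assumptions.

from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# Same checkpoint as extract_features_with_blip2() in GRiT/demo.py above
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")

image = Image.open("example.jpg").convert("RGB")  # hypothetical input image
inputs = processor(images=image, return_tensors="pt")
generated_ids = model.generate(**inputs, max_new_tokens=30)  # illustrative length cap
caption = processor.decode(generated_ids[0], skip_special_tokens=True)
print(caption)  # short free-form description of the image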