Blip2 #2

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open · wants to merge 11 commits into base: main
53 changes: 52 additions & 1 deletion GRiT/demo.py
@@ -9,6 +9,7 @@
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.utils.logger import setup_logger
from transformers import Blip2Processor, Blip2ForConditionalGeneration

sys.path.insert(0, 'third_party/CenterNet2/projects/CenterNet2/')
from centernet.config import add_centernet_config
@@ -80,8 +81,57 @@ def get_parser():
nargs=argparse.REMAINDER,
)
return parser

def extract_features_with_blip2(image):
    """Generate a short BLIP-2 caption for an image given as a BGR numpy array."""
    processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
    model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")

    # read_image() loads BGR; BLIP-2 expects RGB, so flip the channel order.
    inputs = processor(images=image[:, :, ::-1].copy(), return_tensors="pt")
    outputs = model.generate(**inputs)
    text_description = processor.decode(outputs[0], skip_special_tokens=True)

    return text_description

def process_image(cfg, img_path, output_dir):
"""
Processes a single image using GRiT and saves results in JSON format.
Args:
cfg (CfgNode): Detectron2 configuration.
img_path (str): Path to the image file.
output_dir (str): Directory to save the JSON file (optional).
Returns:
None
"""

demo = VisualizationDemo(cfg)
img = read_image(img_path, format="BGR")
text_description = extract_features_with_blip2(img)
start_time = time.time()
predictions, visualized_output, bbox = demo.run_on_image(img)

if output_dir:
json_file = {}
predict_object = bbox.pred_object_descriptions.data
predict_box = bbox.pred_boxes

for (name, box) in zip(predict_object, predict_box):
if name not in json_file:
json_file[name] = [box.tolist()]
else:
                json_file[name].append(box.tolist())

out_filename = os.path.join(output_dir, os.path.splitext(os.path.basename(img_path))[0] + ".json")
with open(out_filename, "w") as outfile:
json.dump(json_file, outfile)
else:
cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
if cv2.waitKey(0) == 27:
cv2.destroyAllWindows()  

if __name__ == "__main__":
mp.set_start_method("spawn", force=True)
args = get_parser().parse_args()
@@ -97,6 +147,7 @@ def get_parser():
for path in tqdm.tqdm(os.listdir(args.input[0]), disable=not args.output):
img = read_image(os.path.join(args.input[0], path), format="BGR")
start_time = time.time()
            process_image(cfg, os.path.join(args.input[0], path), args.output)
predictions, visualized_output, bbox = demo.run_on_image(img)

if args.output:
@@ -123,4 +174,4 @@ def get_parser():
cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
if cv2.waitKey(0) == 27:
break # esc to quit
break # esc to quit
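
For reference, a minimal standalone sketch of the BLIP-2 captioning call that extract_features_with_blip2 wraps, assuming the same Salesforce/blip2-flan-t5-xl checkpoint and a placeholder example.jpg; the processor and model are loaded once up front purely for illustration:

from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")

image = Image.open("example.jpg").convert("RGB")  # placeholder image path
inputs = processor(images=image, return_tensors="pt")
generated_ids = model.generate(**inputs)
caption = processor.decode(generated_ids[0], skip_special_tokens=True)
print(caption)
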
121 changes: 121 additions & 0 deletions image_retrieval/blip2.py
@@ -0,0 +1,121 @@
from helper_function import *
import pandas as pd
import faiss
import numpy as np
import torch
import clip
import argparse
from transformers import Blip2Processor, Blip2Model, AutoTokenizer
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type = str)
parser.add_argument("--image_path", type=str, help="path to images")
parser.add_argument("--text_relation_path", type=str, help="path to relation files")
parser.add_argument("--dense_caption_path", type=str, help="path to densecaption")
parser.add_argument("--model", type=str)
args = parser.parse_args()

device = "cuda:0"
data = pd.read_pickle(args.dataset) ### Flickr30k or MSCOCO test set
processor = Blip2Processor.from_pretrained(args.model)
model = Blip2Model.from_pretrained(args.model)
tokenizer = AutoTokenizer.from_pretrained(args.model)
#model, preprocess = clip.load("RN50", device='cpu')
model.cuda(device).eval()
IMAGE_PATH = args.image_path + "{}.jpg"
TEXT_JSON_PATH = args.text_relation_path + "/{}.json"
DENSE_CAPTION_PAYTH = args.dense_caption_path + "/{}.json"

def subimage_score_embedding(image, text):
if text:
# image = preprocess(image)
# text_input = clip.tokenize(text).cuda(device)
# image_input = torch.tensor(np.stack([image])).cuda(device)
# with torch.no_grad():
# image_embed = model.encode_image(image_input).float()
# text_embed = model.encode_text(text_input).float()
# score = text_embed @ image_embed.T
# return image_embed, score
        # Encode the sub-image and its text on the GPU the model lives on.
        image_inputs = processor(images=image, return_tensors="pt").to(device)
        image_input = model.get_image_features(**image_inputs)
        text_inputs = tokenizer(text, padding=True, return_tensors="pt").to(device)
        text_input = model.get_text_features(**text_inputs)
with torch.no_grad():
original_image_embed = image_input.float()
original_text_embed = text_input.float()
#original_image_embed = model.encode_image(image_input).float()
#original_text_embed = model.encode_text(text_input).float()
score = original_text_embed @ original_image_embed.T
return original_image_embed, score

else:
return None, None

def comclip_one_pair(row_id, caption, image_id):
    # Load the full image with the repo's read_image helper (IMAGE_PATH supplies the .jpg template).
    image = read_image(image_id, IMAGE_PATH)
    image_inputs = processor(images=image, return_tensors="pt").to(device)
    image_input = model.get_image_features(**image_inputs)
    text_inputs = tokenizer(caption, padding=True, return_tensors="pt").to(device)
    text_input = model.get_text_features(**text_inputs)
    with torch.no_grad():
        original_image_embed = image_input.float()
        original_text_embed = text_input.float()
text_json = get_sentence_json(row_id, TEXT_JSON_PATH)
object_images, key_map = create_sub_image_obj(row_id, image_id, IMAGE_PATH, TEXT_JSON_PATH, DENSE_CAPTION_PAYTH)
relation_images, relation_words = create_relation_object(object_images, text_json, image_id, key_map, IMAGE_PATH)
if relation_images and relation_words:
for relation_image, word in zip(relation_images, relation_words):
if word in object_images:
object_images[word+"_dup"] = relation_image
else:
object_images[word] = relation_image

##subimages
# Create image embeddings array
image_embeds = []
image_scores = []
for key, sub_image in object_images.items():
if "_dup" in key:
key = key.replace("_dup", "")
image_embed, image_score = subimage_score_embedding(sub_image, key)
if image_embed is not None and image_score is not None:
            # Collect the sub-image embedding and its text-match score.
            image_embeds.append(image_embed.detach().cpu().numpy())
            image_scores.append(image_score)
    image_embeds = np.concatenate(image_embeds, axis=0)
    image_embed_dim = image_embeds.shape[1]
index = faiss.IndexFlatL2(image_embed_dim)
    # Add all sub-image embeddings to the index
index.add(image_embeds)
    # Regularize the scores, then fuse the weighted sub-image embeddings into
    # the full-image embedding.
    similarity = normalize_tensor_list(image_scores)
    sub_image_embeds = torch.from_numpy(image_embeds).to(original_image_embed.device)
    for score, sub_embed in zip(similarity, sub_image_embeds):
        original_image_embed += score * sub_embed
image_features = original_image_embed / original_image_embed.norm(dim=-1, keepdim=True).float()
text_features = original_text_embed /original_text_embed.norm(dim=-1, keepdim=True).float()
similarity = text_features.detach().cpu().numpy() @ image_features.detach().cpu().numpy().T
return similarity

def get_score(row_id):
result = {}
row = data.iloc[row_id]
candidates = row.clip_top_ten_pick
for candidate in candidates:
result[candidate[0]] = comclip_one_pair(row_id, row.sentence, candidate[0]).item()
result = dict(sorted(result.items(), key=lambda x: x[1], reverse=True))
return result

if __name__ == "__main__":
comclip_score = {}
for idx, row in data.iterrows():
try:
comclip_score[idx] = get_score(idx)
except Exception as e:
print(e)
top_1 = 0
top_5 = 0
for idx, value in comclip_score.items():
candidates = list(value.keys())
candidates = [int(i) for i in candidates]
if candidates[0] == int(idx):
top_1 += 1
if int(idx) in candidates[:5]:
top_5 += 1
print("Top 1 score: {}. Top 5 score: {}".format(top_1/ 1000, top_5/ 1000))
28 changes: 21 additions & 7 deletions image_retrieval/clip_baseline.py
@@ -3,25 +3,39 @@
import torch
import clip
import argparse
from PIL import Image
from transformers import Blip2Processor, Blip2Model, AutoTokenizer
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type = str, help="csv file for the flickr30k")
parser.add_argument("--model", type=str, help="RN50, ViT/B-32, ViT/L-14")
parser.add_argument("--model", type=str, help="Salesforce/blip2-opt-2.7b")
parser.add_argument("--image_path", type=str, help="path to the image")
args = parser.parse_args()

device = "cuda:0"
data = pd.read_pickle(args.dataset)
model, preprocess = clip.load(args.model, device='cpu')
processor = Blip2Processor.from_pretrained(args.model)
model = Blip2Model.from_pretrained(args.model)
tokenizer = AutoTokenizer.from_pretrained(args.model)
#model, preprocess = clip.load(args.model, device='cpu')
model.cuda(device).eval()
IMAGE_PATH = args.image_path+ "/{}.jpg"

def clip_compute_one_pair(caption, image_id):

    # IMAGE_PATH supplies the image directory and .jpg suffix for the image id.
    image = Image.open(IMAGE_PATH.format(image_id)).convert("RGB")
    image_inputs = processor(images=image, return_tensors="pt").to(device)
    image_input = model.get_image_features(**image_inputs)

image = preprocess(read_image(image_id, IMAGE_PATH))
text_input = clip.tokenize(caption).cuda(device)
image_input = torch.tensor(np.stack([image])).cuda(device)
    text_inputs = tokenizer(caption, padding=True, return_tensors="pt").to(device)
    text_input = model.get_text_features(**text_inputs)
#text_input = clip.tokenize(caption).cuda(device)
#image_input = torch.tensor(np.stack([image])).cuda(device)
with torch.no_grad():
original_image_embed = model.encode_image(image_input).float()
original_text_embed = model.encode_text(text_input).float()
original_image_embed = image_input.float()
original_text_embed = text_input.float()
#original_image_embed = model.encode_image(image_input).float()
#original_text_embed = model.encode_text(text_input).float()
image_features = original_image_embed / original_image_embed.norm(dim=-1, keepdim=True).float()
text_features = original_text_embed /original_text_embed.norm(dim=-1, keepdim=True).float()
similarity = text_features.detach().cpu().numpy() @ image_features.detach().cpu().numpy().T
@@ -57,4 +71,4 @@ def get_score(row_id):
top_10 += 1
print("Top 1 {}. Top 5 {}".format(top_1/ 1000, top_5/ 1000))



33 changes: 24 additions & 9 deletions image_retrieval/comclip.py
@@ -3,6 +3,7 @@
import torch
import clip
import argparse
from transformers import Blip2Processor, Blip2Model, AutoTokenizer
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type = str)
parser.add_argument("--image_path", type=str, help="path to images")
@@ -13,22 +14,36 @@

device = "cuda:0"
data = pd.read_pickle(args.dataset) ### Flickr30k or MSCOCO test set
model, preprocess = clip.load("RN50", device='cpu')
processor = Blip2Processor.from_pretrained(args.model)
model = Blip2Model.from_pretrained(args.model)
tokenizer = AutoTokenizer.from_pretrained(args.model)
#model, preprocess = clip.load("RN50", device='cpu')
model.cuda(device).eval()
IMAGE_PATH = args.image_path + "{}.jpg"
TEXT_JSON_PATH = args.text_relation_path + "/{}.json"
DENSE_CAPTION_PAYTH = args.dense_caption_path + "/{}.json"

def subimage_score_embedding(image, text):
if text:
image = preprocess(image)
text_input = clip.tokenize(text).cuda(device)
image_input = torch.tensor(np.stack([image])).cuda(device)
# image = preprocess(image)
# text_input = clip.tokenize(text).cuda(device)
# image_input = torch.tensor(np.stack([image])).cuda(device)
# with torch.no_grad():
# image_embed = model.encode_image(image_input).float()
# text_embed = model.encode_text(text_input).float()
# score = text_embed @ image_embed.T
# return image_embed, score
        image_inputs = processor(images=image, return_tensors="pt").to(device)
        image_input = model.get_image_features(**image_inputs)
        text_inputs = tokenizer(text, padding=True, return_tensors="pt").to(device)
        text_input = model.get_text_features(**text_inputs)
with torch.no_grad():
image_embed = model.encode_image(image_input).float()
text_embed = model.encode_text(text_input).float()
score = text_embed @ image_embed.T
return image_embed, score
original_image_embed = image_input.float()
original_text_embed = text_input.float()
#original_image_embed = model.encode_image(image_input).float()
#original_text_embed = model.encode_text(text_input).float()
score = original_text_embed @ original_image_embed.T
return original_image_embed, score

else:
return None, None

@@ -93,4 +108,4 @@ def get_score(row_id):
top_1 += 1
if int(idx) in candidates[:5]:
top_5 += 1
print("Top 1 score: {}. Top 5 score: {}".format(top_1/ 1000, top_5/ 1000))
print("Top 1 score: {}. Top 5 score: {}".format(top_1/ 1000, top_5/ 1000))
3 changes: 2 additions & 1 deletion image_retrieval/comclip.sh
@@ -11,4 +11,5 @@ mkdir image_retrieval/relation_json
python image_retrieval/parse_relation.py --relation_json_path image_retrieval/relation_json --data_path $4
mkdir image_retrieval/matched_relation
python image_retrieval/match_relation.py --densecaption_path $2 --dataset_path $4 --openai $5
python image_retrieval/comclip.py --dataset $4 --image_path $1 --text_relation_path image_retrieval/relation_json --densecaption_path $2 --model $6
#python image_retrieval/comclip.py --dataset $4 --image_path $1 --text_relation_path image_retrieval/relation_json --densecaption_path $2 --model $6
python image_retrieval/blip2.py --dataset $4 --image_path $1 --text_relation_path image_retrieval/relation_json --dense_caption_path $2 --model $6