train.jsonl

{"unique_id": "VQA_object_presence_10026521", "task_name": "VQA_object_presence", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000005277.jpg", "question": "Are there any couches in the photo?", "target_txt": "no", "options": ["no", "yes"], "prompt": "This task asks you to identify if an object appears in the image. Are there any couches in the photo?\n\n[Options]: no||||yes", "target": "no"}
{"unique_id": "mscoco_caption2014_392260", "image_source": "coco2014", "task_name": "image_caption", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000047909.jpg", "target_txt": "A parade float of a dog on a skateboard with on lookers.", "prompt": "Look at image and tell me what is the content.", "target": "A parade float of a dog on a skateboard with on lookers."}
{"unique_id": "526754_592886_4808_13689", "image_source": "coco2014", "task_name": "object_description_generate", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000526754.jpg", "region": ["57.79 177.34 491.20000000000005 629.53"], "target_txt": "zebra creature front and center", "prompt": "Generate a sentence to describe the object in the given bounding box. The description should help people to distinguish the object from other objects in the image.\n\nBounding box: 57.79 177.34 491.20000000000005 629.53", "target": "zebra creature front and center"}
{"unique_id": "526754_592886_4808_13689", "image_source": "coco2014", "task_name": "descriptive_object_region_select", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000526754.jpg", "region": ["57.79 177.34 491.20000000000005 629.53"], "options": ["414.2 277.57 537.0 628.49", "57.79 177.34 491.20000000000005 629.53"], "text": "zebra creature front and center", "target_txt": "57.79 177.34 491.20000000000005 629.53", "prompt": "Given the image, select the region of zebra creature front and center.\n\n[Options]: 414.2 277.57 537.0 628.49||||57.79 177.34 491.20000000000005 629.53", "target": "57.79 177.34 491.20000000000005 629.53"}
{"unique_id": "mscoco_text_legibility_287140", "image_source": "coco2014", "task_name": "text_legibility", "image_path": "raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000287140.jpg", "region": ["212.35294117647058 108.67088607594941 238.16806722689077 125.88607594936713"], "options": ["not clear and complete", "clear"], "target_txt": "clear", "prompt": "Decide if the text in the given region is legible. Region 212.35294117647058 108.67088607594941 238.16806722689077 125.88607594936713 \n\n[Options]: not clear and complete||||clear", "target": "clear"}
{"unique_id": "VQAv2_183888001", "image_source": "coco2014", "task_name": "open-domain_VQA", "question": "Is the bathroom cluttered?", "target_txt": "yes", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000183888.jpg", "prompt": "Is the bathroom cluttered?", "target": "yes"}
{"unique_id": "GQA_1276551_2339011", "image_source": "GQA", "task_name": "open-domain_VQA", "image_path": "./raw_datasets/GQA/images/2339011.jpg", "question": "Are the crumbs on a cookie?", "target_txt": "no", "prompt": "Are the crumbs on a cookie?", "target": "no"}
{"unique_id": "mscoco_detection2014_443727", "image_source": "coco2014", "task_name": "region_generation", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000443727.jpg", "text": "tennis racket", "region": ["17.53 161.8 233.26 311.46000000000004"], "target_txt": "17.53 161.8 233.26 311.46000000000004", "prompt": "Identify the regions that contain \"tennis racket\".", "target": "17.53 161.8 233.26 311.46000000000004"}
{"unique_id": "mscoco_caption2014_774240", "image_source": "coco2014", "task_name": "image_text_selection", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000276634.jpg", "target_txt": "A giraffe standing in the middle of a forest.", "options": ["a male in a brown shirt and tie is holding a camera ", "A street and sidewalk full of directional signs.", "A giraffe standing in the middle of a forest.", "Three sheep, one grazing on grass, while a dog stands behind them.", "a tree that has a park bench under it"], "prompt": "Which option in the options that is the caption of the image. \n\n[Options]: a male in a brown shirt and tie is holding a camera ||||A street and sidewalk full of directional signs.||||A giraffe standing in the middle of a forest.||||Three sheep, one grazing on grass, while a dog stands behind them.||||a tree that has a park bench under it", "target": "A giraffe standing in the middle of a forest."}
{"unique_id": "VQA_activity_recognition_30189738", "task_name": "VQA_activity_recognition", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000039495.jpg", "question": "What is the boy doing?", "target_txt": "standing", "options": ["eating", "sitting", "standing", "walking", "talking", "grazing", "sleeping", "swimming"], "prompt": "In this task, you will answer a question about the activity of an object in the image. The question is \"What is the boy doing?\"\n\n[Options]: eating||||sitting||||standing||||walking||||talking||||grazing||||sleeping||||swimming", "target": "standing"}
{"unique_id": "visualgenome_object_relationship_2374286_4310275", "image_source": "visualgenome", "task_name": "object_relationship", "image_path": "./raw_datasets/visual_genome/VG_100K/2374286.jpg", "region": ["6 386 332 498", "13 393 329 490"], "target_txt": "of", "meta_data": {"subject": "back", "object": "heads"}, "prompt": "Tell me the relationship between the subject in 6 386 332 498 and the object in 13 393 329 490.", "target": "of"}
{"unique_id": "mocheg_107080", "image_source": "mocheg", "task_name": "multimodal_factual_checking", "image_path": "./raw_datasets/mocheg/mocheg/train/images/107080-proof-06-3a98b45198b3333818a7b679c21d97f1.jpg", "target_txt": "not sure", "options": ["not sure", "no", "yes"], "context": "This growth pattern further weakens Romney's argument that Latin America's economy is 'almost as big.' Our ruling Romney said that 'Latin America's economy is almost as big as the economy of China.' It's a stretch to characterize an economy that's about one-third bigger as 'almost as big,' especially since China's lead over Latin America is actually increasing.", "text": "'Latin America's economy is almost as big as the economy of China.", "prompt": "Deicide if the claim can be supported by the image and the context.\n\nContext: This growth pattern further weakens Romney's argument that Latin America's economy is 'almost as big.' Our ruling Romney said that 'Latin America's economy is almost as big as the economy of China.' It's a stretch to characterize an economy that's about one-third bigger as 'almost as big,' especially since China's lead over Latin America is actually increasing.\n\nClaim: \"'Latin America's economy is almost as big as the economy of China.\"\n\n[Options]: not sure||||no||||yes", "target": "not sure"}
{"unique_id": "VQA_counting_30160429", "task_name": "VQA_counting", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000408564.jpg", "question": "How many boards are there?", "target_txt": "one", "options": ["one", "zero", "seven", "five", "four", "twelve", "eight", "two", "six"], "prompt": "This task tests your ability to count number of objects. Here is the question \"How many boards are there?\". Select the correct answer from options.\n\n[Options]: one||||zero||||seven||||five||||four||||twelve||||eight||||two||||six", "target": "one"}
{"unique_id": "visualgenome_VG_2375383_1736528", "image_source": "visualgenome", "task_name": "VG", "image_path": "./raw_datasets/visual_genome/VG_100K/2375383.jpg", "region": ["7 174 440 329"], "text": "a group of horse back riders on the beach", "target_txt": "7 174 440 329", "prompt": "The goal of this task is to find the part of the image with the description: \"a group of horse back riders on the beach\"", "target": "7 174 440 329"}
{"unique_id": "visualgenome_VG_2393103_1087187", "image_source": "visualgenome", "task_name": "select_nonoverlaped_region", "image_path": "./raw_datasets/visual_genome/VG_100K/2393103.jpg", "region": ["19 311 44 390"], "options": ["120 249 372 410", "120 250 368 413", "19 311 44 390"], "meta_data": {"object_regions": {"given_region": ["273 284 362 326"]}}, "target_txt": "19 311 44 390", "prompt": "Given the region 273 284 362 326, select an non-overlapping region from options.\n\n[Options]: 120 249 372 410||||120 250 368 413||||19 311 44 390", "target": "19 311 44 390"}
{"unique_id": "VQA_object_recognition_10141618", "task_name": "VQA_object_recognition", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000420649.jpg", "question": "What furniture is shown in the photo?", "target_txt": "bed", "options": ["giraffes", "swan", "minivan", "fruit", "bed", "hummingbird", "butterflies", "busses", "car"], "prompt": "In this task, you will be presented with an image. Your task is to answer a question about the type of object. Question: What furniture is shown in the photo?\n\n[Options]: giraffes||||swan||||minivan||||fruit||||bed||||hummingbird||||butterflies||||busses||||car\n            ", "target": "bed"}
{"unique_id": "mscoco_detection2014_365431", "image_source": "coco2014", "task_name": "missing_object_selection", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000365431.jpg", "target_txt": "None", "region": ["90.4 323.73 421.88 550.46", "352.22 0.0 499.0 352.68"], "options": ["cell phone", "cup", "None"], "meta_data": {"region_obj": ["cell phone", "cup"]}, "prompt": "Given 90.4 323.73 421.88 550.46||||352.22 0.0 499.0 352.68, select objects that do not appear in any of the regions. Select \"None\" if you can't find it.\n\n[Options]: cell phone||||cup||||None", "target": "None"}
{"unique_id": "519404_1241542_5000_0", "image_source": "coco2014", "task_name": "object_description_generate", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000519404.jpg", "region": ["0.0 45.95 238.92 454.59"], "target_txt": "Two woman one in black eatting and the other has a white shirt at the desk", "prompt": "Write a sentence to describe the object in the given region.\n\nRegion: 0.0 45.95 238.92 454.59", "target": "Two woman one in black eatting and the other has a white shirt at the desk"}
{"unique_id": "mscoco_detection2014_34404", "image_source": "coco2014", "task_name": "object_match", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000034404.jpg", "target_txt": "Yes, two objects are the same", "region": ["80.97 120.98 286.3 315.7", "0.0 1.92 70.21 358.75"], "options": ["Yes, two objects are the same", "No, two objects are different"], "prompt": "The goal of this task is to check if two regions contain the same type of object in the image. The two regions are 80.97 120.98 286.3 315.7||||0.0 1.92 70.21 358.75. \n\n[Options]: Yes, two objects are the same||||No, two objects are different", "target": "Yes, two objects are the same"}
{"unique_id": "519404_1241542_5000_0", "image_source": "coco2014", "task_name": "descriptive_object_region_generate", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000519404.jpg", "region": ["0.0 45.95 238.92 454.59"], "text": "Two woman one in black eatting and the other has a white shirt at the desk", "target_txt": "0.0 45.95 238.92 454.59", "prompt": "What is the region of the object described by \"Two woman one in black eatting and the other has a white shirt at the desk\" in image?", "target": "0.0 45.95 238.92 454.59"}
{"unique_id": "wikihow_wikihow_text_image_step_order_161279_make-a-kite-out-of-a-plastic-bag", "image_source": "wikihow", "task_name": "wikihow_text_image_step_order", "image_path": "raw_datasets/wikihow/data/161279_make-a-kite-out-of-a-plastic-bag/image/method_0_step_2.jpg", "text": "Create a cross with frame pieces.", "target_txt": "next", "meta_data": {"method": "Forming the Kite's Frame"}, "prompt": "Is the image the next or previous step? You are doing \"Forming the Kite's Frame\" and you are currently at \"Create a cross with frame pieces.\".\n\n[Options]: previous||||next", "target": "next"}
{"unique_id": "VQA_sport_recognition_10261459", "task_name": "VQA_sport_recognition", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000050410.jpg", "question": "What sport is this ?", "target_txt": "surfing", "options": ["surfing", "baseball", "snowboarding", "soccer", "playing tennis", "playing baseball", "skateboarding", "skiing"], "prompt": "Given a picture about sports, answer the following question by select an answer from the options. \n What sport is this ?\n\n[Options]: surfing||||baseball||||snowboarding||||soccer||||playing tennis||||playing baseball||||skateboarding||||skiing", "target": "surfing"}
{"unique_id": "wikihow_wikihow_next_step_161279_make-a-kite-out-of-a-plastic-bag", "image_source": "wikihow", "task_name": "wikihow_next_step", "image_path": "raw_datasets/wikihow/data/161279_make-a-kite-out-of-a-plastic-bag/image/method_0_step_0.jpg", "text": "Get the dimensions.", "context": ["Get the dimensions.", "Create a cross with frame pieces."], "target_txt": "Create a cross with frame pieces.", "meta_data": {"method": "Forming the Kite's Frame"}, "prompt": "You are doing Forming the Kite's Frame and you are at \"Get the dimensions.\" step. The previous steps you finished are\n\nGet the dimensions.\nCreate a cross with frame pieces.\nWhat is the next step?", "target": "Create a cross with frame pieces."}
{"unique_id": "visualgenome_object_relationship_1663_3088730", "image_source": "visualgenome", "task_name": "visual_subject_region", "image_path": "./raw_datasets/visual_genome/VG_100K_2/1663.jpg", "region": ["260 175 494 598"], "meta_data": {"subject": "shelf", "object": "shelf", "relation": "has", "object_regions": {"subject": ["260 175 494 598"], "object": ["261 179 489 600"]}}, "target_txt": "260 175 494 598", "prompt": "Given the object in 261 179 489 600, where is the subject in the image that has relationship: \"has\" with the object?", "target": "260 175 494 598"}
{"unique_id": "VQA_attribute_30207591", "task_name": "VQA_attribute", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000481099.jpg", "question": "What material is the ramp made of?", "target_txt": "wood", "options": ["barrier", "arch", "vest", "wood", "faces", "ramp", "bell", "face", "house"], "prompt": "What material is the ramp made of?\n\n[Options]: barrier||||arch||||vest||||wood||||faces||||ramp||||bell||||face||||house", "target": "wood"}
{"unique_id": "visualgenome_VG_2412535_148281", "image_source": "visualgenome", "task_name": "select_overlap_least_region", "image_path": "./raw_datasets/visual_genome/VG_100K/2412535.jpg", "region": ["242 132 311 493"], "options": ["78 206 187 497", "242 132 311 493", "3 3 293 498", "44 198 227 493"], "meta_data": {"object_regions": {"given_region": ["158 433 249 493"]}}, "target_txt": "242 132 311 493", "prompt": "Given region: 158 433 249 493, decide which option has the least common area with it.\n\n[Options]: 78 206 187 497||||242 132 311 493||||3 3 293 498||||44 198 227 493", "target": "242 132 311 493"}
{"unique_id": "2406730_202549", "image_source": "Visual7W", "task_name": "VQA", "image_path": "raw_datasets/visual7w/images/v7w_2406730.jpg", "question": "What is the man in the back doing?", "options": ["Lifting cans.", "Watching.", "Pushups.", "Rejoicing."], "target_txt": "Watching.", "meta_data": {"question_type": "what"}, "prompt": "What is the man in the back doing?\n\n[Options]: Lifting cans.||||Watching.||||Pushups.||||Rejoicing.", "target": "Watching."}
{"unique_id": "581857_1719310_0_0", "image_source": "coco2014", "task_name": "object_description_generate", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000581857.jpg", "region": ["103.93 299.99 238.15 477.40999999999997"], "target_txt": "navy blue shirt", "prompt": "Generate a sentence to describe the object in the given bounding box. The description should help people to distinguish the object from other objects in the image.\n\nBounding box: 103.93 299.99 238.15 477.40999999999997", "target": "navy blue shirt"}
{"unique_id": "visualgenome_VG_2364191_2467386", "image_source": "visualgenome", "task_name": "select_overlap_most_region", "image_path": "./raw_datasets/visual_genome/VG_100K/2364191.jpg", "region": ["21 26 381 259"], "options": ["314 200 345 222", "21 26 381 259", "136 243 327 333", "13 4 73 96"], "meta_data": {"object_regions": {"given_region": ["232 147 344 226"]}}, "target_txt": "21 26 381 259", "prompt": "Given the region 232 147 344 226, decide which region in the options overlaps most with given region.\n\n[Options]: 314 200 345 222||||21 26 381 259||||136 243 327 333||||13 4 73 96", "target": "21 26 381 259"}
{"unique_id": "vizwiz_image_quality_1", "image_source": "vizwiz", "task_name": "image_quality", "image_path": "./raw_datasets/vizwiz_image_quality/train/VizWiz_train_00022585.jpg", "target_txt": "blur", "options": ["no flaws", "blur", "rotation", "too bright", "too dark", "other", "obscured", "bad framing"], "prompt": "Tell me what is wrong with the image. \n\n[Options]: no flaws||||blur||||rotation||||too bright||||too dark||||other||||obscured||||bad framing", "target": "blur"}
{"unique_id": "VQA_utility_affordance_50001019", "task_name": "VQA_utility_affordance", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000012224.jpg", "question": "Which object can be used to protect people from rain?", "target_txt": "umbrella", "options": ["watching", "umbrella", "cup", "basketball", "desk", "brush", "suitcase", "four", "flush"], "prompt": "In this task, you need to pay attention to the possible actions can be taken to the objects in the image and answer the following question. Which object can be used to protect people from rain?\n\n[Options]: watching||||umbrella||||cup||||basketball||||desk||||brush||||suitcase||||four||||flush", "target": "umbrella"}
{"unique_id": "mscoco_detection2014_41074", "image_source": "coco2014", "task_name": "object_region_match", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000041074.jpg", "text": "teddy bear", "region": ["3.37 142.35 340.45 456.96000000000004"], "target_txt": "the region does not have the object", "options": ["the region has the object", "the region does not have the object"], "prompt": "Does the region 3.37 142.35 340.45 456.96000000000004 contain \"teddy bear\"? \n\n[Options]: the region has the object||||the region does not have the object", "target": "the region does not have the object"}
{"unique_id": "mscoco_detection2014_322349", "image_source": "coco2014", "task_name": "region_object_selection", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000322349.jpg", "target_txt": "giraffe, sheep", "region": ["58.66 48.45 195.27 492.52", "234.38 299.4 245.06 313.35999999999996"], "options": ["sheep", "giraffe", "None"], "meta_data": {"region_obj": ["giraffe", "sheep"]}, "prompt": "Given Regions:  58.66 48.45 195.27 492.52||||234.38 299.4 245.06 313.35999999999996, decide which object appears in at least one of the region.\n\n[Options]: sheep||||giraffe||||None", "target": "giraffe, sheep"}
{"unique_id": "519404_1241542_5000_0", "image_source": "coco2014", "task_name": "descriptive_object_region_select", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000519404.jpg", "region": ["0.0 45.95 238.92 454.59"], "options": ["162.87 119.69 204.28 153.38", "189.84 212.49 549.03 449.8", "221.62 173.51 294.05 244.85999999999999", "364.63 1.2 640.0 474.67", "264.3 426.49 366.45000000000005 480.0", "0.0 45.95 238.92 454.59", "458.25 423.49 484.68 454.56", "136.73 33.36 314.15999999999997 226.45"], "text": "Two woman one in black eatting and the other has a white shirt at the desk", "target_txt": "0.0 45.95 238.92 454.59", "prompt": "Given the image, select the region of Two woman one in black eatting and the other has a white shirt at the desk.\n\n[Options]: 162.87 119.69 204.28 153.38||||189.84 212.49 549.03 449.8||||221.62 173.51 294.05 244.85999999999999||||364.63 1.2 640.0 474.67||||264.3 426.49 366.45000000000005 480.0||||0.0 45.95 238.92 454.59||||458.25 423.49 484.68 454.56||||136.73 33.36 314.15999999999997 226.45", "target": "0.0 45.95 238.92 454.59"}
{"unique_id": "visual_attribute_2389560001", "task_name": "visual_attribute", "image_path": "./raw_datasets/visual_genome/VG_100K_2/2389560.jpg", "options": ["tan", "plastic", "large", "yellow", "orange", "white", "green"], "target_txt": "tan", "region": ["259 143 345 279"], "prompt": "Given object in 259 143 345 279, select its attribute.\n\n[Options]: tan||||plastic||||large||||yellow||||orange||||white||||green", "target": "tan"}
{"unique_id": "visualgenome_VG_2363911_2480836", "image_source": "visualgenome", "task_name": "select_overlaped_region", "image_path": "./raw_datasets/visual_genome/VG_100K/2363911.jpg", "region": ["83 187 499 372"], "options": ["0 152 190 204", "153 54 199 95", "83 187 499 372", "0 157 192 215"], "meta_data": {"object_regions": {"given_region": ["201 92 436 189"]}}, "target_txt": "83 187 499 372", "prompt": "Given the region 201 92 436 189, select an overlapping region from options.\n\n[Options]: 0 152 190 204||||153 54 199 95||||83 187 499 372||||0 157 192 215", "target": "83 187 499 372"}
{"unique_id": "581857_1719310_0_0", "image_source": "coco2014", "task_name": "descriptive_object_region_generate", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000581857.jpg", "region": ["103.93 299.99 238.15 477.40999999999997"], "text": "navy blue shirt", "target_txt": "103.93 299.99 238.15 477.40999999999997", "prompt": "In this task, you are required to identify the object that is described by \"navy blue shirt\" and output the region of that object.", "target": "103.93 299.99 238.15 477.40999999999997"}
{"unique_id": "VQAv2_171837000", "image_source": "coco2014", "task_name": "question_image_match", "question": "According to the hat, what tourist destination is he visiting?", "target_txt": "yes", "image_path": "/projects/nlp_lab/zhiyang/projects/datasets/pretrain/VQAv2/train2014/COCO_train2014_000000171837.jpg", "answer": "puerto rico", "options": ["no", "yes"], "prompt": "Given content of image, do you have enough information to answer \"According to the hat, what tourist destination is he visiting?\" \n\n[Options]: no||||yes", "target": "yes"}
{"unique_id": "visualgenome_VG_2371877_2101306", "image_source": "visualgenome", "task_name": "if_region_overlap", "image_path": "./raw_datasets/visual_genome/VG_100K/2371877.jpg", "region": ["208 187 234 207"], "options": ["no", "yes"], "target_txt": "no", "meta_data": {"object_regions": {"given_region": ["0 212 494 267"]}}, "prompt": "Given the region 0 212 494 267, decide if 208 187 234 207 overlaps with it.\n\n[Options]: no||||yes", "target": "no"}
{"unique_id": "526754_592886_4808_13689", "image_source": "coco2014", "task_name": "descriptive_object_region_generate", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000526754.jpg", "region": ["57.79 177.34 491.20000000000005 629.53"], "text": "zebra creature front and center", "target_txt": "57.79 177.34 491.20000000000005 629.53", "prompt": "Where is the object described by \"zebra creature front and center\"?", "target": "57.79 177.34 491.20000000000005 629.53"}
{"unique_id": "wikihow_wikihow_immediate_next_step_selection_161279_make-a-kite-out-of-a-plastic-bag", "image_source": "wikihow", "task_name": "wikihow_immediate_next_step_selection", "image_path": "/projects/nlp_lab/yings/mata_data/seen_data/wikihow/data/161279_make-a-kite-out-of-a-plastic-bag/image/method_0_step_0.jpg", "options": ["Create a cross with frame pieces.", "Tie the frame together."], "target_txt": "Create a cross with frame pieces.", "meta_data": {"method": "Forming the Kite's Frame"}, "prompt": "The overall goal is to Forming the Kite's Frame. You are at the step specified by the content of the image. Select the immediate next step from the options.\n\n[Options]: Create a cross with frame pieces.||||Tie the frame together.", "target": "Create a cross with frame pieces."}
{"unique_id": "VQA_color_30002873", "task_name": "VQA_color", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000309086.jpg", "question": "What color handle does the middle man's racquet have?", "target_txt": "black", "options": ["pink", "brown", "orange", "blue", "black", "silver", "beige", "green"], "prompt": "In this task, you are asked the color of some object in the image. Question: What color handle does the middle man's racquet have?\n\n[Options]: pink||||brown||||orange||||blue||||black||||silver||||beige||||green", "target": "black"}
{"unique_id": "VQA_sentiment_understanding_30111210", "task_name": "VQA_sentiment_understanding", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000286724.jpg", "question": "What are elephants thought to be afraid of?", "target_txt": "mice", "options": ["boys", "disgust", "cold", "relaxed", "anger", "sleepy", "mice", "smiling", "happy"], "prompt": "Please analyze the sentiment depicted in the image and answer the question.\n What are elephants thought to be afraid of?\n\n[Options]: boys||||disgust||||cold||||relaxed||||anger||||sleepy||||mice||||smiling||||happy", "target": "mice"}
{"unique_id": "visualgenome_VG_2376935_494025", "image_source": "visualgenome", "task_name": "open-domain_VQA", "image_path": "./raw_datasets/visual_genome/VG_100K/2376935.jpg", "question": "What color is the grass?", "target_txt": "Green", "prompt": "What color is the grass?", "target": "Green"}
{"unique_id": "visualgenome_object_relationship_2330763_2902313", "image_source": "visualgenome", "task_name": "visual_subject_identification", "image_path": "./raw_datasets/visual_genome/VG_100K/2330763.jpg", "region": ["0 262 42 288", "1 5 331 408"], "target_txt": "handrail", "meta_data": {"subject": "handrail", "object": "wall", "relation": "on"}, "prompt": "Which subject in the image that has on with the object in 1 5 331 408", "target": "handrail"}
{"unique_id": "visualgenome_VG_2369824_2198947", "image_source": "visualgenome", "task_name": "GC", "image_path": "./raw_datasets/visual_genome/VG_100K/2369824.jpg", "region": ["147 72 193 121"], "target_txt": "square white ceramic tile", "prompt": "Generate a caption for 147 72 193 121.", "target": "square white ceramic tile"}
{"unique_id": "mscoco_detection2014_143984", "image_source": "coco2014", "task_name": "object_grounding", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000143984.jpg", "target_txt": "toilet", "region": ["0.0 152.66 171.36 324.77"], "prompt": "What is the object in 0.0 152.66 171.36 324.77", "target": "toilet"}
{"unique_id": "mscoco_text_type_287140", "image_source": "coco2014", "task_name": "text_type", "image_path": "raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000287140.jpg", "region": ["212.35294117647058 108.67088607594941 238.16806722689077 125.88607594936713"], "options": ["others", "handwritten", "machine printed"], "target_txt": "machine printed", "prompt": "Read the text in 212.35294117647058 108.67088607594941 238.16806722689077 125.88607594936713 of the image and select the type of text from options. \n\n[Options]: others||||handwritten||||machine printed", "target": "machine printed"}
{"unique_id": "ok_vqa_train_51606", "image_source": "mscoco2014", "task_name": "open-domain_VQA", "image_path": "raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000051606.jpg", "target_txt": "pony tail", "question": "What is the hairstyle of the blond called?", "meta_data": {"answers": ["pony tail", "pony tail", "pony tail", "pony tail", "pony tail", "pony tail", "braid", "braid", "ponytail", "ponytail"]}, "prompt": "What is the hairstyle of the blond called?", "target": "pony tail"}
{"unique_id": "visualgenome_VG_2404117_561719", "image_source": "visualgenome", "task_name": "VG_selection", "image_path": "./raw_datasets/visual_genome/VG_100K_2/2404117.jpg", "region": ["18 129 61 168"], "options": ["18 129 61 168", "450 142 498 222", "108 1 155 176", "17 164 34 193", "23 126 57 216", "192 109 379 264"], "text": "a red stop sign", "target_txt": "18 129 61 168", "prompt": "Select region in the image that \"a red stop sign\" describes.\n\n[Options]: 18 129 61 168||||450 142 498 222||||108 1 155 176||||17 164 34 193||||23 126 57 216||||192 109 379 264", "target": "18 129 61 168"}
{"unique_id": "visualgenome_object_relationship_2372067_4156495", "image_source": "visualgenome", "task_name": "visual_object_region", "image_path": "./raw_datasets/visual_genome/VG_100K/2372067.jpg", "region": ["0 349 498 481"], "meta_data": {"subject": "shadow", "object": "ground", "relation": "on", "object_regions": {"subject": ["2 444 462 474"], "object": ["0 349 498 481"]}}, "target_txt": "0 349 498 481", "prompt": "Which object has the relationship \"on\" with the subject in 2 444 462 474? Answer the question by generating the region of the object.", "target": "0 349 498 481"}
{"unique_id": "VQA_scene_recognition_10459855", "task_name": "VQA_scene_recognition", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000037157.jpg", "question": "Is this a indoor scene?", "target_txt": "yes", "options": ["slope", "foggy", "gloomy", "snowy", "fall", "winter", "basil", "coat", "yes"], "prompt": "In this task, you need to pay attention to the scene in the image and answer the following question.\n Is this a indoor scene?\n\n[Options]: slope||||foggy||||gloomy||||snowy||||fall||||winter||||basil||||coat||||yes", "target": "yes"}
{"unique_id": "mscoco_region_text_match_370258", "image_source": "coco2014", "task_name": "region_text_match", "image_path": "raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000370258.jpg", "region": ["108 105 246 142"], "text": "station", "options": ["yes, the text matches the text in the region", "no, the text is different from the text in the region"], "target_txt": "no, the text is different from the text in the region", "prompt": "Does 108 105 246 142 have the letters \"station\"? \n\n[Options]: yes, the text matches the text in the region||||no, the text is different from the text in the region", "target": "no, the text is different from the text in the region"}
{"unique_id": "mscoco_text_localization_370258", "image_source": "coco2014", "task_name": "text_localization", "options": ["108 105 246 142", "266 105 449 142", "431.88310233979763 328.99746192893406 557.0770315134907 368.9086294416244"], "image_path": "raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000370258.jpg", "region": ["108 105 246 142"], "text": "peene", "target_txt": "108 105 246 142", "prompt": "Which region contains \"peene\" \n\n[Options]: 108 105 246 142||||266 105 449 142||||431.88310233979763 328.99746192893406 557.0770315134907 368.9086294416244", "target": "108 105 246 142"}
{"unique_id": "581857_1719310_0_0", "image_source": "coco2014", "task_name": "descriptive_object_region_select", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000581857.jpg", "region": ["103.93 299.99 238.15 477.40999999999997"], "options": ["372.81 363.25 384.28000000000003 374.72", "152.32 265.18 219.76999999999998 365.44", "11.4 595.97 180.79 640.0", "56.09 529.28 190.39000000000001 605.43", "33.6 570.63 54.18 607.12", "353.15 360.57 367.82 373.73", "33.63 548.4 63.660000000000004 574.11", "332.4 371.05 349.53999999999996 384.86", "346.13 373.67 363.37 385.81", "115.33 465.3 251.5 543.5", "202.09 70.54 259.12 128.66", "339.98 446.65 409.8 521.89", "363.79 371.19 378.39000000000004 385.79", "335.79 418.43 420.45000000000005 444.2", "171.22 500.36 327.48 593.45", "286.36 521.21 328.77 567.85", "216.58 261.7 298.22 473.59", "103.93 299.99 238.15 477.40999999999997", "358.93 353.37 370.76 365.43", "336.85 359.26 353.78000000000003 373.08", "377.71 530.69 402.0 571.5600000000001", "163.36 580.87 191.36 603.72", "325.01 343.61 350.09999999999997 364.25", "345.34 353.8 359.13 364.49", "83.6 270.27 168.64999999999998 453.33", "224.36 548.31 279.01 621.66", "249.13 67.33 297.24 130.51", "209.65 129.24 260.73 170.78", "0.0 600.19 38.63 640.0", "369.84 352.81 380.03 365.34", "325.48 370.13 336.21000000000004 384.09", "1 164 426 639", "255.78 449.75 350.31 523.0"], "text": "navy blue shirt", "target_txt": "103.93 299.99 238.15 477.40999999999997", "prompt": "In this task, you are required to identify the object that is described by \"navy blue shirt\" and select the region of that object from options.\n\n[Options]: 372.81 363.25 384.28000000000003 374.72||||152.32 265.18 219.76999999999998 365.44||||11.4 595.97 180.79 640.0||||56.09 529.28 190.39000000000001 605.43||||33.6 570.63 54.18 607.12||||353.15 360.57 367.82 373.73||||33.63 548.4 63.660000000000004 574.11||||332.4 371.05 349.53999999999996 384.86||||346.13 373.67 363.37 385.81||||115.33 465.3 251.5 543.5||||202.09 70.54 259.12 128.66||||339.98 446.65 409.8 521.89||||363.79 371.19 378.39000000000004 385.79||||335.79 418.43 420.45000000000005 444.2||||171.22 500.36 327.48 593.45||||286.36 521.21 328.77 567.85||||216.58 261.7 298.22 473.59||||103.93 299.99 238.15 477.40999999999997||||358.93 353.37 370.76 365.43||||336.85 359.26 353.78000000000003 373.08||||377.71 530.69 402.0 571.5600000000001||||163.36 580.87 191.36 603.72||||325.01 343.61 350.09999999999997 364.25||||345.34 353.8 359.13 364.49||||83.6 270.27 168.64999999999998 453.33||||224.36 548.31 279.01 621.66||||249.13 67.33 297.24 130.51||||209.65 129.24 260.73 170.78||||0.0 600.19 38.63 640.0||||369.84 352.81 380.03 365.34||||325.48 370.13 336.21000000000004 384.09||||1 164 426 639||||255.78 449.75 350.31 523.0", "target": "103.93 299.99 238.15 477.40999999999997"}
{"unique_id": "visualgenome_object_relationship_2388063_4770642", "image_source": "visualgenome", "task_name": "visual_object_identification", "image_path": "./raw_datasets/visual_genome/VG_100K_2/2388063.jpg", "region": ["268 303 288 326", "205 242 301 331"], "target_txt": "luggage", "meta_data": {"subject": "wheel", "object": "luggage", "relation": "on"}, "prompt": "In this task, you are asked to identify the object given tne region of the subject in the image and their relationship. The subject is in 268 303 288 326 and relationship is on. The object is", "target": "luggage"}
{"unique_id": "mscoco_caption2014_362553", "image_source": "coco2014", "task_name": "ITM", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000153609.jpg", "text": "A couple of people that are talking to each other.", "target_txt": "Yes, the text matches the content of the image", "options": ["Yes, the text matches the content of the image", "No, the text does not match the content of the image"], "prompt": "Does the text: \"A couple of people that are talking to each other.\" and the content of image match? \n\n[Options]: Yes, the text matches the content of the image||||No, the text does not match the content of the image", "target": "Yes, the text matches the content of the image"}
{"unique_id": "wikihow_wikihow_image_text_step_order_161279_make-a-kite-out-of-a-plastic-bag", "image_source": "wikihow", "task_name": "wikihow_image_text_step_order", "image_path": "/projects/nlp_lab/yings/mata_data/seen_data/wikihow/data/161279_make-a-kite-out-of-a-plastic-bag/image/method_0_step_1.jpg", "text": "Tie the frame together.", "target_txt": "next", "meta_data": {"method": "Forming the Kite's Frame"}, "prompt": "Is \"Tie the frame together.\" the next or previous step? You are doing \"Forming the Kite's Frame\" and you are currently at the step described by the image.\n\n[Options]: previous||||next", "target": "next"}
{"unique_id": "VQA_positional_reasoning_10622680", "task_name": "VQA_positional_reasoning", "image_path": "./raw_datasets/MSCOCO2014/train2014/COCO_train2014_000000126719.jpg", "question": "What is to the left of cup?", "target_txt": "cake", "options": ["shrimp", "traffic", "toilet", "modern", "shadows", "black", "stools", "patio", "cake"], "prompt": "What is to the left of cup?\n\n[Options]: shrimp||||traffic||||toilet||||modern||||shadows||||black||||stools||||patio||||cake", "target": "cake"}