# image_caption_dataset.py

import csv
import json
import os

import datasets
import pandas as pd
import numpy as np


class ImageCaptionBuilderConfig(datasets.BuilderConfig):
    """Builder configuration describing one image-caption dataset variant.

    Args:
        name: Configuration name, forwarded to ``datasets.BuilderConfig``.
        splits: Names of the splits to generate for this configuration.
        langs: Language keys; the builder adds one string feature per entry.
        prefix_before_image_fn: When true, the builder prepends
            ``"<name>_<split>_"`` to each image file name.
        zfill: Width to which the image id is zero-padded when the builder
            derives the image file name.
        **kwargs: Remaining keyword arguments, forwarded unchanged to
            ``datasets.BuilderConfig``.
    """

    def __init__(self, name, splits, langs, prefix_before_image_fn=False, zfill=1, **kwargs):
        # Let the base class handle the generic options (name, version, data_dir, ...).
        super().__init__(name=name, **kwargs)

        # Image-file naming options.
        self.zfill = zfill
        self.prefix_before_image_fn = prefix_before_image_fn

        # Which languages and splits this configuration exposes.
        self.langs = langs
        self.splits = splits


# TODO: Add BibTeX citation
# Find for instance the citation on arXiv or on the dataset repo/website.
# Placeholder BibTeX entry; passed to `datasets.DatasetInfo(citation=...)` in `_info`.
_CITATION = """\
@InProceedings{None,
    title = {Generic images to captions dataset},
    author={Yih-Dar SHIEH},
    year={2020}
}
"""

# TODO: Add description of the dataset here
# You can copy an official description.
# Currently just a newline; passed to `datasets.DatasetInfo(description=...)` in `_info`.
_DESCRIPTION = """\

"""

# TODO: Add a link to an official homepage for the dataset here
# Currently empty; passed to `datasets.DatasetInfo(homepage=...)` in `_info`.
_HOMEPAGE = ""

# TODO: Add the licence for the dataset here if you can find it
# Currently empty; passed to `datasets.DatasetInfo(license=...)` in `_info`.
_LICENSE = ""

# TODO: Add link to the official dataset URLs here
# The HuggingFace datasets library doesn't host the datasets; it only points to the original files.
# This can be an arbitrary nested dict/list of URLs (see the `_split_generators` method below).
# NOTE: not referenced by the visible code, which reads everything from the config's local `data_dir`.
_URLs = {}


# TODO: The dataset class name usually matches the script name, in CamelCase instead of snake_case
class ImageCaptionDataset(datasets.GeneratorBasedBuilder):
    """Image-caption dataset builder backed by local JSONL files and JPEG images.

    For a configuration named ``<name>``, the config's ``data_dir`` is expected
    to contain::

        <name>_jsonls/<split>/   files whose name ends with "jsonl", one JSON object per line
        <name>_images/<split>/   one ``.jpg`` per image, named after the zero-padded image id

    where ``<split>`` is ``train``, ``valid`` or ``test``. Every JSON record must
    provide ``image_id``, ``id``, ``caption`` and one key per language listed in
    the config's ``langs``; ``image_url`` is optional. Records whose image file
    does not exist on disk are skipped.
    """

    VERSION = datasets.Version("0.0.0")

    BUILDER_CONFIG_CLASS = ImageCaptionBuilderConfig
    BUILDER_CONFIGS = [
        ImageCaptionBuilderConfig(name='coco_2017', splits=['train', 'valid'], prefix_before_image_fn=False, zfill=12, langs=['en', 'fr']),
        ImageCaptionBuilderConfig(name='cc3m', splits=['train', 'valid'], prefix_before_image_fn=True, zfill=8, langs=['en', 'fr']),
        ImageCaptionBuilderConfig(name='cc12m', splits=['train', 'valid'], prefix_before_image_fn=True, zfill=8, langs=['en', 'fr'])
    ]
    DEFAULT_CONFIG_NAME = "coco_2017"

    # Maps every accepted split name in a config to the `datasets` split it
    # produces and the sub-directory name used on disk for that split.
    _SPLIT_TABLE = {
        'train': (datasets.Split.TRAIN, 'train'),
        'val': (datasets.Split.VALIDATION, 'valid'),
        'valid': (datasets.Split.VALIDATION, 'valid'),
        'validation': (datasets.Split.VALIDATION, 'valid'),
        'dev': (datasets.Split.VALIDATION, 'valid'),
        'test': (datasets.Split.TEST, 'test'),
    }

    def _info(self):
        """Return the ``datasets.DatasetInfo`` (features and metadata) for the selected config."""
        feature_dict = {
            "image_id": datasets.Value("int64"),
            "id": datasets.Value("int64"),
            "caption": datasets.Value("string"),
        }
        # One string column per configured language.
        for lang in self.config.langs:
            feature_dict[lang] = datasets.Value("string")
        feature_dict["image_url"] = datasets.Value("string")
        feature_dict["image_file"] = datasets.Value("string")

        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # The columns depend on the config because of the per-language fields.
            features=datasets.Features(feature_dict),
            # No canonical (input, target) pair is declared.
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return one ``SplitGenerator`` per recognised split of the selected config.

        Nothing is downloaded: all paths are derived from ``self.config.data_dir``.
        Split names that are not recognised are silently ignored.

        Raises:
            ValueError: if the config has no ``data_dir``.
        """
        data_dir = self.config.data_dir
        if data_dir is None:
            # Without this check os.path.join(None, ...) fails with an opaque TypeError.
            raise ValueError(
                f"Config '{self.config.name}' reads local files: please provide `data_dir`."
            )

        jsonl_root = os.path.join(data_dir, f'{self.config.name}_jsonls')
        image_root = os.path.join(data_dir, f'{self.config.name}_images')

        generators = []
        for split in self.config.splits:
            entry = self._SPLIT_TABLE.get(split)
            if entry is None:
                continue
            split_name, subdir = entry
            generators.append(
                datasets.SplitGenerator(
                    name=split_name,
                    # These kwargs will be passed to _generate_examples
                    gen_kwargs={
                        "jsonl_dir": os.path.join(jsonl_root, subdir),
                        "image_dir": os.path.join(image_root, subdir),
                        "split": subdir,
                    },
                )
            )

        return generators

    def _generate_examples(
        # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
        self, jsonl_dir, image_dir, split
    ):
        """Yield ``(key, example)`` tuples for one split.

        Args:
            jsonl_dir: Directory holding the annotation files of the split.
            image_dir: Directory holding the images of the split.
            split: Split name used in prefixed image file names; ``'dev'`` is
                treated as ``'valid'``.

        Keys are integers that keep increasing across all annotation files, so
        they stay unique when a split is made of several files. Blank lines are
        skipped, as are records whose image file is missing from ``image_dir``.
        """
        if split == 'dev':
            split = 'valid'

        # Sort so that the example order does not depend on the arbitrary
        # ordering returned by os.listdir.
        jsonl_files = sorted(
            os.path.join(jsonl_dir, fn)
            for fn in os.listdir(jsonl_dir)
            if fn.endswith("jsonl") and os.path.isfile(os.path.join(jsonl_dir, fn))
        )

        next_key = 0
        for jsonl_file in jsonl_files:

            with open(jsonl_file, 'r', encoding='UTF-8') as fp:

                for line in fp:
                    # Every line consumes a key, which keeps keys equal to the
                    # line index for a single file and unique across files.
                    key = next_key
                    next_key += 1

                    if not line.strip():
                        # Tolerate blank / trailing empty lines.
                        continue

                    ex = json.loads(line)

                    example = {
                        "image_id": ex['image_id'],
                        "id": ex["id"],
                        "caption": ex["caption"],
                    }

                    for lang in self.config.langs:
                        example[lang] = ex[lang]

                    example['image_url'] = ex.get('image_url', '')

                    fn = f'{str(ex["image_id"]).zfill(self.config.zfill)}.jpg'
                    if self.config.prefix_before_image_fn:
                        fn = f'{self.config.name}_{split}_' + fn

                    image_file = os.path.join(image_dir, fn)
                    example['image_file'] = image_file

                    # Only keep records whose image is actually available locally.
                    if not os.path.isfile(image_file):
                        continue

                    yield key, example