diff --git a/dataset_zoo/mlt/metafile.yml b/dataset_zoo/mlt/metafile.yml
new file mode 100644
index 000000000..f25256b3c
--- /dev/null
+++ b/dataset_zoo/mlt/metafile.yml
@@ -0,0 +1,39 @@
+Name: 'MLT 2017 (ICDAR 2017)'
+Paper:
+  Title: ICDAR2017 Robust Reading Challenge on Multi-Lingual Scene Text Detection and Script Identification - RRC-MLT
+  URL: https://ieeexplore.ieee.org/document/8270168
+  Venue: ICDAR
+  Year: '2017'
+  BibTeX: '@INPROCEEDINGS{8270168,
+    author={Nayef, Nibal and Yin, Fei and Bizid, Imen and Choi, Hyunsoo and Feng, Yuan and Karatzas, Dimosthenis and Luo, Zhenbo and Pal, Umapada and Rigaud, Christophe and Chazalon, Joseph and Khlif, Wafa and Luqman, Muhammad Muzzamil and Burie, Jean-Christophe and Liu, Cheng-lin and Ogier, Jean-Marc},
+    booktitle={2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR)},
+    title={ICDAR2017 Robust Reading Challenge on Multi-Lingual Scene Text Detection and Script Identification - RRC-MLT},
+    year={2017},
+    volume={01},
+    number={},
+    pages={1454-1459},
+    doi={10.1109/ICDAR.2017.237}}'
+Data:
+  Website: https://rrc.cvc.uab.es/?ch=8
+  Language:
+    - Arabic
+    - English
+    - Chinese
+    - Japanese
+    - Korean
+    - Italian
+    - German
+    - Indian
+    - French
+  Scene:
+    - Natural Scene
+  Granularity:
+    - Word
+  Tasks:
+    - textdet
+    - textrecog
+    - textspotting
+  License:
+    Type: CC BY 4.0
+    Link: https://creativecommons.org/licenses/by/4.0/
+  Format: .txt
diff --git a/dataset_zoo/mlt/sample_anno.md b/dataset_zoo/mlt/sample_anno.md
new file mode 100644
index 000000000..93caf33d7
--- /dev/null
+++ b/dataset_zoo/mlt/sample_anno.md
@@ -0,0 +1,20 @@
+**Text Detection, Text Spotting**
+
+```text
+# x1,y1,x2,y2,x3,y3,x4,y4,script,text
+# Valid scripts are: "Arabic", "Latin", "Chinese", "Japanese", "Korean", "Bangla", "Symbols", "Mixed", "None"
+
+131,34,181,34,179,47,131,49,Latin,Castle
+150,59,194,58,196,72,150,73,Arabic,متحف
+90,83,143,83,143,96,91,96,Latin,Heritage
+146,81,200,80,201,93,147,94,Latin,Museum
+```
+
+**Text Recognition**
+
+```text
+# img_name,script,text
+
+word_4.png,Arabic,المكرمة
+word_5.png,Latin,MAKKA
+```
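The detection ground truth shown above is a plain comma-separated format: eight polygon coordinates, a script label, and the transcription. As a rough standalone sketch of that layout (this is not MMOCR's parser, which is configured further below through its `format` argument):

```python
# Rough sketch: split one MLT detection annotation line into its fields.
# MMOCR's ICDARTxtTextDetAnnParser does the real parsing; this only
# illustrates the layout documented in sample_anno.md.
line = '131,34,181,34,179,47,131,49,Latin,Castle'
fields = line.split(',')
polygon = [int(v) for v in fields[:8]]  # x1,y1,x2,y2,x3,y3,x4,y4
script = fields[8]                      # e.g. 'Latin', 'Arabic'
text = ','.join(fields[9:])             # transcription may itself contain commas
print(polygon, script, text)
```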
diff --git a/dataset_zoo/mlt/textdet.py b/dataset_zoo/mlt/textdet.py
new file mode 100644
index 000000000..a436ae010
--- /dev/null
+++ b/dataset_zoo/mlt/textdet.py
@@ -0,0 +1,114 @@
+data_root = 'data/mlt'
+cache_path = 'data/cache'
+# yapf: disable
+train_preparer = dict(
+    obtainer=dict(
+        type='NaiveDataObtainer',
+        cache_path=cache_path,
+        files=[
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_1.zip',  # noqa: E501
+                save_name='mlt_1.zip',
+                md5='7b26e10d949c00fb4411f40b4f1fce6e',
+                content=['image'],
+                mapping=[['mlt_1/*', 'textdet_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_2.zip',  # noqa: E501
+                save_name='mlt_2.zip',
+                md5='e992fb5a7621dd6329081a73e52a28e1',
+                content=['image'],
+                mapping=[['mlt_2/*', 'textdet_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_3.zip',  # noqa: E501
+                save_name='mlt_3.zip',
+                md5='044ea5fb1dcec8bbb874391c517b55ff',
+                content=['image'],
+                mapping=[['mlt_3/*', 'textdet_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_4.zip',  # noqa: E501
+                save_name='mlt_4.zip',
+                md5='344a657c1cc7cbb150547f1c76b5cc8e',
+                content=['image'],
+                mapping=[['mlt_4/*', 'textdet_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_5.zip',  # noqa: E501
+                save_name='mlt_5.zip',
+                md5='5c7ac0158e7189c0a634eaf7bdededc5',
+                content=['image'],
+                mapping=[['mlt_5/*', 'textdet_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_6.zip',  # noqa: E501
+                save_name='mlt_6.zip',
+                md5='3b479255a96d255680f51005b5232bac',
+                content=['image'],
+                mapping=[['mlt_6/*', 'textdet_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_7.zip',  # noqa
+                save_name='mlt_7.zip',
+                md5='faa033fb9d2922d747bad9b0692c992e',
+                content=['image'],
+                mapping=[['mlt_7/*', 'textdet_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/ch8_training_images_8.zip',  # noqa
+                save_name='mlt_8.zip',
+                md5='db8afa59ae520757151f6ce5acd489ef',
+                content=['image'],
+                mapping=[['mlt_8/*', 'textdet_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/'
+                'ch8_training_localization_transcription_gt_v2.zip',
+                save_name='mlt_train_gt.zip',
+                md5='2c9c3de30b5615f6846738bbd336c988',
+                content=['annotation'],
+                mapping=[['mlt_train_gt/', 'annotations/train']]),
+        ]),
+    gatherer=dict(
+        type='PairGatherer',
+        img_suffixes=['.jpg', '.JPG'],
+        rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']),
+    parser=dict(
+        type='ICDARTxtTextDetAnnParser',
+        encoding='utf-8-sig',
+        format='x1,y1,x2,y2,x3,y3,x4,y4,lang,trans'),
+    packer=dict(type='TextDetPacker'),
+    dumper=dict(type='JsonDumper'),
+)  # noqa
+
+val_preparer = dict(
+    obtainer=dict(
+        type='NaiveDataObtainer',
+        cache_path=cache_path,
+        files=[
+            dict(
+                url='https://rrc.cvc.uab.es/downloads/ch8_validation_images.zip',  # noqa
+                save_name='mlt_val_img.zip',
+                md5='3cfc7b440ab81b89a981d707786dbe83',
+                content=['image'],
+                mapping=[['mlt_val_img', 'textdet_imgs/val']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/'
+                'ch8_validation_localization_transcription_gt_v2.zip',
+                save_name='mlt_val_gt.zip',
+                md5='ecae7d433e6f103bb31e00d37254009c',
+                content=['annotation'],
+                mapping=[['mlt_val_gt/', 'annotations/val']]),
+        ]),
+    gatherer=dict(
+        type='PairGatherer',
+        img_suffixes=['.jpg', '.JPG'],
+        rule=[r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt']),
+    parser=dict(
+        type='ICDARTxtTextDetAnnParser',
+        encoding='utf-8-sig',
+        format='x1,y1,x2,y2,x3,y3,x4,y4,lang,trans'),
+    packer=dict(type='TextDetPacker'),
+    dumper=dict(type='JsonDumper'),
+)
+
+config_generator = dict(
+    type='TextDetConfigGenerator',
+    val_anns=[dict(ann_file='textdet_val.json', dataset_postfix='')],
+    test_anns=None)
+
+delete = [f'mlt_{i}' for i in range(1, 9)
+          ] + ['annotations', 'mlt_val_gt', 'mlt_train_gt']
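For reference, the `rule` passed to `PairGatherer` above pairs every image with its annotation file by regular-expression substitution. A standalone sketch of what that pairing yields (this mimics the rule, it is not the gatherer itself):

```python
import re

# Mimic the PairGatherer rule above: the image-name pattern is substituted
# into the annotation-name template, so img_X.jpg maps to gt_img_X.txt.
img_rule, ann_rule = r'img_(\d+)\.([jJ][pP][gG])', r'gt_img_\1.txt'
for img_name in ['img_1.jpg', 'img_107.JPG']:
    ann_name = re.sub(img_rule, ann_rule, img_name)
    print(f'{img_name} -> {ann_name}')  # img_1.jpg -> gt_img_1.txt, ...
```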
diff --git a/dataset_zoo/mlt/textrecog.py b/dataset_zoo/mlt/textrecog.py
new file mode 100644
index 000000000..8908938be
--- /dev/null
+++ b/dataset_zoo/mlt/textrecog.py
@@ -0,0 +1,82 @@
+data_root = 'data/mlt'
+cache_path = 'data/cache'
+train_preparer = dict(
+    obtainer=dict(
+        type='NaiveDataObtainer',
+        cache_path=cache_path,
+        files=[
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/'
+                'ch8_training_word_images_gt_part_1.zip',
+                save_name='mlt_rec_1.zip',
+                md5='714d899cf5c8cf23b73bc14cfb628a3a',
+                content=['image'],
+                mapping=[['mlt_rec_1/*', 'textrecog_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/'
+                'ch8_training_word_images_gt_part_2.zip',
+                save_name='mlt_rec_2.zip',
+                md5='d0e5bc4736626853203d24c70bbf56d1',
+                content=['image'],
+                mapping=[['mlt_rec_2/*', 'textrecog_imgs/train']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/'
+                'ch8_training_word_images_gt_part_3.zip',
+                save_name='mlt_rec_3.zip',
+                md5='ebc7f2c9e73c3d174437d43b03177c5c',
+                content=['image'],
+                mapping=[['mlt_rec_3/*', 'textrecog_imgs/train']]),
+            dict(
+                url='https://rrc.cvc.uab.es/downloads/'
+                'ch8_validation_word_images_gt.zip',
+                save_name='mlt_rec_train_gt.zip',
+                md5='e5e681b440a616f0ac8deaa669b3682d',
+                content=['annotation'],
+                mapping=[['mlt_rec_train_gt/', 'annotations/train']]),
+        ]),
+    gatherer=dict(type='MonoGatherer', ann_name='train/gt.txt'),
+    parser=dict(
+        type='ICDARTxtTextRecogAnnParser',
+        encoding='utf-8-sig',
+        format='img,lang,text'),
+    packer=dict(type='TextRecogPacker'),
+    dumper=dict(type='JsonDumper'),
+)
+
+val_preparer = dict(
+    obtainer=dict(
+        type='NaiveDataObtainer',
+        cache_path=cache_path,
+        files=[
+            dict(
+                url='https://rrc.cvc.uab.es/downloads/'
+                'ch8_validation_word_images_gt.zip',
+                save_name='mlt_rec_val.zip',
+                md5='954acd0325c442288fa4aff1009b6d79',
+                content=['image'],
+                mapping=[['mlt_rec_val/*', 'textrecog_imgs/val']]),
+            dict(
+                url='https://datasets.cvc.uab.es/rrc/'
+                'ch8_validation_word_gt_v2.zip',
+                save_name='mlt_rec_val_gt.zip',
+                md5='951c9cee78a0064b133ab59369a9b232',
+                content=['annotation'],
+                mapping=[['mlt_rec_val_gt/', 'annotations/val']]),
+        ]),
+    gatherer=dict(type='MonoGatherer', ann_name='val/gt.txt'),
+    parser=dict(
+        type='ICDARTxtTextRecogAnnParser',
+        encoding='utf-8-sig',
+        format='img,lang,text'),
+    packer=dict(type='TextRecogPacker'),
+    dumper=dict(type='JsonDumper'),
+)
+
+config_generator = dict(
+    type='TextRecogConfigGenerator',
+    val_anns=[dict(ann_file='textrecog_val.json', dataset_postfix='')],
+    test_anns=None)
+
+delete = [f'mlt_rec_{i}' for i in range(1, 4)] + [
+    'annotations', 'mlt_rec_val_gt', 'mlt_rec_train_gt', 'mlt_rec_val'
+]
diff --git a/dataset_zoo/mlt/textspotting.py b/dataset_zoo/mlt/textspotting.py
new file mode 100644
index 000000000..dd6c91126
--- /dev/null
+++ b/dataset_zoo/mlt/textspotting.py
@@ -0,0 +1,9 @@
+_base_ = ['textdet.py']
+
+_base_.train_preparer.gatherer.img_dir = 'textdet_imgs/train'
+_base_.train_preparer.packer.type = 'TextSpottingPacker'
+
+_base_.val_preparer.gatherer.img_dir = 'textdet_imgs/val'
+_base_.val_preparer.packer.type = 'TextSpottingPacker'
+
+config_generator = dict(type='TextSpottingConfigGenerator')
""" for src, dst in mapping: src = osp.join(self.data_root, src) dst = osp.join(self.data_root, dst) if '*' in src: + # dst must be a directory mkdir_or_exist(dst) for f in glob.glob(src): - if not osp.exists( - osp.join(dst, osp.relpath(f, self.data_root))): + tgt = osp.join(dst, osp.basename(osp.normpath(f))) + if not osp.exists(tgt): shutil.move(f, dst) - - elif osp.exists(src) and not osp.exists(dst): - mkdir_or_exist(osp.dirname(dst)) - shutil.move(src, dst) + else: + print(f'Skipping moving {f} to {dst} since' + f' {f} does not exist or {tgt} already exists') + # If no wildcard in src, dst must match the src type + # That is, we can only move a file to a file, or a dir to a dir + else: + if osp.exists(src) and not osp.exists(dst): + mkdir_or_exist(osp.dirname(dst)) + shutil.move(src, dst) + else: + print(f'Skipping moving {src} to {dst} since' + f' {src} does not exist or {dst} already exists') def clean(self) -> None: """Remove empty dirs.""" diff --git a/mmocr/datasets/preparers/parsers/icdar_txt_parser.py b/mmocr/datasets/preparers/parsers/icdar_txt_parser.py index e90d5d7b9..4fa1d9b9f 100644 --- a/mmocr/datasets/preparers/parsers/icdar_txt_parser.py +++ b/mmocr/datasets/preparers/parsers/icdar_txt_parser.py @@ -20,7 +20,8 @@ class ICDARTxtTextDetAnnParser(BaseParser): to ','. ignore (str): The text to be ignored. Defaults to '###'. format (str): The format of the annotation. Defaults to - 'x1,y1,x2,y2,x3,y3,x4,trans'. + 'x1,y1,x2,y2,x3,y3,x4,trans'. An additional keyword "lang" can be + recognized here to specify the language of the transcription. encoding (str): The encoding of the annotation file. Defaults to 'utf-8-sig'. nproc (int): The number of processes to parse the annotation. Defaults @@ -52,6 +53,8 @@ def parse_file(self, img_path: str, ann_path: str) -> Tuple: instances = list() for anno in self.loader(ann_path, self.sep, self.format, self.encoding): + if 'lang' in anno: + del anno['lang'] anno = list(anno.values()) if self.remove_strs is not None: for strs in self.remove_strs: @@ -82,6 +85,8 @@ class ICDARTxtTextRecogAnnParser(BaseParser): to ','. ignore (str): The text to be ignored. Defaults to '#'. format (str): The format of the annotation. Defaults to 'img, text'. + An additional keyword "lang" can be recognized here to specify the + language of the transcription. encoding (str): The encoding of the annotation file. Defaults to 'utf-8-sig'. nproc (int): The number of processes to parse the annotation. Defaults diff --git a/tests/test_datasets/test_preparers/test_obtainers/test_naive_data_obtainer.py b/tests/test_datasets/test_preparers/test_obtainers/test_naive_data_obtainer.py new file mode 100644 index 000000000..dd876e39f --- /dev/null +++ b/tests/test_datasets/test_preparers/test_obtainers/test_naive_data_obtainer.py @@ -0,0 +1,71 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
diff --git a/mmocr/datasets/preparers/parsers/icdar_txt_parser.py b/mmocr/datasets/preparers/parsers/icdar_txt_parser.py
index e90d5d7b9..4fa1d9b9f 100644
--- a/mmocr/datasets/preparers/parsers/icdar_txt_parser.py
+++ b/mmocr/datasets/preparers/parsers/icdar_txt_parser.py
@@ -20,7 +20,8 @@ class ICDARTxtTextDetAnnParser(BaseParser):
             to ','.
         ignore (str): The text to be ignored. Defaults to '###'.
         format (str): The format of the annotation. Defaults to
-            'x1,y1,x2,y2,x3,y3,x4,trans'.
+            'x1,y1,x2,y2,x3,y3,x4,y4,trans'. An additional keyword "lang" can
+            be recognized here to specify the language of the transcription.
         encoding (str): The encoding of the annotation file. Defaults to
             'utf-8-sig'.
         nproc (int): The number of processes to parse the annotation. Defaults
@@ -52,6 +53,8 @@ def parse_file(self, img_path: str, ann_path: str) -> Tuple:
         instances = list()
         for anno in self.loader(ann_path, self.sep, self.format,
                                 self.encoding):
+            if 'lang' in anno:
+                del anno['lang']
             anno = list(anno.values())
             if self.remove_strs is not None:
                 for strs in self.remove_strs:
@@ -82,6 +85,8 @@ class ICDARTxtTextRecogAnnParser(BaseParser):
             to ','.
         ignore (str): The text to be ignored. Defaults to '#'.
         format (str): The format of the annotation. Defaults to 'img, text'.
+            An additional keyword "lang" can be recognized here to specify the
+            language of the transcription.
         encoding (str): The encoding of the annotation file. Defaults to
             'utf-8-sig'.
         nproc (int): The number of processes to parse the annotation. Defaults
diff --git a/tests/test_datasets/test_preparers/test_obtainers/test_naive_data_obtainer.py b/tests/test_datasets/test_preparers/test_obtainers/test_naive_data_obtainer.py
new file mode 100644
index 000000000..dd876e39f
--- /dev/null
+++ b/tests/test_datasets/test_preparers/test_obtainers/test_naive_data_obtainer.py
@@ -0,0 +1,71 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+import tempfile
+import unittest
+
+from mmocr.datasets.preparers.obtainers import NaiveDataObtainer
+
+
+class TestNaiveDataObtainer(unittest.TestCase):
+
+    def setUp(self) -> None:
+        """Create temporary directories and files for testing."""
+        self.tmp_dir = tempfile.TemporaryDirectory()
+        self.cache_path = osp.join(self.tmp_dir.name, 'cache')
+        self.data_root = osp.join(self.tmp_dir.name, 'data')
+        self.obtainer = NaiveDataObtainer([], self.cache_path, self.data_root,
+                                          'test')
+
+    def tearDown(self) -> None:
+        """Delete temporary directories and files used for testing."""
+        self.tmp_dir.cleanup()
+
+    def test_move(self):
+        # create tmp files
+        test_src = os.path.join(self.data_root, 'src')
+        test_dst = os.path.join(self.data_root, 'dst')
+        os.makedirs(test_src, exist_ok=True)
+        os.makedirs(test_dst, exist_ok=True)
+        # Create some test files/folders in src directory
+        for i in range(3):
+            with open(os.path.join(test_src, f'file{i}.txt'), 'w') as f:
+                f.write('hello world\n')
+            os.mkdir(os.path.join(test_src, f'dir{i}'))
+
+        # Test moving file/dir
+        mapping = [
+            ('src/file0.txt', 'dst/file0_new.txt'),  # dst/file0_new.txt
+            ('src/file1.txt', 'dst/abc/abc.txt'),  # dst/abc/abc.txt
+            ('src/file2.txt', 'dst/'),  # Not allowed
+            ('src/dir0/', 'dst/dir0'),  # dst/dir0
+            ('src/dir1', 'dst/abc/d2/'),  # dst/abc/d2
+            ('src/dir2', 'dst/'),  # not allowed
+        ]
+        self.obtainer.move(mapping)
+
+        mapping[2] = ['src/file2.txt', 'dst/file2.txt']
+        mapping[5] = ['src/dir2', 'dst/dir2']
+        mapping = [[osp.join(self.data_root, a),
+                    osp.join(self.data_root, b)] for a, b in mapping]
+        mapping[2] = mapping[2][::-1]
+        mapping[5] = mapping[5][::-1]
+        for a, b in mapping:
+            self.assertFalse(os.path.exists(a))
+            self.assertTrue(os.path.exists(b))
+
+        # Test moving paths with wildcard
+        mapping = [
+            ('src/*.txt', 'dst/test2'),  # dst/test2/file2.txt
+            ('src/*', 'dst/test2/file2.txt'),  # not allowed (file2.txt exists)
+            ('src/*', 'dst/test2'),  # dst/test2/dir2
+        ]
+        self.obtainer.move(mapping)
+
+        mapping = [[osp.join(self.data_root, a),
+                    osp.join(self.data_root, b)]
+                   for a, b in [('src/file2.txt', 'dst/test2/file2.txt'),
+                                ('src/dir2', 'dst/test2/dir2')]]
+        for a, b in mapping:
+            self.assertFalse(os.path.exists(a))
+            self.assertTrue(os.path.exists(b))
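Finally, stepping back to the parser change above: for the MLT format strings used in these configs, the loader yields one dict per annotation line, and the new branch simply drops the `lang` field before the remaining values are packed. A rough standalone sketch (the dict below only imitates loader output; it is not the parser API):

```python
# Rough sketch of the 'lang' handling added to ICDARTxtTextDetAnnParser.
# The dict imitates what the loader could yield for the MLT format
# 'x1,y1,x2,y2,x3,y3,x4,y4,lang,trans'; only the deletion step mirrors the diff.
anno = {
    'x1': '131', 'y1': '34', 'x2': '181', 'y2': '34',
    'x3': '179', 'y3': '47', 'x4': '131', 'y4': '49',
    'lang': 'Latin', 'trans': 'Castle',
}
if 'lang' in anno:
    del anno['lang']
values = list(anno.values())  # eight coordinates followed by the transcription
print(values)
```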