diff --git a/roboflow/__init__.py b/roboflow/__init__.py
index a74398da..bde69972 100644
--- a/roboflow/__init__.py
+++ b/roboflow/__init__.py
@@ -15,7 +15,7 @@
 from roboflow.models import CLIPModel, GazeModel  # noqa: F401
 from roboflow.util.general import write_line
 
-__version__ = "1.2.10"
+__version__ = "1.2.11"
 
 
 def check_key(api_key, model, notebook, num_retries=0):
diff --git a/roboflow/util/folderparser.py b/roboflow/util/folderparser.py
index 7cc336bf..047cdaee 100644
--- a/roboflow/util/folderparser.py
+++ b/roboflow/util/folderparser.py
@@ -111,45 +111,123 @@ def _map_annotations_to_images_1to1(images, annotations):
 
 
 def _map_annotations_to_images_1tomany(images, annotationFiles):
-    annotationsByDirname = _list_map(annotationFiles, "dirname")
+    image_path_to_annotation_files = _build_image_to_annotationfile_index(annotationFiles)
     imgRefMap, annotationMap = _build_image_and_annotation_maps(annotationFiles)
 
     for image in tqdm(images):
-        dirname = image["dirname"]
-        annotationsInSameDir = annotationsByDirname.get(dirname, [])
-        if annotationsInSameDir:
-            for annotationFile in annotationsInSameDir:
-                format = annotationFile["parsedType"]
-                filtered_annotations = _filterIndividualAnnotations(
-                    image, annotationFile, format, imgRefMap, annotationMap
-                )
-                if filtered_annotations:
-                    image["annotationfile"] = filtered_annotations
-                    break
+        # Get candidate annotation files for this image
+        rel_path = image["file"].lstrip("/")
+        candidate_annotations = (
+            image_path_to_annotation_files.get(rel_path, [])
+            or image_path_to_annotation_files.get(image["name"], [])
+            or image_path_to_annotation_files.get(image["key"], [])
+            or annotationFiles  # Fallback to all files for non-COCO formats
+        )
+
+        for annotationFile in candidate_annotations:
+            format = annotationFile["parsedType"]
+            filtered_annotations = _filterIndividualAnnotations(image, annotationFile, format, imgRefMap, annotationMap)
+            if filtered_annotations:
+                image["annotationfile"] = filtered_annotations
+                break
+
+
+def _build_image_to_annotationfile_index(annotationFiles):
+    """Create an index mapping possible image path keys to annotation files that reference them.
+
+    Keys include full relative path, basename, and stem to improve robustness across
+    different dataset layouts. Supports coco, createml, csv, multilabel_csv, jsonl.
+    """
+    index = defaultdict(list)
+    for annotationFile in annotationFiles:
+        parsedType = annotationFile.get("parsedType")
+        parsed = annotationFile.get("parsed")
+        if not parsedType or parsed is None:
+            continue
+
+        if parsedType == "coco":
+            for imageRef in parsed.get("images", []):
+                file_name = _patch_sep(imageRef.get("file_name", "")).lstrip("/")
+                if not file_name:
+                    continue
+                basename = os.path.basename(file_name)
+                stem = os.path.splitext(basename)[0]
+                index[file_name].append(annotationFile)
+                index[basename].append(annotationFile)
+                index[stem].append(annotationFile)
+
+        elif parsedType == "createml":
+            for entry in parsed:
+                image_name = entry.get("image")
+                if not image_name:
+                    continue
+                index[image_name].append(annotationFile)
+
+        elif parsedType == "csv":
+            for ld in parsed.get("lines", []):
+                image_name = ld.get("file_name")
+                if not image_name:
+                    continue
+                index[image_name].append(annotationFile)
+
+        elif parsedType == "multilabel_csv":
+            for row in parsed.get("rows", []):
+                image_name = row.get("file_name")
+                if not image_name:
+                    continue
+                index[image_name].append(annotationFile)
+
+        elif parsedType == "jsonl":
+            for entry in parsed:
+                image_name = entry.get("image")
+                if not image_name:
+                    continue
+                index[image_name].append(annotationFile)
+
+    return index
 
 
 def _build_image_and_annotation_maps(annotationFiles):
     imgRefMap = {}
     annotationMap = defaultdict(list)
     for annFile in annotationFiles:
-        filename, dirname, parsed, parsedType = (
+        filename, parsed, parsedType = (
             annFile["file"],
-            annFile["dirname"],
             annFile["parsed"],
             annFile["parsedType"],
         )
         if parsedType == "coco":
             for imageRef in parsed["images"]:
-                imgRefMap[f"{filename}/{imageRef['file_name']}"] = imageRef
+                # Normalize and index by multiple forms to improve matching robustness
+                file_name = _patch_sep(imageRef["file_name"]).lstrip("/")
+                basename = os.path.basename(file_name)
+                stem = os.path.splitext(basename)[0]
+
+                # Prefer full relative path, but also allow basename and stem
+                imgRefMap.update(
+                    {
+                        f"{filename}/{file_name}": imageRef,
+                        f"{filename}/{basename}": imageRef,
+                        f"{filename}/{stem}": imageRef,
+                    }
+                )
             for annotation in parsed["annotations"]:
-                annotationMap[f"{dirname}/{annotation['image_id']}"].append(annotation)
+                annotationMap[f"{filename}/{annotation['image_id']}"].append(annotation)
     return imgRefMap, annotationMap
 
 
 def _filterIndividualAnnotations(image, annotation, format, imgRefMap, annotationMap):
     parsed = annotation["parsed"]
     if format == "coco":
-        imgReference = imgRefMap.get(f"{annotation['file']}/{image['name']}")
+        rel_path = image["file"].lstrip("/")
+        imgReference = (
+            # Try matching by full relative path first
+            imgRefMap.get(f"{annotation['file']}/{rel_path}")
+            # Fallback: basename with extension
+            or imgRefMap.get(f"{annotation['file']}/{image['name']}")
+            # Fallback: stem (no extension)
+            or imgRefMap.get(f"{annotation['file']}/{image['key']}")
+        )
         if imgReference:
             # workaround to make Annotations.js correctly identify this as coco in the backend
             fake_annotation = {
@@ -161,7 +239,7 @@ def _filterIndividualAnnotations(image, annotation, format, imgRefMap, annotatio
                 "iscrowd": 0,
             }
             _annotation = {"name": "annotation.coco.json"}
-            annotations_for_image = annotationMap.get(f"{image['dirname']}/{imgReference['id']}", [])
+            annotations_for_image = annotationMap.get(f"{annotation['file']}/{imgReference['id']}", [])
             _annotation["rawText"] = json.dumps(
                 {
                     "info": parsed["info"],
@@ -314,13 +392,6 @@ def _decide_split(images):
             i["split"] = "train"
 
 
-def _list_map(my_list, key):
-    d = {}
-    for i in my_list:
-        d.setdefault(i[key], []).append(i)
-    return d
-
-
 def _infer_classification_labels_from_folders(images):
     for image in images:
         if image.get("annotationfile"):
diff --git a/tests/util/test_folderparser.py b/tests/util/test_folderparser.py
index aec5ea44..4f9ddb5b 100644
--- a/tests/util/test_folderparser.py
+++ b/tests/util/test_folderparser.py
@@ -1,4 +1,6 @@
 import json
+import os
+import tempfile
 import unittest
 from os.path import abspath, dirname
 
@@ -95,6 +97,114 @@ def test_parse_multilabel_classification_csv(self):
         self.assertEqual(img1["annotationfile"]["type"], "classification_multilabel")
         self.assertEqual(set(img1["annotationfile"]["labels"]), {"Blackheads"})
 
+    def test_coco_with_subdir_file_name_should_match_annotations(self):
+        # COCO file_name includes a subdirectory, but the actual image is at dataset root.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Create nested image path: /2/100002/img.jpeg
+            image_name = "example_2_100002_02f2f7c6e15f09b401575ae6.jpeg"
+            image_relpath = os.path.join("2", "100002", image_name)
+            image_path = os.path.join(tmpdir, image_name)
+            # Create an empty image file (content not used by parser)
+            open(image_path, "wb").close()
+
+            # Create COCO annotation JSON at dataset root, referencing the image with subdir in file_name
+            coco = {
+                "info": {},
+                "licenses": [],
+                "categories": [{"id": 1, "name": "thing"}],
+                "images": [
+                    {
+                        "id": 10000000,
+                        "file_name": image_relpath.replace(os.sep, "/"),
+                        "width": 800,
+                        "height": 533,
+                    }
+                ],
+                "annotations": [
+                    {
+                        "id": 1,
+                        "image_id": 10000000,
+                        "category_id": 1,
+                        "bbox": [10, 10, 100, 50],
+                        "area": 5000,
+                        "segmentation": [],
+                        "iscrowd": 0,
+                    }
+                ],
+            }
+            coco_path = os.path.join(tmpdir, "_annotations.coco.json")
+            with open(coco_path, "w") as f:
+                json.dump(coco, f)
+
+            parsed = folderparser.parsefolder(tmpdir)
+            # Image entries store file with a leading slash relative to root
+            expected_file_key = f"/{image_name}"
+            img_entries = [i for i in parsed["images"] if i["file"] == expected_file_key]
+            self.assertTrue(len(img_entries) == 1)
+            img_entry = img_entries[0]
+
+            # Expect annotationfile to be populated, but this currently fails due to basename-only matching
+            self.assertIsNotNone(img_entry.get("annotationfile"))
+
+    def test_coco_root_annotation_matches_images_in_subdirs(self):
+        """Test that COCO annotation at root can match images in subdirectories.
+
+        This tests the fix for the bug where annotation file dirname (/) didn't match
+        image dirname (/1/100001), causing annotations to not be found.
+        """
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Create image in subdirectory
+            subdir = os.path.join(tmpdir, "1", "100001")
+            os.makedirs(subdir, exist_ok=True)
+            image_name = "image.jpeg"
+            image_path = os.path.join(subdir, image_name)
+            open(image_path, "wb").close()
+
+            # Create COCO annotation at root referencing image with subdirectory path
+            coco = {
+                "info": {},
+                "licenses": [],
+                "categories": [{"id": 1, "name": "object"}],
+                "images": [
+                    {
+                        "id": 10000000,
+                        "file_name": "1/100001/image.jpeg",
+                        "width": 800,
+                        "height": 600,
+                    }
+                ],
+                "annotations": [
+                    {
+                        "id": 1,
+                        "image_id": 10000000,
+                        "category_id": 1,
+                        "bbox": [10, 20, 100, 200],
+                        "area": 20000,
+                        "segmentation": [[10, 20, 110, 20, 110, 220, 10, 220]],
+                        "iscrowd": 0,
+                    }
+                ],
+            }
+            coco_path = os.path.join(tmpdir, "_annotations.coco.json")
+            with open(coco_path, "w") as f:
+                json.dump(coco, f)
+
+            parsed = folderparser.parsefolder(tmpdir)
+
+            # Find the image
+            img_entries = [i for i in parsed["images"] if image_name in i["file"]]
+            self.assertEqual(len(img_entries), 1, "Should find exactly one image")
+            img_entry = img_entries[0]
+
+            # Verify annotation was matched
+            self.assertIsNotNone(img_entry.get("annotationfile"), "Image should have annotation")
+
+            # Verify annotation content
+            ann_data = json.loads(img_entry["annotationfile"]["rawText"])
+            self.assertEqual(len(ann_data["images"]), 1, "Should have one image reference")
+            self.assertEqual(len(ann_data["annotations"]), 1, "Should have one annotation")
+            self.assertEqual(ann_data["annotations"][0]["bbox"], [10, 20, 100, 200])
+
 
 def _assertJsonMatchesFile(actual, filename):
     with open(filename) as file: