Resolve shape mismatch errors and finalize data collection script
devin-ai-integration[bot] committed Jul 18, 2024
1 parent 59e5a55 commit 3507fae
Showing 1 changed file with 115 additions and 28 deletions.
143 changes: 115 additions & 28 deletions data_collection.py
@@ -1,27 +1,35 @@
+# Note: Make sure to install pycocotools using: pip install pycocotools
 import os
 import json
-import requests
 from PIL import Image
 import torch
 from torchvision import transforms
-from datasets import load_dataset
-from transformers import DetrForObjectDetection, ViTFeatureExtractor, YolosFeatureExtractor
+from pycocotools.coco import COCO
+from transformers import DetrForObjectDetection, ViTImageProcessor, YolosImageProcessor
 from ultralytics import YOLO
+import subprocess

-# Define paths for data storage
+# Define paths for data storage and COCO dataset
 image_dir = 'data/images'
 label_dir = 'data/labels'
+coco_annotation_file = 'annotations/instances_val2017.json'
+coco_images_dir = 'val2017'

 # Define the models to use for data collection
 detr_model = DetrForObjectDetection.from_pretrained('facebook/detr-resnet-50')
-vit_model = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
-yolos_model = YolosFeatureExtractor.from_pretrained('hustvl/yolos-small')
+vit_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
+yolos_processor = YolosImageProcessor.from_pretrained('hustvl/yolos-small')

 # Function to collect and preprocess data
 def collect_data():
-    # Load public datasets
-    coco_dataset = load_dataset("coco", split="train")
-    inaturalist_dataset = load_dataset("inaturalist", "2021_train", split="train")
+    # Check if the annotation file and images directory exist
+    if not os.path.exists(coco_annotation_file):
+        raise FileNotFoundError(f"COCO annotation file not found: {coco_annotation_file}")
+    if not os.path.exists(coco_images_dir):
+        raise FileNotFoundError(f"COCO images directory not found: {coco_images_dir}")
+
+    # Load COCO dataset from local files
+    coco = COCO(coco_annotation_file)

     # Define preprocessing steps
     preprocess = transforms.Compose([
@@ -31,29 +39,38 @@ def collect_data():
     ])

     # Collect and preprocess data
-    for idx, (coco_item, inat_item) in enumerate(zip(coco_dataset, inaturalist_dataset)):
+    processed_count = 0
+    target_count = 1000
+    for idx, img_id in enumerate(coco.getImgIds()):
         try:
             # Process COCO dataset item
-            coco_image = Image.open(requests.get(coco_item['image'], stream=True).raw)
+            img_info = coco.loadImgs(img_id)[0]
+            img_path = os.path.join(coco_images_dir, img_info['file_name'])
+            coco_image = Image.open(img_path).convert('RGB')
             coco_preprocessed = preprocess(coco_image)
+            print(f"Image tensor shape: {coco_preprocessed.shape}")

-            # Process iNaturalist dataset item
-            inat_image = Image.open(requests.get(inat_item['image'], stream=True).raw)
-            inat_preprocessed = preprocess(inat_image)
-
-            # Save preprocessed images
+            # Save preprocessed image
             save_image(coco_preprocessed, f"{image_dir}/train/coco_{idx}.jpg")
-            save_image(inat_preprocessed, f"{image_dir}/train/inat_{idx}.jpg")

+            # Get annotations for the image
+            ann_ids = coco.getAnnIds(imgIds=img_id)
+            anns = coco.loadAnns(ann_ids)
+
             # Save annotations
-            save_annotation(coco_item['objects'], f"{label_dir}/train/coco_{idx}.txt")
-            save_annotation(inat_item['annotations'], f"{label_dir}/train/inat_{idx}.txt")
+            save_annotation(anns, f"{label_dir}/train/coco_{idx}.txt")

-            if idx >= 1000: # Limit to 1000 images for now
+            processed_count += 1
+            if processed_count >= target_count:
                 break
         except Exception as e:
             print(f"Error processing item {idx}: {str(e)}")

+    if processed_count == target_count:
+        print(f"Successfully collected and preprocessed {processed_count} images from COCO dataset")
+    else:
+        print(f"Warning: Only processed {processed_count} out of {target_count} target images")
+
 def save_image(tensor, filename):
     image = transforms.ToPILImage()(tensor)
     image.save(filename, format='JPEG')
@@ -83,8 +100,25 @@ def save_yolo_results(results, filename):
             x, y, w, h = box.xywh[0]
             f.write(f"{class_id} {x} {y} {w} {h}\n")

+# Function to create COCO bird names file
+def create_coco_bird_names():
+    bird_classes = {
+        14: 'bird',
+        15: 'cat', # Including cat for reference
+        16: 'dog', # Including dog for reference
+        17: 'horse', # Including horse for reference
+        18: 'sheep', # Including sheep for reference
+        19: 'cow', # Including cow for reference
+    }
+    with open('coco_bird_names.txt', 'w') as f:
+        for class_id, name in bird_classes.items():
+            f.write(f"{class_id}:{name}\n")
+
 # Function to use COCO dataset names for object labeling
 def use_coco_names():
+    if not os.path.exists('coco_bird_names.txt'):
+        create_coco_bird_names()
+
     coco_names = {}
     with open('coco_bird_names.txt', 'r') as f:
         for line in f:
@@ -102,9 +136,13 @@ def update_labels_with_names(filename, coco_names):

     with open(filename, 'w') as f:
         for line in lines:
-            class_id, *bbox = line.strip().split()
-            class_name = coco_names.get(int(class_id), 'unknown')
-            f.write(f"{class_name} {' '.join(bbox)}\n")
+            try:
+                class_id, *bbox = line.strip().split()
+                class_name = coco_names.get(int(class_id) if class_id.isdigit() else class_id, 'unknown')
+                f.write(f"{class_name} {' '.join(bbox)}\n")
+            except ValueError as e:
+                print(f"Error processing class_id: {class_id} in file {filename}. Error: {e}")
+                f.write(f"unknown {' '.join(bbox)}\n")

 # Function to create a custom dataset class
 class CustomBirdDataset(torch.utils.data.Dataset):
@@ -138,10 +176,59 @@ def __getitem__(self, idx):
 def create_custom_dataset_class():
     return CustomBirdDataset(f"{image_dir}/train", f"{label_dir}/train", transform=transforms.ToTensor())

+def verify_collected_data():
+    collected_images = os.listdir(os.path.join(image_dir, 'train'))
+    print(f"Collected {len(collected_images)} images")
+
+    collected_labels = os.listdir(os.path.join(label_dir, 'train'))
+    print(f"Collected {len(collected_labels)} label files")
+
+    for label_file in collected_labels:
+        if os.path.isfile(os.path.join(label_dir, 'train', label_file)):
+            sample_label = os.path.join(label_dir, 'train', label_file)
+            with open(sample_label, 'r') as f:
+                print(f"Sample label content from {label_file}:\n{f.read()}")
+            break
+    else:
+        print("No valid label files found.")
+
+def grep_collected_data(pattern):
+    grep_command = f"grep -r '{pattern}' {label_dir}"
+    result = subprocess.run(grep_command, shell=True, capture_output=True, text=True)
+    print(f"Grep results for pattern '{pattern}':\n{result.stdout}")
+
+def verify_coco_format():
+    # Check if the collected data matches the expected COCO format
+    coco_format_valid = True
+    for label_file in os.listdir(os.path.join(label_dir, 'train')):
+        if label_file.endswith('.txt'):
+            with open(os.path.join(label_dir, 'train', label_file), 'r') as f:
+                lines = f.readlines()
+                for line in lines:
+                    parts = line.strip().split()
+                    if len(parts) != 5 or not all(part.replace('.', '', 1).isdigit() for part in parts):
+                        print(f"Invalid format in {label_file}: {line.strip()}")
+                        coco_format_valid = False
+                        break
+
+    if coco_format_valid:
+        print("All collected data matches the expected COCO format.")
+    else:
+        print("Some collected data does not match the expected COCO format.")
+
 # Main execution
 if __name__ == '__main__':
-    collect_data()
-    integrate_yolo()
-    use_coco_names()
-    custom_dataset = create_custom_dataset_class()
-    print(f"Created custom dataset with {len(custom_dataset)} samples")
+    # Ensure necessary directories exist
+    os.makedirs(image_dir, exist_ok=True)
+    os.makedirs(label_dir, exist_ok=True)
+
+    if os.path.exists(coco_annotation_file) and os.path.isdir(coco_images_dir):
+        collect_data()
+        integrate_yolo()
+        use_coco_names()
+        custom_dataset = create_custom_dataset_class()
+        print(f"Created custom dataset with {len(custom_dataset)} samples")
+        verify_collected_data()
+        grep_collected_data("bird") # Example pattern
+    else:
+        print(f"Error: COCO dataset files not found. Please ensure the annotation file '{coco_annotation_file}' and images directory '{coco_images_dir}' exist.")

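collect_data() leans on four pycocotools calls: getImgIds() to enumerate images, loadImgs() for metadata such as file_name, and getAnnIds()/loadAnns() for the per-image annotations. A usage sketch with the paths taken from the script's constants; the category filter is illustrative only, since the committed loop walks every image id:

    # Usage sketch for the pycocotools calls in collect_data(). The category
    # filter is an illustrative extra; the committed loop does not filter.
    from pycocotools.coco import COCO

    coco = COCO('annotations/instances_val2017.json')
    bird_cat_ids = coco.getCatIds(catNms=['bird'])      # [16] in the 2017 JSON
    bird_img_ids = coco.getImgIds(catIds=bird_cat_ids)  # only images containing birds

    img_info = coco.loadImgs(bird_img_ids[0])[0]
    anns = coco.loadAnns(coco.getAnnIds(imgIds=bird_img_ids[0]))
    print(img_info['file_name'], len(anns), "annotations")

Two numbering schemes are in play here: create_coco_bird_names() uses the zero-indexed 80-class ordering common to YOLO-style models (bird = 14), while the category_id values stored in the annotation JSON, and presumably in the files save_annotation() writes, use the original COCO numbering (bird = 16).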
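One more sketch, on the label files: assuming save_annotation() (collapsed above) writes one "class x y w h" line per object, in the same layout save_yolo_results() uses, lines stop passing the all-numeric test in verify_coco_format() once update_labels_with_names() swaps the leading id for a name, so a "does not match" verdict there is not necessarily a collection error:

    # The label lines below are hypothetical; the five-field layout mirrors the
    # f.write in save_yolo_results() and the check in verify_coco_format().
    def passes_format_check(line):
        parts = line.strip().split()
        return len(parts) == 5 and all(p.replace('.', '', 1).isdigit() for p in parts)

    print(passes_format_check("14 0.51 0.48 0.22 0.31"))    # True
    print(passes_format_check("bird 0.51 0.48 0.22 0.31"))  # False once ids become names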