diff --git a/.gitignore b/.gitignore
index 56622160..4d81c8f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,10 +1,13 @@
 *.h5
 *.weights
+*.tar
+*.tfrecord
 /checkpoints/*
 /serving/*
 /logs/
 /Untitled.ipynb
 /output.jpg
+/data/voc2012_raw/
 
 # Created by https://www.gitignore.io/api/python
 # Edit at https://www.gitignore.io/?templates=python
diff --git a/README.md b/README.md
index 603c06a3..c08e6f97 100644
--- a/README.md
+++ b/README.md
@@ -89,11 +89,14 @@ python detect_video.py --video path_to_file.mp4 --output ./output.avi
 
 ### Training
 
-You need to generate tfrecord following the TensorFlow Object Detection API.
+I have created a complete tutorial on how to train from scratch using the VOC2012 dataset.
+See the documentation here: https://github.com/zzh8829/yolov3_tf2/blob/master/docs/training_voc.md
+
+For customized training, you need to generate a tfrecord following the TensorFlow Object Detection API.
 For example you can use [Microsoft VOTT](https://github.com/Microsoft/VoTT) to generate such dataset.
 You can also use this [script](https://github.com/tensorflow/models/blob/master/research/object_detection/dataset_tools/create_pascal_tf_record.py) to create the pascal voc dataset.
-
+Example command line arguments for training:
 ``` bash
 python train.py --batch_size 8 --dataset ~/Data/voc2012.tfrecord --val_dataset ~/Data/voc2012_val.tfrecord --epochs 100 --mode eager_tf --transfer fine_tune
diff --git a/conda-cpu.yml b/conda-cpu.yml
index 3c8ac5ce..fe1cd2a2 100644
--- a/conda-cpu.yml
+++ b/conda-cpu.yml
@@ -7,3 +7,6 @@ dependencies:
   - opencv
   - pip:
     - tensorflow==2.0.0
+    - lxml
+    - tqdm
+    - -e .
diff --git a/conda-gpu.yml b/conda-gpu.yml
index 3c627675..5c0875a2 100644
--- a/conda-gpu.yml
+++ b/conda-gpu.yml
@@ -6,6 +6,9 @@ dependencies:
   - matplotlib
   - opencv
   - cudnn
-  - cudatoolkit==10.0.130
+  - cudatoolkit==10.1.243
   - pip:
-    - tensorflow-gpu==2.0.0
+    - tensorflow-gpu==2.1.0rc1
+    - lxml
+    - tqdm
+    - -e .
diff --git a/data/voc2012.names b/data/voc2012.names
new file mode 100644
index 00000000..8420ab35
--- /dev/null
+++ b/data/voc2012.names
@@ -0,0 +1,20 @@
+aeroplane
+bicycle
+bird
+boat
+bottle
+bus
+car
+cat
+chair
+cow
+diningtable
+dog
+horse
+motorbike
+person
+pottedplant
+sheep
+sofa
+train
+tvmonitor
diff --git a/docs/training_voc.md b/docs/training_voc.md
new file mode 100644
index 00000000..59bd71fa
--- /dev/null
+++ b/docs/training_voc.md
@@ -0,0 +1,68 @@
+# Training Instructions
+
+## VOC 2012 Dataset from Scratch
+
+Full instructions on how to train using the VOC 2012 dataset from scratch.
+
+Requirements:
+ 1. Able to detect images using the pretrained darknet model
+ 2. Many gigabytes of disk space
+ 3. High-speed internet connection preferred
+ 4. GPU preferred
+
+
+### 1. Download Dataset
+
+You can read the full description of the dataset [here](http://host.robots.ox.ac.uk/pascal/VOC/)
+```bash
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar -O ./data/voc2012_raw.tar
+mkdir -p ./data/voc2012_raw
+tar -xf ./data/voc2012_raw.tar -C ./data/voc2012_raw
+ls ./data/voc2012_raw/VOCdevkit/VOC2012 # Explore the dataset
+```
+
+### 2. Transform Dataset
+
+```bash
+python tools/voc2012.py \
+  --data_dir './data/voc2012_raw/VOCdevkit/VOC2012' \
+  --split train \
+  --output_file ./data/voc2012_train.tfrecord
+
+python tools/voc2012.py \
+  --data_dir './data/voc2012_raw/VOCdevkit/VOC2012' \
+  --split val \
+  --output_file ./data/voc2012_val.tfrecord
+```
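+
+Optionally, you can sanity check the generated file before moving on. This is not part of the
+repo's tooling, just a minimal sketch that counts the serialized examples with plain TensorFlow:
+
+```python
+import tensorflow as tf
+
+# each element of a TFRecordDataset is one serialized tf.train.Example written by the tool above
+count = sum(1 for _ in tf.data.TFRecordDataset('./data/voc2012_train.tfrecord'))
+print(count)  # should match the number of image ids in ImageSets/Main/aeroplane_train.txt
+```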
+
+### 3. Training
+
+You can adjust the parameters based on your setup.
+
+```bash
+python train.py \
+  --dataset ./data/voc2012_train.tfrecord \
+  --val_dataset ./data/voc2012_val.tfrecord \
+  --classes ./data/voc2012.names \
+  --num_classes 20 \
+  --mode fit --transfer none \
+  --batch_size 16 \
+  --epochs 3 \
+  --weights ./checkpoints/yolov3_voc.tf
+```
+
+I have tested that this trains correctly, with the loss converging over time.
+Each epoch takes around 10 minutes on a single AWS p2.xlarge (Nvidia K80 GPU) instance.
+
+### 4. Inference
+
+```bash
+python detect.py \
+  --classes ./data/voc2012.names \
+  --num_classes 20 \
+  --weights ./checkpoints/yolov3_voc.tf
+```
+
+You should see the detected objects in the standard output and the visualization at `output.jpg`.
+This is just a proof of concept, so it won't be as good as the pretrained models.
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..0c08b66f
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,8 @@
+from setuptools import setup
+
+setup(name='yolov3_tf2',
+      version='0.1',
+      url='https://github.com/zzh8829/yolov3-tf2',
+      author='Zihao Zhang',
+      author_email='zzh8829@gmail.com',
+      packages=['yolov3_tf2'])
\ No newline at end of file
diff --git a/tools/voc2012.py b/tools/voc2012.py
new file mode 100644
index 00000000..a01b67e0
--- /dev/null
+++ b/tools/voc2012.py
@@ -0,0 +1,111 @@
+import time
+import os
+import hashlib
+
+from absl import app, flags, logging
+from absl.flags import FLAGS
+import tensorflow as tf
+import lxml.etree
+import tqdm
+
+flags.DEFINE_string('data_dir', './data/voc2012_raw/VOCdevkit/VOC2012/',
+                    'path to raw PASCAL VOC dataset')
+flags.DEFINE_enum('split', 'train', [
+                  'train', 'val'], 'specify train or val split')
+flags.DEFINE_string('output_file', './data/voc2012.tfrecord', 'output dataset')
+flags.DEFINE_string('classes', './data/voc2012.names', 'classes file')
+
+
+def build_example(annotation, class_map):
+    img_path = os.path.join(
+        FLAGS.data_dir, 'JPEGImages', annotation['filename'])
+    img_raw = open(img_path, 'rb').read()
+    key = hashlib.sha256(img_raw).hexdigest()
+
+    width = int(annotation['size']['width'])
+    height = int(annotation['size']['height'])
+
+    xmin = []
+    ymin = []
+    xmax = []
+    ymax = []
+    classes = []
+    classes_text = []
+    truncated = []
+    views = []
+    difficult_obj = []
+    if 'object' in annotation:
+        for obj in annotation['object']:
+            difficult = bool(int(obj['difficult']))
+            difficult_obj.append(int(difficult))
+
+            xmin.append(float(obj['bndbox']['xmin']) / width)
+            ymin.append(float(obj['bndbox']['ymin']) / height)
+            xmax.append(float(obj['bndbox']['xmax']) / width)
+            ymax.append(float(obj['bndbox']['ymax']) / height)
+            classes_text.append(obj['name'].encode('utf8'))
+            classes.append(class_map[obj['name']])
+            truncated.append(int(obj['truncated']))
+            views.append(obj['pose'].encode('utf8'))
+
+    example = tf.train.Example(features=tf.train.Features(feature={
+        'image/height': tf.train.Feature(int64_list=tf.train.Int64List(value=[height])),
+        'image/width': tf.train.Feature(int64_list=tf.train.Int64List(value=[width])),
+        'image/filename': tf.train.Feature(bytes_list=tf.train.BytesList(value=[
+            annotation['filename'].encode('utf8')])),
+        'image/source_id': tf.train.Feature(bytes_list=tf.train.BytesList(value=[
+            annotation['filename'].encode('utf8')])),
+        'image/key/sha256': tf.train.Feature(bytes_list=tf.train.BytesList(value=[key.encode('utf8')])),
+        'image/encoded': tf.train.Feature(bytes_list=tf.train.BytesList(value=[img_raw])),
+        'image/format': tf.train.Feature(bytes_list=tf.train.BytesList(value=['jpeg'.encode('utf8')])),
+        'image/object/bbox/xmin': tf.train.Feature(float_list=tf.train.FloatList(value=xmin)),
+        'image/object/bbox/xmax': tf.train.Feature(float_list=tf.train.FloatList(value=xmax)),
+        'image/object/bbox/ymin': tf.train.Feature(float_list=tf.train.FloatList(value=ymin)),
+        'image/object/bbox/ymax': tf.train.Feature(float_list=tf.train.FloatList(value=ymax)),
+        'image/object/class/text': tf.train.Feature(bytes_list=tf.train.BytesList(value=classes_text)),
+        'image/object/class/label': tf.train.Feature(int64_list=tf.train.Int64List(value=classes)),
+        'image/object/difficult': tf.train.Feature(int64_list=tf.train.Int64List(value=difficult_obj)),
+        'image/object/truncated': tf.train.Feature(int64_list=tf.train.Int64List(value=truncated)),
+        'image/object/view': tf.train.Feature(bytes_list=tf.train.BytesList(value=views)),
+    }))
+    return example
+
+
+def parse_xml(xml):
+    if not len(xml):
+        return {xml.tag: xml.text}
+    result = {}
+    for child in xml:
+        child_result = parse_xml(child)
+        if child.tag != 'object':
+            result[child.tag] = child_result[child.tag]
+        else:
+            if child.tag not in result:
+                result[child.tag] = []
+            result[child.tag].append(child_result[child.tag])
+    return {xml.tag: result}
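+
+
+# For reference: parse_xml recursively turns an lxml element tree into nested dicts.
+# Leaf elements map tag -> text, and repeated <object> tags are collected into a list, e.g.
+#   <annotation><filename>x.jpg</filename><object>...</object><object>...</object></annotation>
+# becomes {'annotation': {'filename': 'x.jpg', 'object': [{...}, {...}]}},
+# which is the structure build_example expects.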
+
+
+def main(_argv):
+    class_map = {name: idx for idx, name in enumerate(
+        open(FLAGS.classes).read().splitlines())}
+    logging.info("Class mapping loaded: %s", class_map)
+
+    writer = tf.io.TFRecordWriter(FLAGS.output_file)
+    image_list = open(os.path.join(
+        FLAGS.data_dir, 'ImageSets', 'Main', 'aeroplane_%s.txt' % FLAGS.split)).read().splitlines()
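+    # In VOC, every ImageSets/Main/<class>_<split>.txt file lists all image ids of that split
+    # together with a -1/0/1 presence flag, so reading the aeroplane file is simply a convenient
+    # way to get the complete train/val id list (the flag is discarded below).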
+    logging.info("Image list loaded: %d", len(image_list))
+    for image in tqdm.tqdm(image_list):
+        name, _ = image.split()
+        annotation_xml = os.path.join(
+            FLAGS.data_dir, 'Annotations', name + '.xml')
+        annotation_xml = lxml.etree.fromstring(open(annotation_xml).read())
+        annotation = parse_xml(annotation_xml)['annotation']
+        tf_example = build_example(annotation, class_map)
+        writer.write(tf_example.SerializeToString())
+    writer.close()
+    logging.info("Done")
+
+
+if __name__ == '__main__':
+    app.run(main)
diff --git a/train.py b/train.py
index c9e0e8c2..aee79ffa 100644
--- a/train.py
+++ b/train.py
@@ -1,5 +1,6 @@
 from absl import app, flags, logging
 from absl.flags import FLAGS
+
 import tensorflow as tf
 import numpy as np
 import cv2
diff --git a/yolov3_tf2/dataset.py b/yolov3_tf2/dataset.py
index 8b866d47..93cf08cd 100644
--- a/yolov3_tf2/dataset.py
+++ b/yolov3_tf2/dataset.py
@@ -77,23 +77,24 @@ def transform_images(x_train, size):
 
 
 # https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/using_your_own_dataset.md#conversion-script-outline-conversion-script-outline
+# Commented out fields are not required in our project
 IMAGE_FEATURE_MAP = {
-    'image/width': tf.io.FixedLenFeature([], tf.int64),
-    'image/height': tf.io.FixedLenFeature([], tf.int64),
-    'image/filename': tf.io.FixedLenFeature([], tf.string),
-    'image/source_id': tf.io.FixedLenFeature([], tf.string),
-    'image/key/sha256': tf.io.FixedLenFeature([], tf.string),
+    # 'image/width': tf.io.FixedLenFeature([], tf.int64),
+    # 'image/height': tf.io.FixedLenFeature([], tf.int64),
+    # 'image/filename': tf.io.FixedLenFeature([], tf.string),
+    # 'image/source_id': tf.io.FixedLenFeature([], tf.string),
+    # 'image/key/sha256': tf.io.FixedLenFeature([], tf.string),
     'image/encoded': tf.io.FixedLenFeature([], tf.string),
-    'image/format': tf.io.FixedLenFeature([], tf.string),
+    # 'image/format': tf.io.FixedLenFeature([], tf.string),
     'image/object/bbox/xmin': tf.io.VarLenFeature(tf.float32),
     'image/object/bbox/ymin': tf.io.VarLenFeature(tf.float32),
    'image/object/bbox/xmax': tf.io.VarLenFeature(tf.float32),
     'image/object/bbox/ymax': tf.io.VarLenFeature(tf.float32),
     'image/object/class/text': tf.io.VarLenFeature(tf.string),
-    'image/object/class/label': tf.io.VarLenFeature(tf.int64),
-    'image/object/difficult': tf.io.VarLenFeature(tf.int64),
-    'image/object/truncated': tf.io.VarLenFeature(tf.int64),
-    'image/object/view': tf.io.VarLenFeature(tf.string),
+    # 'image/object/class/label': tf.io.VarLenFeature(tf.int64),
+    # 'image/object/difficult': tf.io.VarLenFeature(tf.int64),
+    # 'image/object/truncated': tf.io.VarLenFeature(tf.int64),
+    # 'image/object/view': tf.io.VarLenFeature(tf.string),
 }
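+
+# For reference, a minimal sketch (not taken from this file) of how a record written by
+# tools/voc2012.py can be decoded with this map:
+#   x = tf.io.parse_single_example(serialized_example, IMAGE_FEATURE_MAP)
+#   img = tf.image.decode_jpeg(x['image/encoded'], channels=3)
+#   text = tf.sparse.to_dense(x['image/object/class/text'], default_value='')
+# VarLenFeature entries come back as tf.SparseTensor, hence the to_dense call; keys that are
+# commented out above are simply ignored when parsing.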