diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3cfb986
--- /dev/null
+++ b/README.md
@@ -0,0 +1,96 @@
+# PP_YOLO TensorFlow
+### TensorFlow implementation of PP-YOLOv1
+
+
+
+
+## Requirements Installation
+
+### Conda
+```bash
+# TensorFlow CPU
+conda env create -f conda-cpu.yml
+conda activate yolov4-cpu
+
+# TensorFlow GPU
+conda env create -f conda-gpu.yml
+conda activate yolov4-gpu
+```
+
+### Pip
+```bash
+# TensorFlow CPU
+pip install -r requirements.txt
+
+# TensorFlow GPU
+pip install -r requirements-gpu.txt
+```
+
+## Custom Data Training
+### Step - 1 : (Setup the "utils/config.py" file)
+* Modify the path of the .names file (line 14)
+* Modify the number of classes (line 15)
+* Modify the path of the train.txt file (line 29)
+* Modify other parameters like batch size, learning rate, etc. according to your requirements (optional)
+
+Use the following code to create the train.txt file. First copy all annotation files and images to 'data/dataset', then run:
+
+```python
+# create train.txt file listing every training image
+import glob
+
+files = glob.glob('data/dataset/*.jpg')
+with open('train.txt','w') as f:
+    f.write('\n'.join(files))
+
+```
+
+### Step - 2 : (Model training)
+Run the following command to start training:
+```bash
+python train.py
+```
+Note: If training is interrupted by network or other issues, resume with the following command. Use a lower learning rate to fix NaN loss errors.
+```bash
+python train.py --const_lr True --resume 'checkpoints/pp_yolo'
+```
+
+### Step - 3 : (Model conversion)
+Run the following command for model conversion; it takes the saved weights and converts them to the TensorFlow SavedModel format.
+
+```bash
+python convert.py --weights './checkpoints/pp_yolo' --save './checkpoints/saved_model' --size 416
+```
+### Step - 4 : (Detection)
+Run the following command for images:
+```bash
+python detect_img.py --model ./checkpoints/saved_model --image './source/test.jpeg'
+
+```
+Run the following command for video:
+```bash
+python detect_vid.py --model ./checkpoints/saved_model --video ./source/vid.mp4 --output './output/result.avi'
+
+```
+
+Note: Outputs are stored in the `output` folder by default; use `--output` to change the path.
+
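+For example, to run detection over a whole folder of images, here is a minimal bash sketch (it assumes the default SavedModel path from Step 3 and .jpeg sources; adjust paths to your setup):
+```bash
+# run detect_img.py once per image, reusing each source filename for its output
+for img in source/*.jpeg; do
+    python detect_img.py --model ./checkpoints/saved_model --image "$img" --output "./output/$(basename "$img")"
+done
+```
+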
+To Do List :
+* [x] Core Architecture
+* [x] CoordConv
+* [x] SPP (Spatial Pyramid Pooling)
+* [ ] Deformable Conv
+* [ ] Drop Block
+* [x] Detection (Infer)
+* [ ] Model Evaluation
+
+Note: This project is not an optimized implementation; use the official PaddlePaddle framework for better results.
+### References
+* PP-YOLO: An Effective and Efficient Implementation of Object Detector [PP-YOLO v1](https://arxiv.org/abs/2007.12099)
+
+* Paddle Detection [Paddle implementation](https://github.com/PaddlePaddle/PaddleDetection)
+
+This project is inspired by the following YOLOv4 implementation.
+* [YOLOv4](https://github.com/theAIGuysCode/tensorflow-yolov4-tflite)
+
+
diff --git a/convert.py b/convert.py
new file mode 100644
index 0000000..5a1d222
--- /dev/null
+++ b/convert.py
@@ -0,0 +1,53 @@
+import os
+import shutil
+import tensorflow as tf
+from utils.data_process import decode_tf , filter_boxes
+from utils.config import cfg
+import numpy as np
+from utils import utils
+from core.fpn import fpn
+from core.resnet_50 import resnet_50
+from core.head import head
+import argparse
+
+def save_tf():
+    STRIDES, ANCHORS, NUM_CLASS, XYSCALE = utils.load_config()
+
+    resnet = tf.keras.applications.ResNet50(include_top=False,weights='imagenet',input_shape=(args.size,args.size,3))
+    c3 , c4 , c5 = resnet_50(resnet)
+    neck_output = fpn(c3 , c4 , c5)
+    head_output = head(neck_output)
+    feature_maps = head_output
+    bbox_tensors = []
+    prob_tensors = []
+
+    for i, fm in enumerate(feature_maps):
+        if i == 0:
+            output_tensors = decode_tf(fm, args.size // 8, NUM_CLASS, STRIDES, ANCHORS, i, XYSCALE)
+        elif i == 1:
+            output_tensors = decode_tf(fm, args.size // 16, NUM_CLASS, STRIDES, ANCHORS, i, XYSCALE)
+        else:
+            output_tensors = decode_tf(fm, args.size // 32, NUM_CLASS, STRIDES, ANCHORS, i, XYSCALE)
+        bbox_tensors.append(output_tensors[0])
+        prob_tensors.append(output_tensors[1])
+    pred_bbox = tf.concat(bbox_tensors, axis=1)
+    pred_prob = tf.concat(prob_tensors, axis=1)
+
+    boxes, pred_conf = filter_boxes(pred_bbox, pred_prob, score_threshold=0.2, input_shape=tf.constant([args.size, args.size]))
+    pred = tf.concat([boxes, pred_conf], axis=-1)
+    model = tf.keras.Model(resnet.input, pred)
+    print('loading weights...')
+    model.load_weights(args.weights)
+    #model.summary()
+    print('saving model...')
+    model.save(args.save)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-s','--size',help = 'Size of input image',type = int ,default = 416)
+    parser.add_argument('-w','--weights',help = 'path to weights',type = str , default = './checkpoints/pp_yolo')
+    parser.add_argument('-m','--save', help = 'path to save model',type = str , default = './checkpoints/saved_model')
+
+    args = parser.parse_args()
+    save_tf()
\ No newline at end of file
diff --git a/core/__pycache__/blocks.cpython-37.pyc b/core/__pycache__/blocks.cpython-37.pyc
new file mode 100644
index 0000000..55b94fb
Binary files /dev/null and b/core/__pycache__/blocks.cpython-37.pyc differ
diff --git a/core/__pycache__/deformable_conv_layer.cpython-37.pyc b/core/__pycache__/deformable_conv_layer.cpython-37.pyc
new file mode 100644
index 0000000..32acf1a
Binary files /dev/null and b/core/__pycache__/deformable_conv_layer.cpython-37.pyc differ
diff --git a/core/__pycache__/deformable_conv_layer.cpython-38.pyc b/core/__pycache__/deformable_conv_layer.cpython-38.pyc
new file mode 100644
index 0000000..55abcd4
Binary files /dev/null and b/core/__pycache__/deformable_conv_layer.cpython-38.pyc differ
diff --git a/core/__pycache__/fpn.cpython-37.pyc b/core/__pycache__/fpn.cpython-37.pyc
new file mode 100644
index 0000000..e31d4ea
Binary files /dev/null and b/core/__pycache__/fpn.cpython-37.pyc differ
diff --git a/core/__pycache__/head.cpython-37.pyc b/core/__pycache__/head.cpython-37.pyc
new file mode 100644
index 0000000..780b8db
Binary files /dev/null and b/core/__pycache__/head.cpython-37.pyc differ
diff --git a/core/__pycache__/resnet_50.cpython-37.pyc b/core/__pycache__/resnet_50.cpython-37.pyc
new file mode 100644
index 0000000..c698164
Binary files /dev/null and b/core/__pycache__/resnet_50.cpython-37.pyc differ
diff --git a/core/blocks.py b/core/blocks.py
new file mode 100644
index 0000000..b1b6ff4
--- /dev/null
+++ b/core/blocks.py
@@ -0,0 +1,80 @@
+import tensorflow as tf
+from tensorflow.keras.layers import Conv2D , BatchNormalization , MaxPool2D
+from utils.config import cfg
+
+def convblock(input_tensors, bn=True, use_coordconv=True):
+
+    conv = Conv2D(filters=input_tensors.shape[-1]*2, kernel_size=(3,3),padding='same')(input_tensors)
+
+    if bn: conv = BatchNormalization()(conv)
+
+    conv = tf.nn.relu(conv)
+    if use_coordconv: conv = coordconv(conv)
+    conv = Conv2D(filters=input_tensors.shape[-1], kernel_size=(1,1))(conv)
+    if bn: conv = BatchNormalization()(conv)
+    conv = tf.nn.relu(conv)
+
+    return conv
+
+def coordconv(feature_map):
+    # append two extra channels holding the x and y pixel coordinates
+    batch_size = tf.shape(feature_map)[0]
+    x_shape = tf.shape(feature_map)[1]
+    y_shape = tf.shape(feature_map)[2]
+
+    x_ones = tf.ones((batch_size , x_shape),dtype=tf.float32)
+    x_ones = tf.expand_dims(x_ones,axis = -1)
+    x_range = tf.tile(tf.expand_dims(tf.range(y_shape,dtype=tf.float32),axis=0),[batch_size,1])
+    x_range = tf.expand_dims(x_range,1)
+    x_channel = tf.matmul(x_ones,x_range)
+    x_channel = tf.expand_dims(x_channel,axis=-1)
+
+    y_ones = tf.ones((batch_size , y_shape),dtype=tf.float32)
+    y_ones = tf.expand_dims(y_ones,axis = 1)
+    y_range = tf.tile(tf.expand_dims(tf.range(x_shape,dtype=tf.float32),axis=0),[batch_size,1])
+    y_range = tf.expand_dims(y_range,-1)
+    y_channel = tf.matmul(y_range,y_ones)
+    y_channel = tf.expand_dims(y_channel,axis=-1)
+
+    x_shape = tf.cast(x_shape , dtype=tf.float32)
+    y_shape = tf.cast(y_shape, dtype = tf.float32)
+
+    # normalize coordinates to [-1, 1]
+    x_channel = tf.cast(x_channel,dtype=tf.float32) / (y_shape - 1)
+    y_channel = tf.cast(y_channel,dtype=tf.float32) / (x_shape - 1)
+
+    x_channel = x_channel * 2 - 1
+    y_channel = y_channel * 2 - 1
+
+    output_tensors = tf.concat([feature_map,x_channel,y_channel],axis=-1)
+
+    return output_tensors
+
+def upsampling(features):
+    channels = features.shape[-1]
+    conv = coordconv(features)
+    conv = Conv2D(filters=channels//2 ,kernel_size=(1,1))(conv)
+    output = tf.image.resize(conv,size=(conv.shape[1]*2,conv.shape[2]*2))
+    return output
+
+def sppblock(input_tensors):
+
+    pooling_1 = MaxPool2D(pool_size=(1,1),strides=(1,1))(input_tensors)
+    pooling_2 = MaxPool2D(pool_size=(5,5),padding='same',strides=(1,1))(input_tensors)
+    pooling_3 = MaxPool2D(pool_size=(9,9),padding='same',strides=(1,1))(input_tensors)
+    #pooling_4 = MaxPool2D(pool_size=(13,13),padding='same',strides=(1,1))(input_tensors)
+
+    output = tf.concat([input_tensors,pooling_1,pooling_2 ,pooling_3],axis=-1)
+
+    return output
+
+def conv_head(features):
+    channel = features.shape[-1]
+    num_classes = cfg.YOLO.NUM_CLASSES
+    num_filters = 3 * (num_classes + 5)  # 3 anchors * (x, y, w, h, obj + classes)
+    conv = coordconv(features)
+    conv = Conv2D(filters=channel*2,kernel_size=(3,3),padding='same')(conv)
+    conv = tf.nn.relu(conv)
+    conv = Conv2D(filters= num_filters,kernel_size=(1,1))(conv)
+
+    return conv
+
diff --git a/core/deformable_conv_layer.py b/core/deformable_conv_layer.py
new file mode 100644
index 0000000..b0e59b2
--- /dev/null
+++ b/core/deformable_conv_layer.py
@@ -0,0 +1,240 @@
+import tensorflow as tf
+from tensorflow.keras.layers import Conv2D
+from utils.config import cfg
+
+class DeformableConvLayer(Conv2D):
+    """Only support "channel last" data format"""
+    def __init__(self,
+                 filters,
+                 kernel_size,
+                 strides=(1, 1),
+                 padding='same',
+                 data_format=None,
+                 dilation_rate=(1, 1),
+                 num_deformable_group=None,
+                 activation=None,
use_bias=True, + kernel_initializer='glorot_uniform', + bias_initializer='zeros', + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + **kwargs): + """`kernel_size`, `strides` and `dilation_rate` must have the same value in both axis. + + :param num_deformable_group: split output channels into groups, offset shared in each group. If + this parameter is None, then set num_deformable_group=filters. + """ + super().__init__( + filters=filters, + kernel_size=kernel_size, + strides=strides, + padding=padding, + data_format=data_format, + dilation_rate=dilation_rate, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + kernel_constraint=kernel_constraint, + bias_constraint=bias_constraint, + **kwargs) + self.kernel = None + self.bias = None + self.offset_layer_kernel = None + self.offset_layer_bias = None + if num_deformable_group is None: + num_deformable_group = filters + if filters % num_deformable_group != 0: + raise ValueError('"filters" mod "num_deformable_group" must be zero') + self.num_deformable_group = num_deformable_group + + def build(self, input_shape): + input_dim = int(input_shape[-1]) + # kernel_shape = self.kernel_size + (input_dim, self.filters) + # we want to use depth-wise conv + kernel_shape = self.kernel_size + (self.filters * input_dim, 1) + self.kernel = self.add_weight( + name='kernel', + shape=kernel_shape, + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + trainable=True, + dtype=self.dtype) + if self.use_bias: + self.bias = self.add_weight( + name='bias', + shape=(self.filters,), + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + trainable=True, + dtype=self.dtype) + + # create offset conv layer + offset_num = self.kernel_size[0] * self.kernel_size[1] * self.num_deformable_group + self.offset_layer_kernel = self.add_weight( + name='offset_layer_kernel', + shape=self.kernel_size + (input_dim, offset_num * 2), # 2 means x and y axis + initializer=tf.zeros_initializer(), + regularizer=self.kernel_regularizer, + trainable=True, + dtype=self.dtype) + self.offset_layer_bias = self.add_weight( + name='offset_layer_bias', + shape=(offset_num * 2,), + initializer=tf.zeros_initializer(), + # initializer=tf.random_uniform_initializer(-5, 5), + regularizer=self.bias_regularizer, + trainable=True, + dtype=self.dtype) + self.built = True + + def call(self, inputs, training=None, **kwargs): + # get offset, shape [batch_size, out_h, out_w, filter_h, * filter_w * channel_out * 2] + offset = tf.nn.conv2d(inputs, + filters=self.offset_layer_kernel, + strides=[1, *self.strides, 1], + padding=self.padding.upper(), + dilations=[1, *self.dilation_rate, 1]) + offset += self.offset_layer_bias + + # add padding if needed + inputs = self._pad_input(inputs) + + # some length + batch_size = cfg.TRAIN.BATCH_SIZE #int(inputs.get_shape()[0]) + channel_in = int(inputs.get_shape()[-1]) + in_h, in_w = [int(i) for i in inputs.get_shape()[1: 3]] # input feature map size + out_h, out_w = [int(i) for i in offset.get_shape()[1: 3]] # output feature map size + filter_h, filter_w = self.kernel_size + + # get x, y axis offset + offset = tf.reshape(offset, [batch_size, out_h, out_w, -1, 2]) + y_off, x_off = 
offset[:, :, :, :, 0], offset[:, :, :, :, 1] + + # input feature map gird coordinates + y, x = self._get_conv_indices([in_h, in_w]) + y, x = [tf.expand_dims(i, axis=-1) for i in [y, x]] + y, x = [tf.tile(i, [batch_size, 1, 1, 1, self.num_deformable_group]) for i in [y, x]] + y, x = [tf.reshape(i, [*i.shape[0: 3], -1]) for i in [y, x]] + y, x = [tf.cast(i,dtype=tf.float32) for i in [y, x]] + + # add offset + y, x = y + y_off, x + x_off + y = tf.clip_by_value(y, 0, in_h - 1) + x = tf.clip_by_value(x, 0, in_w - 1) + + # get four coordinates of points around (x, y) + y0, x0 = [tf.cast(tf.floor(i),dtype=tf.int32) for i in [y, x]] + y1, x1 = y0 + 1, x0 + 1 + # clip + y0, y1 = [tf.clip_by_value(i, 0, in_h - 1) for i in [y0, y1]] + x0, x1 = [tf.clip_by_value(i, 0, in_w - 1) for i in [x0, x1]] + + # get pixel values + indices = [[y0, x0], [y0, x1], [y1, x0], [y1, x1]] + p0, p1, p2, p3 = [DeformableConvLayer._get_pixel_values_at_point(inputs, i) for i in indices] + + # cast to float + x0, x1, y0, y1 = [tf.cast(i,dtype=tf.float32) for i in [x0, x1, y0, y1]] + # weights + w0 = (y1 - y) * (x1 - x) + w1 = (y1 - y) * (x - x0) + w2 = (y - y0) * (x1 - x) + w3 = (y - y0) * (x - x0) + # expand dim for broadcast + w0, w1, w2, w3 = [tf.expand_dims(i, axis=-1) for i in [w0, w1, w2, w3]] + # bilinear interpolation + pixels = tf.add_n([w0 * p0, w1 * p1, w2 * p2, w3 * p3]) + + # reshape the "big" feature map + pixels = tf.reshape(pixels, [batch_size, out_h, out_w, filter_h, filter_w, self.num_deformable_group, channel_in]) + pixels = tf.transpose(pixels, [0, 1, 3, 2, 4, 5, 6]) + pixels = tf.reshape(pixels, [batch_size, out_h * filter_h, out_w * filter_w, self.num_deformable_group, channel_in]) + + # copy channels to same group + feat_in_group = self.filters // self.num_deformable_group + pixels = tf.tile(pixels, [1, 1, 1, 1, feat_in_group]) + pixels = tf.reshape(pixels, [batch_size, out_h * filter_h, out_w * filter_w, -1]) + + # depth-wise conv + out = tf.nn.depthwise_conv2d(pixels, self.kernel, [1, filter_h, filter_w, 1], 'VALID') + # add the output feature maps in the same group + out = tf.reshape(out, [batch_size, out_h, out_w, self.filters, channel_in]) + out = tf.reduce_sum(out, axis=-1) + if self.use_bias: + out += self.bias + return self.activation(out) + + def _pad_input(self, inputs): + """Check if input feature map needs padding, because we don't use the standard Conv() function. + + :param inputs: + :return: padded input feature map + """ + # When padding is 'same', we should pad the feature map. 
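+        # we gather sampling points manually (extract_patches + gather_nd, then a VALID depthwise conv),
+        # so 'same' padding has to be applied to the input explicitly here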
+ # if padding == 'same', output size should be `ceil(input / stride)` + if self.padding == 'same': + in_shape = inputs.get_shape().as_list()[1: 3] + padding_list = [] + for i in range(2): + filter_size = self.kernel_size[i] + dilation = self.dilation_rate[i] + dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1) + same_output = (in_shape[i] + self.strides[i] - 1) // self.strides[i] + valid_output = (in_shape[i] - dilated_filter_size + self.strides[i]) // self.strides[i] + if same_output == valid_output: + padding_list += [0, 0] + else: + p = dilated_filter_size - 1 + p_0 = p // 2 + padding_list += [p_0, p - p_0] + if sum(padding_list) != 0: + padding = [[0, 0], + [padding_list[0], padding_list[1]], # top, bottom padding + [padding_list[2], padding_list[3]], # left, right padding + [0, 0]] + inputs = tf.pad(inputs, padding) + return inputs + + def _get_conv_indices(self, feature_map_size): + """the x, y coordinates in the window when a filter sliding on the feature map + + :param feature_map_size: + :return: y, x with shape [1, out_h, out_w, filter_h * filter_w] + """ + feat_h, feat_w = [int(i) for i in feature_map_size[0: 2]] + + x, y = tf.meshgrid(tf.range(feat_w), tf.range(feat_h)) + x, y = [tf.reshape(i, [1, *i.get_shape(), 1]) for i in [x, y]] # shape [1, h, w, 1] + x, y = [tf.image.extract_patches(i, + [1, *self.kernel_size, 1], + [1, *self.strides, 1], + [1, *self.dilation_rate, 1], + 'VALID') + for i in [x, y]] # shape [1, out_h, out_w, filter_h * filter_w] + return y, x + + @staticmethod + def _get_pixel_values_at_point(inputs, indices): + """get pixel values + + :param inputs: + :param indices: shape [batch_size, H, W, I], I = filter_h * filter_w * channel_out + :return: + """ + y, x = indices + batch, h, w, n = y.get_shape().as_list()[0: 4] + + batch_idx = tf.reshape(tf.range(0, batch), (batch, 1, 1, 1)) + b = tf.tile(batch_idx, (1, h, w, n)) + pixel_idx = tf.stack([b, y, x], axis=-1) + return tf.gather_nd(inputs, pixel_idx) + diff --git a/core/fpn.py b/core/fpn.py new file mode 100644 index 0000000..33d80c5 --- /dev/null +++ b/core/fpn.py @@ -0,0 +1,32 @@ +from numpy import short +import tensorflow as tf +from tensorflow.keras.layers import Conv2D +from core.blocks import coordconv , convblock ,sppblock , upsampling + +def fpn(c3,c4,c5): + conv_1 = coordconv(c5) + conv_1 = Conv2D(filters=512 , kernel_size=(1,1),padding='same')(conv_1) + conv_1 = tf.nn.relu(conv_1) + conv_1 = convblock(conv_1) + conv_1 = sppblock(conv_1) + conv_1 = convblock(conv_1) + shortcut_1 = conv_1 + + conv_2 = upsampling(shortcut_1) + conv_2 = tf.concat([c4 , conv_2],axis =-1) + conv_2 = coordconv(conv_2) + conv_2 = Conv2D(filters=256, kernel_size=(1,1),padding='same')(conv_2) + conv_2 = tf.nn.relu(conv_2) + conv_2 = convblock(conv_2) + conv_2 = convblock(conv_2) + shortcut_2 = conv_2 + + conv_3 = upsampling(shortcut_2) + conv_3 = tf.concat([c3,conv_3],axis = -1) + conv_3 =coordconv(conv_3) + conv_3 = Conv2D(filters=128 , kernel_size=(1,1))(conv_3) + conv_3 = tf.nn.relu(conv_3) + conv_3 = convblock(conv_3) + conv_3 = convblock(conv_3) + + return [conv_1,conv_2,conv_3] \ No newline at end of file diff --git a/core/head.py b/core/head.py new file mode 100644 index 0000000..1729806 --- /dev/null +++ b/core/head.py @@ -0,0 +1,16 @@ +from core.blocks import conv_head + + +def head(features): + + p5 = features[0] + p4 = features[1] + p3 = features[2] + + head_1 = conv_head(p5) + head_2 = conv_head(p4) + head_3 = conv_head(p3) + + return [head_3,head_2, head_1] + + diff --git 
a/core/resnet_50.py b/core/resnet_50.py
new file mode 100644
index 0000000..9cc40e4
--- /dev/null
+++ b/core/resnet_50.py
@@ -0,0 +1,91 @@
+import tensorflow as tf
+from tensorflow.keras import layers
+from tensorflow.keras.layers import Conv2D
+from core.deformable_conv_layer import DeformableConvLayer  # not wired in yet; Deformable Conv is still on the to-do list
+def resnet_50(resnet,training=True):
+
+    if not training: # freeze the backbone layers
+        print('Resnet layers are frozen...')
+        for layer in resnet.layers:
+            layer.trainable = False
+    else : # keep the backbone layers trainable
+        print('Resnet layers are trainable...')
+        for layer in resnet.layers :
+            layer.trainable=True
+
+
+
+    c3 = resnet.get_layer('conv3_block4_out').output
+    c4 = resnet.get_layer('conv4_block6_out').output
+    c5 = resnet.get_layer('conv5_block3_out').output
+    #modified_c5 = resnet_50_last_stage_custom(c4)
+    print('Resnet50 is loaded with Imagenet weights...')
+    print('c3 , c4 , c5 layers loaded successfully..')
+
+    return c3 , c4 , c5
+
+def resnet_50_last_stage_custom(last_stage_in):
+    # modified Resnet50 last stage
+    # Block 1
+    block_1 = Conv2D(filters = 512,strides = (2,2),kernel_size = (1,1))(last_stage_in)
+    block_1 = layers.BatchNormalization()(block_1)
+    block_1 = tf.nn.relu(block_1)
+    block_1 = Conv2D(filters = 512,kernel_size = (3,3),padding='same')(block_1)
+    block_1 = layers.BatchNormalization()(block_1)
+    block_1 = tf.nn.relu(block_1)
+    block_1 = Conv2D(filters = 2048,kernel_size = (1,1))(block_1)
+    block_1 = layers.BatchNormalization()(block_1)
+    block_1 = tf.nn.relu(block_1)
+
+    # adding 1
+    last_stage_input = Conv2D(filters=2048 , kernel_size=(1,1),strides=(2,2))(last_stage_in)
+    last_stage_input = layers.BatchNormalization()(last_stage_input)
+    last_stage_input = tf.nn.relu(last_stage_input)
+    block_1 = block_1 + last_stage_input
+
+    # Block 2
+    block_2 = Conv2D(filters = 512,kernel_size = (1,1))(block_1)
+    block_2 = layers.BatchNormalization()(block_2)
+    block_2 = tf.nn.relu(block_2)
+    block_2 = Conv2D(filters = 512,kernel_size = (3,3),padding='same')(block_2)
+    block_2 = layers.BatchNormalization()(block_2)
+    block_2 = tf.nn.relu(block_2)
+    block_2 = Conv2D(filters = 2048,kernel_size = (1,1))(block_2)
+    block_2 = layers.BatchNormalization()(block_2)
+    block_2 = tf.nn.relu(block_2)
+
+    # adding 2
+    block_2 = block_2 + block_1
+
+    # Block 3
+    block_3 = Conv2D(filters = 512,kernel_size = (1,1))(block_2)
+    block_3 = layers.BatchNormalization()(block_3)
+    block_3 = tf.nn.relu(block_3)
+    block_3 = Conv2D(filters = 512,kernel_size = (3,3),padding='same')(block_3)
+    block_3 = layers.BatchNormalization()(block_3)
+    block_3 = tf.nn.relu(block_3)
+    block_3 = Conv2D(filters = 2048,kernel_size = (1,1),name= 'conv5_block3_out')(block_3)
+    block_3 = layers.BatchNormalization()(block_3)
+    block_3 = tf.nn.relu(block_3)
+
+    # adding 3
+    block_3 = block_3 + block_2
+
+    return block_3
+
\ No newline at end of file
diff --git a/data/dataset/img1.jpg b/data/dataset/img1.jpg
new file mode 100644
index 0000000..084e250
Binary files /dev/null and b/data/dataset/img1.jpg differ
diff --git a/data/dataset/img1.txt b/data/dataset/img1.txt
new file mode 100644
index 0000000..6de3d7b
--- /dev/null
+++ b/data/dataset/img1.txt
@@ -0,0 +1 @@
+4 0.908717 0.433594 0.172697 0.164062
diff --git a/data/dataset/img2.jpg b/data/dataset/img2.jpg
new file mode 100644
index 0000000..ddcc6f2
Binary files /dev/null and b/data/dataset/img2.jpg differ
diff --git a/data/dataset/img2.txt b/data/dataset/img2.txt
new file mode 100644
index 0000000..f4700ab
--- /dev/null
+++ b/data/dataset/img2.txt
@@ -0,0 +1,2 @@
+3 0.514062 0.458333 0.190625 0.245833
+3 0.583594 0.525000 0.189062 0.245833
diff --git a/data/dataset/img3.jpg b/data/dataset/img3.jpg
new file mode 100644
index 0000000..f3a2cfc
Binary files /dev/null and b/data/dataset/img3.jpg differ
diff --git a/data/dataset/img3.txt b/data/dataset/img3.txt
new file mode 100644
index 0000000..a9b96f5
--- /dev/null
+++ b/data/dataset/img3.txt
@@ -0,0 +1 @@
+4 0.573770 0.557813 0.852459 0.775000
diff --git a/detect_img.py b/detect_img.py
new file mode 100644
index 0000000..89911fb
--- /dev/null
+++ b/detect_img.py
@@ -0,0 +1,84 @@
+import tensorflow as tf
+physical_devices = tf.config.experimental.list_physical_devices('GPU')
+if len(physical_devices) > 0:
+    tf.config.experimental.set_memory_growth(physical_devices[0], True)
+from utils import utils
+from utils.config import cfg
+import argparse
+from tensorflow.python.saved_model import tag_constants
+from PIL import Image
+import cv2
+import numpy as np
+from tensorflow.compat.v1 import ConfigProto
+from tensorflow.compat.v1 import InteractiveSession
+
+
+def main():
+    config = ConfigProto()
+    config.gpu_options.allow_growth = True
+    session = InteractiveSession(config=config)
+    STRIDES, ANCHORS, NUM_CLASS, XYSCALE = utils.load_config()
+    input_size = args.size
+    image_path = args.image
+
+
+
+    saved_model_loaded = tf.saved_model.load(args.model, tags=[tag_constants.SERVING])
+
+
+    original_image = cv2.imread(image_path)
+    original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)
+
+    image_data = cv2.resize(original_image, (input_size, input_size))
+    image_data = image_data / 255.
+
+    # add a batch dimension and cast to float32 for the SavedModel input
+    image_data = np.asarray(image_data).astype(np.float32)
+    image_data = np.expand_dims(image_data,axis=0)
+
+
+    infer = saved_model_loaded.signatures['serving_default']
+    batch_data = tf.constant(image_data)
+    pred_bbox = infer(batch_data)
+    for key, value in pred_bbox.items():
+        boxes = value[:, :, 0:4]
+        pred_conf = value[:, :, 4:]
+
+    boxes, scores, classes, valid_detections = tf.image.combined_non_max_suppression(
+        boxes=tf.reshape(boxes, (tf.shape(boxes)[0], -1, 1, 4)),
+        scores=tf.reshape(
+            pred_conf, (tf.shape(pred_conf)[0], -1, tf.shape(pred_conf)[-1])),
+        max_output_size_per_class=50,
+        max_total_size=50,
+        iou_threshold=args.iou,
+        score_threshold=args.score
+    )
+    pred_bbox = [boxes.numpy(), scores.numpy(), classes.numpy(), valid_detections.numpy()]
+
+    # read in all class names from config
+    class_names = utils.read_class_names(cfg.YOLO.CLASSES)
+
+    # by default allow all classes in .names file
+    allowed_classes = list(class_names.values())
+
+
+
+    image = utils.draw_bbox(original_image, pred_bbox, allowed_classes = allowed_classes)
+
+    image = Image.fromarray(image.astype(np.uint8))
+
+    # convert RGB back to BGR so cv2.imwrite stores the correct colors
+    image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+    cv2.imwrite(args.output, image)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-m','--model',help = 'path to model',type = str , default='./checkpoints/saved_model')
+    parser.add_argument('-s','--size',help = 'image size',type = int ,default = 416 )
+    parser.add_argument('-i','--image',help = 'path of input image', type = str )
+    parser.add_argument('-u','--iou',help = 'iou threshold',type = float , default = 0.4)
+    parser.add_argument('-c','--score',help = 'score threshold',type = float , default = 0.2 )
+    parser.add_argument('-o','--output',help ='path to save output', type = str , default = './output/output.jpg')
+
+    args = parser.parse_args()
+    main()
\ No newline at end of file
diff --git a/detect_vid.py b/detect_vid.py
new file mode 100644
index 0000000..3c71384
--- /dev/null
+++ b/detect_vid.py
@@ -0,0 +1,114 @@
+import tensorflow as tf
+physical_devices = tf.config.experimental.list_physical_devices('GPU')
+if len(physical_devices) > 0:
+    tf.config.experimental.set_memory_growth(physical_devices[0], True)
+from utils import utils
+from utils.config import cfg
+import argparse
+from tensorflow.python.saved_model import tag_constants
+from PIL import Image
+import cv2
+import numpy as np
+from tensorflow.compat.v1 import ConfigProto
+from tensorflow.compat.v1 import InteractiveSession
+import time
+
+
+def main():
+    config = ConfigProto()
+    config.gpu_options.allow_growth = True
+    session = InteractiveSession(config=config)
+    STRIDES, ANCHORS, NUM_CLASS, XYSCALE = utils.load_config()
+    input_size = args.size
+    vid_path = args.video
+
+    saved_model_loaded = tf.saved_model.load(args.model, tags=[tag_constants.SERVING])
+
+
+    vid = cv2.VideoCapture(vid_path)
+    # read in all class names from config
+    class_names = utils.read_class_names(cfg.YOLO.CLASSES)
+
+    # by default allow all classes in .names file
+    allowed_classes = list(class_names.values())
+    if args.output:
+        # by default VideoCapture returns float instead of int
+        width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        fps = int(vid.get(cv2.CAP_PROP_FPS))
+        codec = cv2.VideoWriter_fourcc(*'XVID')
+        out = cv2.VideoWriter(args.output, codec, fps, (width, height))
+
+    while True:
+        return_value, frame = vid.read()
+        if return_value:
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            image = Image.fromarray(frame)
+        else:
+            print('Video has ended or failed, try a different video format!')
+            break
+
+        frame_size = frame.shape[:2]
+        image_data = cv2.resize(frame, (input_size, input_size))
+        image_data = image_data / 255.
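+        # add a batch dimension and cast to float32 to match the SavedModel input signature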
+        image_data = image_data[np.newaxis, ...].astype(np.float32)
+        start_time = time.time()
+
+        infer = saved_model_loaded.signatures['serving_default']
+        batch_data = tf.constant(image_data)
+        pred_bbox = infer(batch_data)
+        for key, value in pred_bbox.items():
+            boxes = value[:, :, 0:4]
+            pred_conf = value[:, :, 4:]
+
+        boxes, scores, classes, valid_detections = tf.image.combined_non_max_suppression(
+            boxes=tf.reshape(boxes, (tf.shape(boxes)[0], -1, 1, 4)),
+            scores=tf.reshape(
+                pred_conf, (tf.shape(pred_conf)[0], -1, tf.shape(pred_conf)[-1])),
+            max_output_size_per_class=50,
+            max_total_size=50,
+            iou_threshold=args.iou,
+            score_threshold=args.score )
+
+        pred_bbox = [boxes.numpy(), scores.numpy(), classes.numpy(), valid_detections.numpy()]
+        image = utils.draw_bbox(frame, pred_bbox , allowed_classes = allowed_classes)
+        fps = 1.0 / (time.time() - start_time)
+        print("FPS: %.2f" % fps)
+
+        result = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+
+        if args.show == 'True':
+            cv2.namedWindow("result", cv2.WINDOW_AUTOSIZE)
+            cv2.imshow('result',result)
+
+        if args.output:
+            out.write(result)
+        if cv2.waitKey(1) & 0xFF == ord('q'): break
+    cv2.destroyAllWindows()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-m','--model',help = 'path to model',type = str , default='./checkpoints/saved_model')
+    parser.add_argument('-s','--size',help = 'image size',type = int ,default = 416 )
+    parser.add_argument('-i','--video',help = 'path of input video', type = str )
+    parser.add_argument('-u','--iou',help = 'iou threshold',type = float , default = 0.4)
+    parser.add_argument('-c','--score',help = 'score threshold',type = float , default = 0.2 )
+    parser.add_argument('-o','--output',help ='path to save output', type = str , default = './output/output.avi')
+    parser.add_argument('-w','--show',help ='show result in window' , type = str, default= 'False' )
+    args = parser.parse_args()
+    main()
+
+
+
+
+
+
+
+
+
diff --git a/output/output.jpg b/output/output.jpg
new file mode 100644
index 0000000..73c979b
Binary files /dev/null and b/output/output.jpg differ
diff --git a/sample_outputs/output_1.jpeg b/sample_outputs/output_1.jpeg
new file mode 100644
index 0000000..1f01655
Binary files /dev/null and b/sample_outputs/output_1.jpeg differ
diff --git a/sample_outputs/output_2.jpeg b/sample_outputs/output_2.jpeg
new file mode 100644
index 0000000..3dfd94f
Binary files /dev/null and b/sample_outputs/output_2.jpeg differ
diff --git a/sample_outputs/output_3.jpeg b/sample_outputs/output_3.jpeg
new file mode 100644
index 0000000..b0f03c5
Binary files /dev/null and b/sample_outputs/output_3.jpeg differ
diff --git a/sample_outputs/output_4.jpeg b/sample_outputs/output_4.jpeg
new file mode 100644
index 0000000..73c979b
Binary files /dev/null and b/sample_outputs/output_4.jpeg differ
diff --git a/source/test.jpeg b/source/test.jpeg
new file mode 100644
index 0000000..32a574a
Binary files /dev/null and b/source/test.jpeg differ
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..e447a01
--- /dev/null
+++ b/train.py
@@ -0,0 +1,161 @@
+import os
+import shutil
+import tensorflow as tf
+physical_devices = tf.config.experimental.list_physical_devices('GPU')
+if len(physical_devices) > 0:
+    tf.config.experimental.set_memory_growth(physical_devices[0], True)
+from utils.data_process import compute_loss, decode_train
+from utils.dataset import Dataset
+from utils.config import cfg
+import numpy as np
+from utils import utils
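+# freeze_all / unfreeze_all toggle layer.trainable; the two-stage schedule in main() uses them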
+from utils.utils import freeze_all, unfreeze_all +from core.fpn import fpn +from core.resnet_50 import resnet_50 +from core.head import head +import argparse + +def main(): + print('loading datasets...') + trainset = Dataset(is_training=True) + #testset = Dataset(is_training=False) + logdir = "./data/log" + isfreeze = False + steps_per_epoch = len(trainset) + first_stage_epochs = cfg.TRAIN.FISRT_STAGE_EPOCHS + second_stage_epochs = cfg.TRAIN.SECOND_STAGE_EPOCHS + global_steps = tf.Variable(1, trainable=False, dtype=tf.int64) + warmup_steps = cfg.TRAIN.WARMUP_EPOCHS * steps_per_epoch + total_steps = (first_stage_epochs + second_stage_epochs) * steps_per_epoch + # train_steps = (first_stage_epochs + second_stage_epochs) * steps_per_period + + #input_layer = tf.keras.layers.Input([cfg.TRAIN.INPUT_SIZE, cfg.TRAIN.INPUT_SIZE, 3]) + STRIDES, ANCHORS, NUM_CLASS, XYSCALE = utils.load_config() + IOU_LOSS_THRESH = cfg.YOLO.IOU_LOSS_THRESH + + freeze_layers = utils.load_freeze_layer() + resnet = tf.keras.applications.ResNet50(include_top=False,weights='imagenet',input_shape=(cfg.TRAIN.INPUT_SIZE,cfg.TRAIN.INPUT_SIZE,3)) + c3 , c4 ,c5 = resnet_50(resnet) + neck_output = fpn(c3 , c4 , c5) + head_output = head(neck_output) + feature_maps = head_output + bbox_tensors = [] + for i, fm in enumerate(feature_maps): + if i == 0: + bbox_tensor = decode_train(fm, cfg.TRAIN.INPUT_SIZE // 8, NUM_CLASS, STRIDES, ANCHORS, i, XYSCALE) + elif i == 1: + bbox_tensor = decode_train(fm, cfg.TRAIN.INPUT_SIZE // 16, NUM_CLASS, STRIDES, ANCHORS, i, XYSCALE) + else: + bbox_tensor = decode_train(fm, cfg.TRAIN.INPUT_SIZE // 32, NUM_CLASS, STRIDES, ANCHORS, i, XYSCALE) + bbox_tensors.append(fm) + bbox_tensors.append(bbox_tensor) + + model = tf.keras.Model(resnet.input, bbox_tensors) + #model.summary() + if args.resume != None: + print('Resuming with last weights -> .{}'.format(args.resume)) + model.load_weights(args.resume) + + else : + print("Training from scratch") + + + + optimizer = tf.keras.optimizers.Adam(learning_rate= 0.00001) + if os.path.exists(logdir): shutil.rmtree(logdir) + writer = tf.summary.create_file_writer(logdir) + + # define training step function + # @tf.function + def train_step(image_data, target): + with tf.GradientTape() as tape: + pred_result = model(image_data, training=True) + giou_loss = conf_loss = prob_loss = 0 + + # optimizing process + for i in range(len(freeze_layers)): + conv, pred = pred_result[i * 2], pred_result[i * 2 + 1] + loss_items = compute_loss(pred, conv, target[i][0], target[i][1], STRIDES=STRIDES, NUM_CLASS=NUM_CLASS, IOU_LOSS_THRESH=IOU_LOSS_THRESH, i=i) + giou_loss += loss_items[0] + conf_loss += loss_items[1] + prob_loss += loss_items[2] + + total_loss = giou_loss + conf_loss + prob_loss + + gradients = tape.gradient(total_loss, model.trainable_variables) + optimizer.apply_gradients(zip(gradients, model.trainable_variables)) + tf.print("=> STEP %4d/%4d lr: %.6f giou_loss: %4.2f conf_loss: %4.2f " + "prob_loss: %4.2f total_loss: %4.2f" % (global_steps, total_steps, optimizer.lr.numpy(), + giou_loss, conf_loss, + prob_loss, total_loss)) + # update learning rate + global_steps.assign_add(1) + if args.const_lr == True: + optimizer.lr.assign(args.lr) + else: + if global_steps < warmup_steps: + lr = global_steps / warmup_steps * cfg.TRAIN.LR_INIT + else: + lr = cfg.TRAIN.LR_END + 0.5 * (cfg.TRAIN.LR_INIT - cfg.TRAIN.LR_END) * ( + (1 + tf.cos((global_steps - warmup_steps) / (total_steps - warmup_steps) * np.pi)) + ) + optimizer.lr.assign(lr.numpy()) + + + # writing summary data + 
with writer.as_default(): + tf.summary.scalar("lr", optimizer.lr, step=global_steps) + tf.summary.scalar("loss/total_loss", total_loss, step=global_steps) + tf.summary.scalar("loss/giou_loss", giou_loss, step=global_steps) + tf.summary.scalar("loss/conf_loss", conf_loss, step=global_steps) + tf.summary.scalar("loss/prob_loss", prob_loss, step=global_steps) + writer.flush() + def test_step(image_data, target): + with tf.GradientTape() as tape: + pred_result = model(image_data, training=True) + giou_loss = conf_loss = prob_loss = 0 + + # optimizing process + for i in range(len(freeze_layers)): + conv, pred = pred_result[i * 2], pred_result[i * 2 + 1] + loss_items = compute_loss(pred, conv, target[i][0], target[i][1], STRIDES=STRIDES, NUM_CLASS=NUM_CLASS, IOU_LOSS_THRESH=IOU_LOSS_THRESH, i=i) + giou_loss += loss_items[0] + conf_loss += loss_items[1] + prob_loss += loss_items[2] + + total_loss = giou_loss + conf_loss + prob_loss + + tf.print("=> TEST STEP %4d giou_loss: %4.2f conf_loss: %4.2f " + "prob_loss: %4.2f total_loss: %4.2f" % (global_steps, giou_loss, conf_loss, + prob_loss, total_loss)) + + for epoch in range(first_stage_epochs + second_stage_epochs): + if epoch < first_stage_epochs: + if not isfreeze: + isfreeze = True + for name in freeze_layers: + freeze = model.get_layer(name) + freeze_all(freeze) + elif epoch >= first_stage_epochs: + if isfreeze: + isfreeze = False + for name in freeze_layers: + freeze = model.get_layer(name) + unfreeze_all(freeze) + for image_data, target in trainset: + train_step(image_data, target) + #for image_data, target in testset: + # test_step(image_data, target) + print('saving checkpoints...-> {}'.format(args.checkpoints)) + model.save_weights(args.checkpoints) + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-r','--resume',help = 'path to last weights',type = str) + parser.add_argument('-c','--checkpoints',help = 'path to save checkpoints',type = str ,default = './checkpoints/pp_yolo') + parser.add_argument('-l','--lr',help = 'learning rate for resuming',type= float , default= cfg.CONST_LR) + parser.add_argument('-n','--const_lr',help = 'set constant lr',type = bool,default = False) + args = parser.parse_args() + main() diff --git a/utils/config.py b/utils/config.py new file mode 100644 index 0000000..4370bdd --- /dev/null +++ b/utils/config.py @@ -0,0 +1,55 @@ +#! 
/usr/bin/env python +# coding=utf-8 +from easydict import EasyDict as edict + + +__C = edict() +# Consumers can get config by: from config import cfg + +cfg = __C + +# YOLO options +__C.YOLO = edict() + +__C.YOLO.CLASSES = "utils/obj.names" +__C.YOLO.NUM_CLASSES = 6 +__C.YOLO.ANCHORS = [12,16, 19,36, 40,28, 36,75, 76,55, 72,146, 142,110, 192,243, 459,401] +__C.YOLO.ANCHORS_V3 = [10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326] +__C.YOLO.ANCHORS_TINY = [23,27, 37,58, 81,82, 81,82, 135,169, 344,319] +__C.YOLO.STRIDES = [8, 16, 32] +__C.YOLO.STRIDES_TINY = [16, 32] +__C.YOLO.XYSCALE = [1.2, 1.1, 1.05] +__C.YOLO.XYSCALE_TINY = [1.05, 1.05] +__C.YOLO.ANCHOR_PER_SCALE = 3 +__C.YOLO.IOU_LOSS_THRESH = 0.5 + + +# Train options +__C.TRAIN = edict() + +__C.TRAIN.ANNOT_PATH = "utils/train.txt" +__C.TRAIN.BATCH_SIZE = 8 +# __C.TRAIN.INPUT_SIZE = [320, 352, 384, 416, 448, 480, 512, 544, 576, 608] +__C.TRAIN.INPUT_SIZE = 416 +__C.TRAIN.DATA_AUG = True +__C.TRAIN.LR_INIT = 1e-3 +__C.TRAIN.LR_END = 1e-6 +__C.CONST_LR = 0.00002 +__C.TRAIN.WARMUP_EPOCHS = 2 +__C.TRAIN.FISRT_STAGE_EPOCHS = 60 +__C.TRAIN.SECOND_STAGE_EPOCHS = 70 + + + +# TEST options +__C.TEST = edict() + +__C.TEST.ANNOT_PATH = "./data/dataset/val2017.txt" +__C.TEST.BATCH_SIZE = 2 +__C.TEST.INPUT_SIZE = 416 +__C.TEST.DATA_AUG = False +__C.TEST.DECTECTED_IMAGE_PATH = "./data/detection/" +__C.TEST.SCORE_THRESHOLD = 0.25 +__C.TEST.IOU_THRESHOLD = 0.5 + + diff --git a/utils/data_process.py b/utils/data_process.py new file mode 100644 index 0000000..6b64e88 --- /dev/null +++ b/utils/data_process.py @@ -0,0 +1,128 @@ +import tensorflow as tf +from utils import utils +import numpy as np + + +def decode_train(conv_output, output_size, NUM_CLASS, STRIDES, ANCHORS, i=0, XYSCALE=[1, 1, 1]): + conv_output = tf.reshape(conv_output, + (tf.shape(conv_output)[0], output_size, output_size, 3, 5 + NUM_CLASS)) + + conv_raw_dxdy, conv_raw_dwdh, conv_raw_conf, conv_raw_prob = tf.split(conv_output, (2, 2, 1, NUM_CLASS), + axis=-1) + + xy_grid = tf.meshgrid(tf.range(output_size), tf.range(output_size)) + xy_grid = tf.expand_dims(tf.stack(xy_grid, axis=-1), axis=2) # [gx, gy, 1, 2] + xy_grid = tf.tile(tf.expand_dims(xy_grid, axis=0), [tf.shape(conv_output)[0], 1, 1, 3, 1]) + + xy_grid = tf.cast(xy_grid, tf.float32) + + pred_xy = ((tf.sigmoid(conv_raw_dxdy) * XYSCALE[i]) - 0.5 * (XYSCALE[i] - 1) + xy_grid) * \ + STRIDES[i] + pred_wh = (tf.exp(conv_raw_dwdh) * ANCHORS[i]) + pred_xywh = tf.concat([pred_xy, pred_wh], axis=-1) + + pred_conf = tf.sigmoid(conv_raw_conf) + pred_prob = tf.sigmoid(conv_raw_prob) + + return tf.concat([pred_xywh, pred_conf, pred_prob], axis=-1) + +def compute_loss(pred, conv, label, bboxes, STRIDES, NUM_CLASS, IOU_LOSS_THRESH, i=0): + conv_shape = tf.shape(conv) + batch_size = conv_shape[0] + output_size = conv_shape[1] + input_size = STRIDES[i] * output_size + conv = tf.reshape(conv, (batch_size, output_size, output_size, 3, 5 + NUM_CLASS)) + + conv_raw_conf = conv[:, :, :, :, 4:5] + conv_raw_prob = conv[:, :, :, :, 5:] + + pred_xywh = pred[:, :, :, :, 0:4] + pred_conf = pred[:, :, :, :, 4:5] + + label_xywh = label[:, :, :, :, 0:4] + respond_bbox = label[:, :, :, :, 4:5] + label_prob = label[:, :, :, :, 5:] + + giou = tf.expand_dims(utils.bbox_giou(pred_xywh, label_xywh), axis=-1) + input_size = tf.cast(input_size, tf.float32) + + bbox_loss_scale = 2.0 - 1.0 * label_xywh[:, :, :, :, 2:3] * label_xywh[:, :, :, :, 3:4] / (input_size ** 2) + giou_loss = respond_bbox * bbox_loss_scale * (1- giou) + + iou = 
utils.bbox_iou(pred_xywh[:, :, :, :, np.newaxis, :], bboxes[:, np.newaxis, np.newaxis, np.newaxis, :, :]) + max_iou = tf.expand_dims(tf.reduce_max(iou, axis=-1), axis=-1) + + respond_bgd = (1.0 - respond_bbox) * tf.cast( max_iou < IOU_LOSS_THRESH, tf.float32 ) + + conf_focal = tf.pow(respond_bbox - pred_conf, 2) + + conf_loss = conf_focal * ( + respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=conv_raw_conf) + + + respond_bgd * tf.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=conv_raw_conf) + ) + + prob_loss = respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels=label_prob, logits=conv_raw_prob) + + giou_loss = tf.reduce_mean(tf.reduce_sum(giou_loss, axis=[1,2,3,4])) + conf_loss = tf.reduce_mean(tf.reduce_sum(conf_loss, axis=[1,2,3,4])) + prob_loss = tf.reduce_mean(tf.reduce_sum(prob_loss, axis=[1,2,3,4])) + + return giou_loss, conf_loss, prob_loss + +def filter_boxes(box_xywh, scores, score_threshold=0.4, input_shape = tf.constant([416,416])): + scores_max = tf.math.reduce_max(scores, axis=-1) + + mask = scores_max >= score_threshold + class_boxes = tf.boolean_mask(box_xywh, mask) + pred_conf = tf.boolean_mask(scores, mask) + class_boxes = tf.reshape(class_boxes, [tf.shape(scores)[0], -1, tf.shape(class_boxes)[-1]]) + pred_conf = tf.reshape(pred_conf, [tf.shape(scores)[0], -1, tf.shape(pred_conf)[-1]]) + + box_xy, box_wh = tf.split(class_boxes, (2, 2), axis=-1) + + input_shape = tf.cast(input_shape, dtype=tf.float32) + + box_yx = box_xy[..., ::-1] + box_hw = box_wh[..., ::-1] + + box_mins = (box_yx - (box_hw / 2.)) / input_shape + box_maxes = (box_yx + (box_hw / 2.)) / input_shape + boxes = tf.concat([ + box_mins[..., 0:1], # y_min + box_mins[..., 1:2], # x_min + box_maxes[..., 0:1], # y_max + box_maxes[..., 1:2] # x_max + ], axis=-1) + # return tf.concat([boxes, pred_conf], axis=-1) + return (boxes, pred_conf) + +def decode_tf(conv_output, output_size, NUM_CLASS, STRIDES, ANCHORS, i=0, XYSCALE=[1, 1, 1]): + batch_size = tf.shape(conv_output)[0] + conv_output = tf.reshape(conv_output, + (batch_size, output_size, output_size, 3, 5 + NUM_CLASS)) + + conv_raw_dxdy, conv_raw_dwdh, conv_raw_conf, conv_raw_prob = tf.split(conv_output, (2, 2, 1, NUM_CLASS), + axis=-1) + + xy_grid = tf.meshgrid(tf.range(output_size), tf.range(output_size)) + xy_grid = tf.expand_dims(tf.stack(xy_grid, axis=-1), axis=2) # [gx, gy, 1, 2] + xy_grid = tf.tile(tf.expand_dims(xy_grid, axis=0), [batch_size, 1, 1, 3, 1]) + + xy_grid = tf.cast(xy_grid, tf.float32) + + pred_xy = ((tf.sigmoid(conv_raw_dxdy) * XYSCALE[i]) - 0.5 * (XYSCALE[i] - 1) + xy_grid) * \ + STRIDES[i] + pred_wh = (tf.exp(conv_raw_dwdh) * ANCHORS[i]) + pred_xywh = tf.concat([pred_xy, pred_wh], axis=-1) + + pred_conf = tf.sigmoid(conv_raw_conf) + pred_prob = tf.sigmoid(conv_raw_prob) + + pred_prob = pred_conf * pred_prob + pred_prob = tf.reshape(pred_prob, (batch_size, -1, NUM_CLASS)) + pred_xywh = tf.reshape(pred_xywh, (batch_size, -1, 4)) + + return pred_xywh, pred_prob + # return tf.concat([pred_xywh, pred_conf, pred_prob], axis=-1) + diff --git a/utils/dataset.py b/utils/dataset.py new file mode 100644 index 0000000..3663577 --- /dev/null +++ b/utils/dataset.py @@ -0,0 +1,375 @@ +#! 
/usr/bin/env python +# coding=utf-8 + +import os +import cv2 +import random +import numpy as np +import tensorflow as tf +from utils import utils +from utils.config import cfg + + +class Dataset(object): + """implement Dataset here""" + + def __init__(self, is_training: bool, dataset_type: str = "yolo"): + + self.strides, self.anchors, NUM_CLASS, XYSCALE = utils.load_config() + self.dataset_type = dataset_type + + self.annot_path = ( + cfg.TRAIN.ANNOT_PATH if is_training else cfg.TEST.ANNOT_PATH) + + self.input_sizes = ( + cfg.TRAIN.INPUT_SIZE if is_training else cfg.TEST.INPUT_SIZE + ) + self.batch_size = ( + cfg.TRAIN.BATCH_SIZE if is_training else cfg.TEST.BATCH_SIZE + ) + self.data_aug = cfg.TRAIN.DATA_AUG if is_training else cfg.TEST.DATA_AUG + + self.train_input_sizes = cfg.TRAIN.INPUT_SIZE + self.classes = utils.read_class_names(cfg.YOLO.CLASSES) + self.num_classes = len(self.classes) + self.anchor_per_scale = cfg.YOLO.ANCHOR_PER_SCALE + self.max_bbox_per_scale = 150 + + self.annotations = self.load_annotations() + self.num_samples = len(self.annotations) + self.num_batchs = int(np.ceil(self.num_samples / self.batch_size)) + self.batch_count = 0 + + def load_annotations(self): + with open(self.annot_path, "r") as f: + txt = f.readlines() + if self.dataset_type == "yolo": + annotations = [] + for line in txt: + image_path = line.strip() + root, _ = os.path.splitext(image_path) + try : + with open(root + ".txt") as fd: + boxes = fd.readlines() + string = "" + for box in boxes: + box = box.strip() + box = box.split() + class_num = int(box[0]) + center_x = float(box[1]) + center_y = float(box[2]) + half_width = float(box[3]) / 2 + half_height = float(box[4]) / 2 + string += " {},{},{},{},{}".format( + center_x - half_width, + center_y - half_height, + center_x + half_width, + center_y + half_height, + class_num, + ) + annotations.append(image_path + string) + except: + print('missing file') + np.random.shuffle(annotations) + return annotations + + def __iter__(self): + return self + + def __next__(self): + with tf.device("/cpu:0"): + # self.train_input_size = random.choice(self.train_input_sizes) + self.train_input_size = cfg.TRAIN.INPUT_SIZE + self.train_output_sizes = self.train_input_size // self.strides + + batch_image = np.zeros( + ( + self.batch_size, + self.train_input_size, + self.train_input_size, + 3, + ), + dtype=np.float32, + ) + + batch_label_sbbox = np.zeros( + ( + self.batch_size, + self.train_output_sizes[0], + self.train_output_sizes[0], + self.anchor_per_scale, + 5 + self.num_classes, + ), + dtype=np.float32, + ) + batch_label_mbbox = np.zeros( + ( + self.batch_size, + self.train_output_sizes[1], + self.train_output_sizes[1], + self.anchor_per_scale, + 5 + self.num_classes, + ), + dtype=np.float32, + ) + batch_label_lbbox = np.zeros( + ( + self.batch_size, + self.train_output_sizes[2], + self.train_output_sizes[2], + self.anchor_per_scale, + 5 + self.num_classes, + ), + dtype=np.float32, + ) + + batch_sbboxes = np.zeros( + (self.batch_size, self.max_bbox_per_scale, 4), dtype=np.float32 + ) + batch_mbboxes = np.zeros( + (self.batch_size, self.max_bbox_per_scale, 4), dtype=np.float32 + ) + batch_lbboxes = np.zeros( + (self.batch_size, self.max_bbox_per_scale, 4), dtype=np.float32 + ) + + num = 0 + if self.batch_count < self.num_batchs: + while num < self.batch_size: + index = self.batch_count * self.batch_size + num + if index >= self.num_samples: + index -= self.num_samples + annotation = self.annotations[index] + image, bboxes = self.parse_annotation(annotation) + ( + 
label_sbbox, + label_mbbox, + label_lbbox, + sbboxes, + mbboxes, + lbboxes, + ) = self.preprocess_true_boxes(bboxes) + + batch_image[num, :, :, :] = image + batch_label_sbbox[num, :, :, :, :] = label_sbbox + batch_label_mbbox[num, :, :, :, :] = label_mbbox + batch_label_lbbox[num, :, :, :, :] = label_lbbox + batch_sbboxes[num, :, :] = sbboxes + batch_mbboxes[num, :, :] = mbboxes + batch_lbboxes[num, :, :] = lbboxes + num += 1 + self.batch_count += 1 + batch_smaller_target = batch_label_sbbox, batch_sbboxes + batch_medium_target = batch_label_mbbox, batch_mbboxes + batch_larger_target = batch_label_lbbox, batch_lbboxes + + return ( + batch_image, + ( + batch_smaller_target, + batch_medium_target, + batch_larger_target, + ), + ) + else: + self.batch_count = 0 + np.random.shuffle(self.annotations) + raise StopIteration + + def random_horizontal_flip(self, image, bboxes): + if random.random() < 0.5: + _, w, _ = image.shape + image = image[:, ::-1, :] + bboxes[:, [0, 2]] = w - bboxes[:, [2, 0]] + + return image, bboxes + + def random_crop(self, image, bboxes): + if random.random() < 0.5: + h, w, _ = image.shape + max_bbox = np.concatenate( + [ + np.min(bboxes[:, 0:2], axis=0), + np.max(bboxes[:, 2:4], axis=0), + ], + axis=-1, + ) + + max_l_trans = max_bbox[0] + max_u_trans = max_bbox[1] + max_r_trans = w - max_bbox[2] + max_d_trans = h - max_bbox[3] + + crop_xmin = max( + 0, int(max_bbox[0] - random.uniform(0, max_l_trans)) + ) + crop_ymin = max( + 0, int(max_bbox[1] - random.uniform(0, max_u_trans)) + ) + crop_xmax = max( + w, int(max_bbox[2] + random.uniform(0, max_r_trans)) + ) + crop_ymax = max( + h, int(max_bbox[3] + random.uniform(0, max_d_trans)) + ) + + image = image[crop_ymin:crop_ymax, crop_xmin:crop_xmax] + + bboxes[:, [0, 2]] = bboxes[:, [0, 2]] - crop_xmin + bboxes[:, [1, 3]] = bboxes[:, [1, 3]] - crop_ymin + + return image, bboxes + + def random_translate(self, image, bboxes): + if random.random() < 0.5: + h, w, _ = image.shape + max_bbox = np.concatenate( + [ + np.min(bboxes[:, 0:2], axis=0), + np.max(bboxes[:, 2:4], axis=0), + ], + axis=-1, + ) + + max_l_trans = max_bbox[0] + max_u_trans = max_bbox[1] + max_r_trans = w - max_bbox[2] + max_d_trans = h - max_bbox[3] + + tx = random.uniform(-(max_l_trans - 1), (max_r_trans - 1)) + ty = random.uniform(-(max_u_trans - 1), (max_d_trans - 1)) + + M = np.array([[1, 0, tx], [0, 1, ty]]) + image = cv2.warpAffine(image, M, (w, h)) + + bboxes[:, [0, 2]] = bboxes[:, [0, 2]] + tx + bboxes[:, [1, 3]] = bboxes[:, [1, 3]] + ty + + return image, bboxes + + def parse_annotation(self, annotation): + line = annotation.split() + image_path = line[0] + if not os.path.exists(image_path): + raise KeyError("%s does not exist ... 
" % image_path) + image = cv2.imread(image_path) + + if self.dataset_type == "yolo": + height, width, _ = image.shape + bboxes = np.array( + [list(map(float, box.split(","))) for box in line[1:]] + ) + bboxes = bboxes * np.array([width, height, width, height, 1]) + bboxes = bboxes.astype(np.int64) + + if self.data_aug: + image, bboxes = self.random_horizontal_flip( + np.copy(image), np.copy(bboxes) + ) + image, bboxes = self.random_crop(np.copy(image), np.copy(bboxes)) + image, bboxes = self.random_translate( + np.copy(image), np.copy(bboxes) + ) + + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + image, bboxes = utils.image_preprocess( + np.copy(image), + [self.train_input_size, self.train_input_size], + np.copy(bboxes), + ) + return image, bboxes + + + def preprocess_true_boxes(self, bboxes): + label = [ + np.zeros( + ( + self.train_output_sizes[i], + self.train_output_sizes[i], + self.anchor_per_scale, + 5 + self.num_classes, + ) + ) + for i in range(3) + ] + bboxes_xywh = [np.zeros((self.max_bbox_per_scale, 4)) for _ in range(3)] + bbox_count = np.zeros((3,)) + + for bbox in bboxes: + bbox_coor = bbox[:4] + bbox_class_ind = bbox[4] + + onehot = np.zeros(self.num_classes, dtype=np.float) + onehot[bbox_class_ind] = 1.0 + uniform_distribution = np.full( + self.num_classes, 1.0 / self.num_classes + ) + deta = 0.01 + smooth_onehot = onehot * (1 - deta) + deta * uniform_distribution + + bbox_xywh = np.concatenate( + [ + (bbox_coor[2:] + bbox_coor[:2]) * 0.5, + bbox_coor[2:] - bbox_coor[:2], + ], + axis=-1, + ) + bbox_xywh_scaled = ( + 1.0 * bbox_xywh[np.newaxis, :] / self.strides[:, np.newaxis] + ) + + iou = [] + exist_positive = False + for i in range(3): + anchors_xywh = np.zeros((self.anchor_per_scale, 4)) + anchors_xywh[:, 0:2] = ( + np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32) + 0.5 + ) + anchors_xywh[:, 2:4] = self.anchors[i] + + iou_scale = utils.bbox_iou( + bbox_xywh_scaled[i][np.newaxis, :], anchors_xywh + ) + iou.append(iou_scale) + iou_mask = iou_scale > 0.3 + + if np.any(iou_mask): + xind, yind = np.floor(bbox_xywh_scaled[i, 0:2]).astype( + np.int32 + ) + + label[i][yind, xind, iou_mask, :] = 0 + label[i][yind, xind, iou_mask, 0:4] = bbox_xywh + label[i][yind, xind, iou_mask, 4:5] = 1.0 + label[i][yind, xind, iou_mask, 5:] = smooth_onehot + + bbox_ind = int(bbox_count[i] % self.max_bbox_per_scale) + bboxes_xywh[i][bbox_ind, :4] = bbox_xywh + bbox_count[i] += 1 + + exist_positive = True + + if not exist_positive: + best_anchor_ind = np.argmax(np.array(iou).reshape(-1), axis=-1) + best_detect = int(best_anchor_ind / self.anchor_per_scale) + best_anchor = int(best_anchor_ind % self.anchor_per_scale) + xind, yind = np.floor( + bbox_xywh_scaled[best_detect, 0:2] + ).astype(np.int32) + + label[best_detect][yind, xind, best_anchor, :] = 0 + label[best_detect][yind, xind, best_anchor, 0:4] = bbox_xywh + label[best_detect][yind, xind, best_anchor, 4:5] = 1.0 + label[best_detect][yind, xind, best_anchor, 5:] = smooth_onehot + + bbox_ind = int( + bbox_count[best_detect] % self.max_bbox_per_scale + ) + bboxes_xywh[best_detect][bbox_ind, :4] = bbox_xywh + bbox_count[best_detect] += 1 + label_sbbox, label_mbbox, label_lbbox = label + sbboxes, mbboxes, lbboxes = bboxes_xywh + return label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes + + def __len__(self): + return self.num_batchs diff --git a/utils/obj.names b/utils/obj.names new file mode 100644 index 0000000..3f2ed5b --- /dev/null +++ b/utils/obj.names @@ -0,0 +1,6 @@ +aeroplane +car +chair +cow +person +traffic_light \ 
No newline at end of file diff --git a/utils/train.txt b/utils/train.txt new file mode 100644 index 0000000..20b7158 --- /dev/null +++ b/utils/train.txt @@ -0,0 +1,689 @@ +Dataset_2/000000301376.jpg +Dataset_2/000000514376.jpg +Dataset_2/000000475484.jpg +Dataset_2/000000442746.jpg +Dataset_2/000000577932.jpg +Dataset_2/000000553339.jpg +Dataset_2/000000307598.jpg +Dataset_2/000000575372.jpg +Dataset_2/000000545100.jpg +Dataset_2/000000544605.jpg +Dataset_2/000000319369.jpg +Dataset_2/000000423798.jpg +Dataset_2/000000253835.jpg +Dataset_2/000000307074.jpg +Dataset_2/000000245102.jpg +Dataset_2/000000171382.jpg +Dataset_2/000000460147.jpg +Dataset_2/000000109441.jpg +Dataset_2/000000418696.jpg +Dataset_2/000000269682.jpg +Dataset_2/000000213255.jpg +Dataset_2/000000274272.jpg +Dataset_2/000000538458.jpg +Dataset_2/000000479030.jpg +Dataset_2/000000021839.jpg +Dataset_2/000000155451.jpg +Dataset_2/000000351559.jpg +Dataset_2/000000315187.jpg +Dataset_2/000000088432.jpg +Dataset_2/000000099114.jpg +Dataset_2/000000520009.jpg +Dataset_2/000000122166.jpg +Dataset_2/000000198928.jpg +Dataset_2/000000155443.jpg +Dataset_2/000000356169.jpg +Dataset_2/000000178982.jpg +Dataset_2/000000423229.jpg +Dataset_2/000000440617.jpg +Dataset_2/000000216419.jpg +Dataset_2/000000057149.jpg +Dataset_2/000000254814.jpg +Dataset_2/000000175438.jpg +Dataset_2/000000144706.jpg +Dataset_2/000000501023.jpg +Dataset_2/000000295809.jpg +Dataset_2/000000350023.jpg +Dataset_2/000000028993.jpg +Dataset_2/000000380706.jpg +Dataset_2/000000260266.jpg +Dataset_2/000000412894.jpg +Dataset_2/000000385190.jpg +Dataset_2/000000575243.jpg +Dataset_2/000000115946.jpg +Dataset_2/000000252219.jpg +Dataset_2/000000482585.jpg +Dataset_2/000000430871.jpg +Dataset_2/000000032941.jpg +Dataset_2/000000453841.jpg +Dataset_2/000000061268.jpg +Dataset_2/000000292997.jpg +Dataset_2/000000110282.jpg +Dataset_2/000000076547.jpg +Dataset_2/000000441553.jpg +Dataset_2/000000565012.jpg +Dataset_2/000000169169.jpg +Dataset_2/000000430875.jpg +Dataset_2/000000275727.jpg +Dataset_2/000000431876.jpg +Dataset_2/000000320232.jpg +Dataset_2/000000309391.jpg +Dataset_2/000000319607.jpg +Dataset_2/000000142585.jpg +Dataset_2/000000438017.jpg +Dataset_2/000000284445.jpg +Dataset_2/000000338624.jpg +Dataset_2/000000039484.jpg +Dataset_2/000000133819.jpg +Dataset_2/000000242411.jpg +Dataset_2/000000157365.jpg +Dataset_2/000000491213.jpg +Dataset_2/000000534639.jpg +Dataset_2/000000287291.jpg +Dataset_2/000000572555.jpg +Dataset_2/000000496854.jpg +Dataset_2/000000569030.jpg +Dataset_2/000000244379.jpg +Dataset_2/000000165039.jpg +Dataset_2/000000365745.jpg +Dataset_2/000000071877.jpg +Dataset_2/000000480944.jpg +Dataset_2/000000323751.jpg +Dataset_2/000000193717.jpg +Dataset_2/000000531134.jpg +Dataset_2/000000054967.jpg +Dataset_2/000000306893.jpg +Dataset_2/000000215723.jpg +Dataset_2/000000119516.jpg +Dataset_2/000000280710.jpg +Dataset_2/000000449198.jpg +Dataset_2/000000157756.jpg +Dataset_2/000000492077.jpg +Dataset_2/000000361103.jpg +Dataset_2/000000526392.jpg +Dataset_2/000000204871.jpg +Dataset_2/000000315450.jpg +Dataset_2/000000125572.jpg +Dataset_2/000000221754.jpg +Dataset_2/000000542856.jpg +Dataset_2/000000226417.jpg +Dataset_2/000000500826.jpg +Dataset_2/000000512564.jpg +Dataset_2/000000166509.jpg +Dataset_2/000000454661.jpg +Dataset_2/000000338560.jpg +Dataset_2/000000532530.jpg +Dataset_2/000000553511.jpg +Dataset_2/000000410712.jpg +Dataset_2/000000076417.jpg +Dataset_2/000000088218.jpg +Dataset_2/000000170545.jpg 
+Dataset_2/000000433774.jpg +Dataset_2/000000555050.jpg +Dataset_2/000000269942.jpg +Dataset_2/000000505942.jpg +Dataset_2/000000169996.jpg +Dataset_2/000000094751.jpg +Dataset_2/000000467511.jpg +Dataset_2/000000496722.jpg +Dataset_2/000000571943.jpg +Dataset_2/000000377946.jpg +Dataset_2/000000228942.jpg +Dataset_2/000000213593.jpg +Dataset_2/000000311883.jpg +Dataset_2/000000334371.jpg +Dataset_2/000000458325.jpg +Dataset_2/000000281179.jpg +Dataset_2/000000162858.jpg +Dataset_2/000000212573.jpg +Dataset_2/000000069138.jpg +Dataset_2/000000230450.jpg +Dataset_2/000000026204.jpg +Dataset_2/000000503841.jpg +Dataset_2/000000184400.jpg +Dataset_2/000000008762.jpg +Dataset_2/000000373353.jpg +Dataset_2/000000516708.jpg +Dataset_2/000000393226.jpg +Dataset_2/000000565778.jpg +Dataset_2/000000148999.jpg +Dataset_2/000000336232.jpg +Dataset_2/000000467315.jpg +Dataset_2/000000156071.jpg +Dataset_2/000000483667.jpg +Dataset_2/000000278848.jpg +Dataset_2/000000284762.jpg +Dataset_2/000000491867.jpg +Dataset_2/000000074058.jpg +Dataset_2/000000352684.jpg +Dataset_2/000000254368.jpg +Dataset_2/000000436738.jpg +Dataset_2/000000565563.jpg +Dataset_2/000000057760.jpg +Dataset_2/000000057672.jpg +Dataset_2/000000180487.jpg +Dataset_2/000000532855.jpg +Dataset_2/000000459153.jpg +Dataset_2/000000123213.jpg +Dataset_2/000000393569.jpg +Dataset_2/000000336265.jpg +Dataset_2/000000434459.jpg +Dataset_2/000000377239.jpg +Dataset_2/000000303499.jpg +Dataset_2/000000131444.jpg +Dataset_2/000000508312.jpg +Dataset_2/000000057597.jpg +Dataset_2/000000262487.jpg +Dataset_2/000000369037.jpg +Dataset_2/000000295316.jpg +Dataset_2/000000074092.jpg +Dataset_2/000000434548.jpg +Dataset_2/000000426376.jpg +Dataset_2/000000426268.jpg +Dataset_2/000000229659.jpg +Dataset_2/000000549220.jpg +Dataset_2/000000541055.jpg +Dataset_2/000000180560.jpg +Dataset_2/000000098716.jpg +Dataset_2/000000147725.jpg +Dataset_2/000000278749.jpg +Dataset_2/000000033104.jpg +Dataset_2/000000147740.jpg +Dataset_2/000000540932.jpg +Dataset_2/000000393469.jpg +Dataset_2/000000442480.jpg +Dataset_2/000000368961.jpg +Dataset_2/000000032901.jpg +Dataset_2/000000467176.jpg +Dataset_2/000000426203.jpg +Dataset_2/000000319721.jpg +Dataset_2/000000295138.jpg +Dataset_2/000000106757.jpg +Dataset_2/000000180383.jpg +Dataset_2/000000377113.jpg +Dataset_2/000000270677.jpg +Dataset_2/000000418062.jpg +Dataset_2/000000336053.jpg +Dataset_2/000000278705.jpg +Dataset_2/000000352491.jpg +Dataset_2/000000147729.jpg +Dataset_2/000000336209.jpg +Dataset_2/000000303305.jpg +Dataset_2/000000205105.jpg +Dataset_2/000000008532.jpg +Dataset_2/000000352582.jpg +Dataset_2/000000188592.jpg +Dataset_2/000000475387.jpg +Dataset_2/000000229601.jpg +Dataset_2/000000311392.jpg +Dataset_2/000000090284.jpg +Dataset_2/000000033005.jpg +Dataset_2/000000016598.jpg +Dataset_2/000000098392.jpg +Dataset_2/000000016451.jpg +Dataset_2/000000516318.jpg +Dataset_2/000000073946.jpg +Dataset_2/000000114884.jpg +Dataset_2/000000229553.jpg +Dataset_2/000000065736.jpg +Dataset_2/000000532690.jpg +Dataset_2/000000524456.jpg +Dataset_2/000000549055.jpg +Dataset_2/000000188465.jpg +Dataset_2/000000442456.jpg +Dataset_2/000000426241.jpg +Dataset_2/000000458992.jpg +Dataset_2/000000213171.jpg +Dataset_2/000000417911.jpg +Dataset_2/000000507975.jpg +Dataset_2/000000450686.jpg +Dataset_2/000000254016.jpg +Dataset_2/000000368752.jpg +Dataset_2/000000122962.jpg +Dataset_2/000000082085.jpg +Dataset_2/000000311394.jpg +Dataset_2/000000213086.jpg +Dataset_2/000000434247.jpg 
+Dataset_2/000000049259.jpg +Dataset_2/000000221291.jpg +Dataset_2/000000516173.jpg +Dataset_2/000000008211.jpg +Dataset_2/000000532481.jpg +Dataset_2/000000270474.jpg +Dataset_2/000000131138.jpg +Dataset_2/000000032817.jpg +Dataset_2/000000434230.jpg +Dataset_2/000000311303.jpg +Dataset_2/000000376900.jpg +Dataset_2/000000032861.jpg +Dataset_2/000000032887.jpg +Dataset_2/000000106912.jpg +Dataset_2/000000081988.jpg +Dataset_2/000000114907.jpg +Dataset_2/000000000139.jpg +Dataset_2/000000065798.jpg +Dataset_2/000000475191.jpg +Dataset_2/000000385029.jpg +Dataset_2/000000466986.jpg +Dataset_2/000000327701.jpg +Dataset_2/000000401446.jpg +Dataset_2/000000188439.jpg +Dataset_2/000000516143.jpg +Dataset_2/000000319534.jpg +Dataset_2/000000213033.jpg +Dataset_2/000000434204.jpg +Dataset_2/000000360661.jpg +Dataset_2/000000344268.jpg +Dataset_2/000000508101.jpg +Dataset_2/000000213035.jpg +Dataset_2/000000442463.jpg +Dataset_2/000000532493.jpg +Dataset_2/000000418281.jpg +Dataset_2/000000572408.jpg +Dataset_2/000000458755.jpg +Dataset_2/000000430073.jpg +Dataset_2/000000416758.jpg +Dataset_2/000000581100.jpg +Dataset_2/000000559099.jpg +Dataset_2/000000200667.jpg +Dataset_2/000000221155.jpg +Dataset_2/000000087470.jpg +Dataset_2/000000023937.jpg +Dataset_2/000000090062.jpg +Dataset_2/000000492992.jpg +Dataset_2/000000206135.jpg +Dataset_2/000000129416.jpg +Dataset_2/000000399764.jpg +Dataset_2/000000125806.jpg +Dataset_2/000000137576.jpg +Dataset_2/000000229858.jpg +Dataset_2/000000306582.jpg +Dataset_2/000000258911.jpg +Dataset_2/000000526706.jpg +Dataset_2/000000229221.jpg +Dataset_2/000000166287.jpg +Dataset_2/000000500663.jpg +Dataset_2/000000235857.jpg +Dataset_2/000000356612.jpg +Dataset_2/000000396205.jpg +Dataset_2/000000060823.jpg +Dataset_2/000000417085.jpg +Dataset_2/000000580418.jpg +Dataset_2/000000272212.jpg +Dataset_2/000000224093.jpg +Dataset_2/000000415990.jpg +Dataset_2/000000267434.jpg +Dataset_2/000000244411.jpg +Dataset_2/000000119038.jpg +Dataset_2/000000545958.jpg +Dataset_2/000000041635.jpg +Dataset_2/000000118594.jpg +Dataset_2/000000219440.jpg +Dataset_2/000000140583.jpg +Dataset_2/000000389451.jpg +Dataset_2/000000361268.jpg +Dataset_2/000000334555.jpg +Dataset_2/000000329447.jpg +Dataset_2/000000124636.jpg +Dataset_2/000000298697.jpg +Dataset_2/000000467776.jpg +Dataset_2/000000061171.jpg +Dataset_2/000000052565.jpg +Dataset_2/000000047010.jpg +Dataset_2/000000223188.jpg +Dataset_2/000000412887.jpg +Dataset_2/000000152740.jpg +Dataset_2/000000302165.jpg +Dataset_2/000000347664.jpg +Dataset_2/000000459396.jpg +Dataset_2/000000014888.jpg +Dataset_2/000000535094.jpg +Dataset_2/000000125072.jpg +Dataset_2/000000016010.jpg +Dataset_2/000000233567.jpg +Dataset_2/000000518213.jpg +Dataset_2/000000133778.jpg +Dataset_2/000000455219.jpg +Dataset_2/000000240754.jpg +Dataset_2/000000193162.jpg +Dataset_2/000000276024.jpg +Dataset_2/000000314034.jpg +Dataset_2/000000512648.jpg +Dataset_2/000000222863.jpg +Dataset_2/000000231580.jpg +Dataset_2/000000416837.jpg +Dataset_2/000000377486.jpg +Dataset_2/000000573626.jpg +Dataset_2/000000105264.jpg +Dataset_2/000000387148.jpg +Dataset_2/000000357081.jpg +Dataset_2/000000435206.jpg +Dataset_2/000000247838.jpg +Dataset_2/000000571857.jpg +Dataset_2/000000402992.jpg +Dataset_2/000000289393.jpg +Dataset_2/000000332318.jpg +Dataset_2/000000066038.jpg +Dataset_2/000000438774.jpg +Dataset_2/000000084477.jpg +Dataset_2/000000248314.jpg +Dataset_2/000000500211.jpg +Dataset_2/000000080340.jpg +Dataset_2/000000367082.jpg 
+Dataset_2/000000129492.jpg +Dataset_2/000000520659.jpg +Dataset_2/000000436617.jpg +Dataset_2/000000395701.jpg +Dataset_2/000000383386.jpg +Dataset_2/000000170474.jpg +Dataset_2/000000541123.jpg +Dataset_2/000000575970.jpg +Dataset_2/000000559543.jpg +Dataset_2/000000213445.jpg +Dataset_2/000000139684.jpg +Dataset_2/000000319935.jpg +Dataset_2/000000039405.jpg +Dataset_2/000000405970.jpg +Dataset_2/000000074209.jpg +Dataset_2/000000485844.jpg +Dataset_2/000000160012.jpg +Dataset_2/000000379332.jpg +Dataset_2/000000532901.jpg +Dataset_2/000000301421.jpg +Dataset_2/000000336356.jpg +Dataset_2/000000231879.jpg +Dataset_2/000000326082.jpg +Dataset_2/000000192670.jpg +Dataset_2/000000489764.jpg +Dataset_2/000000571718.jpg +Dataset_2/000000434479.jpg +Dataset_2/000000004495.jpg +Dataset_2/000000190753.jpg +Dataset_2/000000162092.jpg +Dataset_2/000000493905.jpg +Dataset_2/000000536947.jpg +Dataset_2/000000237984.jpg +Dataset_2/000000416104.jpg +Dataset_2/000000078170.jpg +Dataset_2/000000338219.jpg +Dataset_2/000000235836.jpg +Dataset_2/000000579970.jpg +Dataset_2/000000127270.jpg +Dataset_2/000000366884.jpg +Dataset_2/000000084362.jpg +Dataset_2/000000520531.jpg +Dataset_2/000000240023.jpg +Dataset_2/000000457078.jpg +Dataset_2/000000325838.jpg +Dataset_2/000000504074.jpg +Dataset_2/000000082180.jpg +Dataset_2/000000166166.jpg +Dataset_2/000000194832.jpg +Dataset_2/000000543047.jpg +Dataset_2/000000151820.jpg +Dataset_2/000000540962.jpg +Dataset_2/000000368900.jpg +Dataset_2/000000340175.jpg +Dataset_2/000000540928.jpg +Dataset_2/000000264441.jpg +Dataset_2/000000356424.jpg +Dataset_2/000000274687.jpg +Dataset_2/000000342128.jpg +Dataset_2/000000239857.jpg +Dataset_2/000000231508.jpg +Dataset_2/000000532761.jpg +Dataset_2/000000059598.jpg +Dataset_2/000000127394.jpg +Dataset_2/000000190648.jpg +Dataset_2/000000420069.jpg +Dataset_2/000000229849.jpg +Dataset_2/000000213224.jpg +Dataset_2/000000213422.jpg +Dataset_2/000000231831.jpg +Dataset_2/000000022892.jpg +Dataset_2/000000461009.jpg +Dataset_2/000000194875.jpg +Dataset_2/000000014439.jpg +Dataset_2/000000061584.jpg +Dataset_2/000000367095.jpg +Dataset_2/000000563470.jpg +Dataset_2/000000067616.jpg +Dataset_2/000000473219.jpg +Dataset_2/000000313454.jpg +Dataset_2/000000419974.jpg +Dataset_2/000000297084.jpg +Dataset_2/000000411754.jpg +Dataset_2/000000125129.jpg +Dataset_2/000000481390.jpg +Dataset_2/000000196754.jpg +Dataset_2/000000163951.jpg +Dataset_2/000000020553.jpg +Dataset_2/000000079969.jpg +Dataset_2/000000546964.jpg +Dataset_2/000000174231.jpg +Dataset_2/000000100510.jpg +Dataset_2/000000133244.jpg +Dataset_2/000000217285.jpg +Dataset_2/000000428280.jpg +Dataset_2/000000440475.jpg +Dataset_2/000000374982.jpg +Dataset_2/000000473237.jpg +Dataset_2/000000194724.jpg +Dataset_2/000000395343.jpg +Dataset_2/000000180296.jpg +Dataset_2/000000277020.jpg +Dataset_2/000000462904.jpg +Dataset_2/000000530466.jpg +Dataset_2/000000328238.jpg +Dataset_2/000000277051.jpg +Dataset_2/000000024610.jpg +Dataset_2/000000530975.jpg +Dataset_2/000000446522.jpg +Dataset_2/000000368684.jpg +Dataset_2/000000004134.jpg +Dataset_2/000000115245.jpg +Dataset_2/000000407614.jpg +Dataset_2/000000221708.jpg +Dataset_2/000000246308.jpg +Dataset_2/000000098853.jpg +Dataset_2/000000397354.jpg +Dataset_2/000000215072.jpg +Dataset_2/000000159791.jpg +Dataset_2/000000025096.jpg +Dataset_2/000000166426.jpg +Dataset_2/000000344621.jpg +Dataset_2/000000051738.jpg +Dataset_2/000000031248.jpg +Dataset_2/000000241668.jpg +Dataset_2/000000034873.jpg 
+Dataset_2/000000047112.jpg +Dataset_2/000000303653.jpg +Dataset_2/000000172595.jpg +Dataset_2/000000295420.jpg +Dataset_2/000000268378.jpg +Dataset_2/000000166391.jpg +Dataset_2/000000068093.jpg +Dataset_2/000000137727.jpg +Dataset_2/000000018837.jpg +Dataset_2/000000086483.jpg +Dataset_2/000000563653.jpg +Dataset_2/000000303566.jpg +Dataset_2/000000102805.jpg +Dataset_2/000000389566.jpg +Dataset_2/000000354753.jpg +Dataset_2/000000334309.jpg +Dataset_2/000000151962.jpg +Dataset_2/000000111086.jpg +Dataset_2/000000111036.jpg +Dataset_2/000000088462.jpg +Dataset_2/000000414133.jpg +Dataset_2/000000176606.jpg +Dataset_2/000000283037.jpg +Dataset_2/000000033221.jpg +Dataset_2/000000283038.jpg +Dataset_2/000000428454.jpg +Dataset_2/000000135604.jpg +Dataset_2/000000577976.jpg +Dataset_2/000000174482.jpg +Dataset_2/000000121242.jpg +Dataset_2/000000200961.jpg +Dataset_2/000000526728.jpg +Dataset_2/000000033109.jpg +Dataset_2/000000463037.jpg +Dataset_2/000000313588.jpg +Dataset_2/000000276720.jpg +Dataset_2/000000141597.jpg +Dataset_2/000000100624.jpg +Dataset_2/000000411938.jpg +Dataset_2/000000260470.jpg +Dataset_2/000000135410.jpg +Dataset_2/000000168330.jpg +Dataset_2/000000172330.jpg +Dataset_2/000000127263.jpg +Dataset_2/000000567640.jpg +Dataset_2/000000543043.jpg +Dataset_2/000000184611.jpg +Dataset_2/000000225532.jpg +Dataset_2/000000209222.jpg +Dataset_2/000000323828.jpg +Dataset_2/000000579902.jpg +Dataset_2/000000383289.jpg +Dataset_2/000000026926.jpg +Dataset_2/000000426372.jpg +Dataset_2/000000031118.jpg +Dataset_2/000000293071.jpg +Dataset_2/000000086220.jpg +Dataset_2/000000198805.jpg +Dataset_2/000000192716.jpg +Dataset_2/000000030828.jpg +Dataset_2/000000301135.jpg +Dataset_2/000000157928.jpg +Dataset_2/000000276707.jpg +Dataset_2/000000196759.jpg +Dataset_2/000000565391.jpg +Dataset_2/000000243867.jpg +Dataset_2/000000127092.jpg +Dataset_2/000000346232.jpg +Dataset_2/000000327890.jpg +Dataset_2/000000334006.jpg +Dataset_2/000000114770.jpg +Dataset_2/000000391290.jpg +Dataset_2/000000022755.jpg +Dataset_2/000000227511.jpg +Dataset_2/000000284725.jpg +Dataset_2/000000424162.jpg +Dataset_2/000000084170.jpg +Dataset_2/000000010363.jpg +Dataset_2/000000446651.jpg +Dataset_2/000000477227.jpg +Dataset_2/000000395801.jpg +Dataset_2/000000499775.jpg +Dataset_2/000000305695.jpg +Dataset_2/000000514586.jpg +Dataset_2/000000499768.jpg +Dataset_2/000000014380.jpg +Dataset_2/000000102411.jpg +Dataset_2/000000546823.jpg +Dataset_2/000000408120.jpg +Dataset_2/000000045596.jpg +Dataset_2/000000084492.jpg +Dataset_2/000000191013.jpg +Dataset_2/000000397351.jpg +Dataset_2/000000184324.jpg +Dataset_2/000000561679.jpg +Dataset_2/000000530470.jpg +Dataset_2/000000354829.jpg +Dataset_2/000000194716.jpg +Dataset_2/000000277005.jpg +Dataset_2/000000495054.jpg +Dataset_2/000000490413.jpg +Dataset_2/000000485802.jpg +Dataset_2/000000404479.jpg +Dataset_2/000000336309.jpg +Dataset_2/000000217060.jpg +Dataset_2/000000449996.jpg +Dataset_2/000000135673.jpg +Dataset_2/000000189828.jpg +Dataset_2/000000293324.jpg +Dataset_2/000000517523.jpg +Dataset_2/000000187745.jpg +Dataset_2/000000005477.jpg +Dataset_2/000000497568.jpg +Dataset_2/000000488270.jpg +Dataset_2/000000338325.jpg +Dataset_2/000000163746.jpg +Dataset_2/000000131386.jpg +Dataset_2/000000229747.jpg +Dataset_2/000000502599.jpg +Dataset_2/000000199977.jpg +Dataset_2/000000033114.jpg +Dataset_2/000000272136.jpg +Dataset_2/000000099054.jpg +Dataset_2/000000485237.jpg +Dataset_2/000000543528.jpg +Dataset_2/000000208208.jpg 
+Dataset_2/000000101787.jpg +Dataset_2/000000493286.jpg +Dataset_2/000000110359.jpg +Dataset_2/000000134886.jpg +Dataset_2/000000500049.jpg +Dataset_2/000000424776.jpg +Dataset_2/000000052017.jpg +Dataset_2/000000022396.jpg +Dataset_2/000000425221.jpg +Dataset_2/000000392481.jpg +Dataset_2/000000109900.jpg +Dataset_2/000000384350.jpg +Dataset_2/000000144114.jpg +Dataset_2/000000477441.jpg +Dataset_2/000000567432.jpg +Dataset_2/000000096549.jpg +Dataset_2/000000272049.jpg +Dataset_2/000000456865.jpg +Dataset_2/000000161044.jpg +Dataset_2/000000214205.jpg +Dataset_2/000000348881.jpg +Dataset_2/000000388258.jpg +Dataset_2/000000575205.jpg +Dataset_2/000000183500.jpg +Dataset_2/000000504000.jpg +Dataset_2/000000404128.jpg +Dataset_2/000000479912.jpg +Dataset_2/000000001761.jpg +Dataset_2/000000553094.jpg +Dataset_2/000000478862.jpg +Dataset_2/000000137950.jpg +Dataset_2/000000459467.jpg +Dataset_2/000000381639.jpg +Dataset_2/000000084752.jpg +Dataset_2/000000098520.jpg +Dataset_2/000000052412.jpg +Dataset_2/000000190676.jpg +Dataset_2/000000469174.jpg +Dataset_2/000000110721.jpg +Dataset_2/000000281693.jpg +Dataset_2/000000139871.jpg +Dataset_2/000000400922.jpg +Dataset_2/000000044652.jpg +Dataset_2/000000502347.jpg +Dataset_2/000000383621.jpg +Dataset_2/000000520324.jpg +Dataset_2/000000396903.jpg +Dataset_2/000000524850.jpg +Dataset_2/000000410221.jpg +Dataset_2/000000513580.jpg +Dataset_2/000000196185.jpg +Dataset_2/000000071711.jpg +Dataset_2/000000579158.jpg +Dataset_2/000000545407.jpg +Dataset_2/000000205401.jpg +Dataset_2/000000323709.jpg +Dataset_2/000000300659.jpg +Dataset_2/000000167540.jpg +Dataset_2/000000379453.jpg +Dataset_2/000000013348.jpg +Dataset_2/000000208901.jpg +Dataset_2/000000525322.jpg +Dataset_2/000000090631.jpg +Dataset_2/000000408112.jpg +Dataset_2/000000452122.jpg \ No newline at end of file diff --git a/utils/utils.py b/utils/utils.py new file mode 100644 index 0000000..6a1e93a --- /dev/null +++ b/utils/utils.py @@ -0,0 +1,358 @@ +import cv2 +import random +import colorsys +import numpy as np +import tensorflow as tf +from utils.config import cfg + +def load_freeze_layer(model='yolov4', tiny=False): + + freeze_layouts = ['conv3_block4_out', 'conv4_block6_out', 'conv5_block3_out'] + return freeze_layouts + + +def read_class_names(class_file_name): + names = {} + with open(class_file_name, 'r') as data: + for ID, name in enumerate(data): + names[ID] = name.strip('\n') + return names + +def load_config(): + + + STRIDES = np.array(cfg.YOLO.STRIDES) + + + ANCHORS = get_anchors(cfg.YOLO.ANCHORS_V3) + XYSCALE = [1, 1, 1] + NUM_CLASS = len(read_class_names(cfg.YOLO.CLASSES)) + + return STRIDES, ANCHORS, NUM_CLASS, XYSCALE + +def get_anchors(anchors_path): + anchors = np.array(anchors_path) + return anchors.reshape(3, 3, 2) + +def image_preprocess(image, target_size, gt_boxes=None): + + ih, iw = target_size + h, w, _ = image.shape + + scale = min(iw/w, ih/h) + nw, nh = int(scale * w), int(scale * h) + image_resized = cv2.resize(image, (nw, nh)) + + image_paded = np.full(shape=[ih, iw, 3], fill_value=128.0) + dw, dh = (iw - nw) // 2, (ih-nh) // 2 + image_paded[dh:nh+dh, dw:nw+dw, :] = image_resized + image_paded = image_paded / 255. 
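+    # Worked example (editorial note): a 640x480 frame at target size 416x416
+    # gives scale = min(416/640, 416/480) = 0.65, so the image is resized to
+    # 416x312 and centered with dh = (416 - 312) // 2 = 52 rows of gray
+    # (value 128) padding above and below, before the / 255. normalization.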
+
+    if gt_boxes is None:
+        return image_paded
+
+    else:
+        gt_boxes[:, [0, 2]] = gt_boxes[:, [0, 2]] * scale + dw
+        gt_boxes[:, [1, 3]] = gt_boxes[:, [1, 3]] * scale + dh
+        return image_paded, gt_boxes
+
+def bbox_iou(bboxes1, bboxes2):
+    """
+    @param bboxes1: (a, b, ..., 4)
+    @param bboxes2: (A, B, ..., 4)
+    x:X is 1:n or n:n or n:1
+    @return (max(a,A), max(b,B), ...)
+    ex) (4,):(3,4) -> (3,)
+        (2,1,4):(2,3,4) -> (2,3)
+    """
+    bboxes1_area = bboxes1[..., 2] * bboxes1[..., 3]
+    bboxes2_area = bboxes2[..., 2] * bboxes2[..., 3]
+
+    bboxes1_coor = tf.concat(
+        [
+            bboxes1[..., :2] - bboxes1[..., 2:] * 0.5,
+            bboxes1[..., :2] + bboxes1[..., 2:] * 0.5,
+        ],
+        axis=-1,
+    )
+    bboxes2_coor = tf.concat(
+        [
+            bboxes2[..., :2] - bboxes2[..., 2:] * 0.5,
+            bboxes2[..., :2] + bboxes2[..., 2:] * 0.5,
+        ],
+        axis=-1,
+    )
+
+    left_up = tf.maximum(bboxes1_coor[..., :2], bboxes2_coor[..., :2])
+    right_down = tf.minimum(bboxes1_coor[..., 2:], bboxes2_coor[..., 2:])
+
+    inter_section = tf.maximum(right_down - left_up, 0.0)
+    inter_area = inter_section[..., 0] * inter_section[..., 1]
+
+    union_area = bboxes1_area + bboxes2_area - inter_area
+
+    iou = tf.math.divide_no_nan(inter_area, union_area)
+
+    return iou
+
+
+def bbox_giou(bboxes1, bboxes2):
+    """
+    Generalized IoU
+    @param bboxes1: (a, b, ..., 4)
+    @param bboxes2: (A, B, ..., 4)
+    x:X is 1:n or n:n or n:1
+    @return (max(a,A), max(b,B), ...)
+    ex) (4,):(3,4) -> (3,)
+        (2,1,4):(2,3,4) -> (2,3)
+    """
+    bboxes1_area = bboxes1[..., 2] * bboxes1[..., 3]
+    bboxes2_area = bboxes2[..., 2] * bboxes2[..., 3]
+
+    bboxes1_coor = tf.concat(
+        [
+            bboxes1[..., :2] - bboxes1[..., 2:] * 0.5,
+            bboxes1[..., :2] + bboxes1[..., 2:] * 0.5,
+        ],
+        axis=-1,
+    )
+    bboxes2_coor = tf.concat(
+        [
+            bboxes2[..., :2] - bboxes2[..., 2:] * 0.5,
+            bboxes2[..., :2] + bboxes2[..., 2:] * 0.5,
+        ],
+        axis=-1,
+    )
+
+    left_up = tf.maximum(bboxes1_coor[..., :2], bboxes2_coor[..., :2])
+    right_down = tf.minimum(bboxes1_coor[..., 2:], bboxes2_coor[..., 2:])
+
+    inter_section = tf.maximum(right_down - left_up, 0.0)
+    inter_area = inter_section[..., 0] * inter_section[..., 1]
+
+    union_area = bboxes1_area + bboxes2_area - inter_area
+
+    iou = tf.math.divide_no_nan(inter_area, union_area)
+
+    enclose_left_up = tf.minimum(bboxes1_coor[..., :2], bboxes2_coor[..., :2])
+    enclose_right_down = tf.maximum(
+        bboxes1_coor[..., 2:], bboxes2_coor[..., 2:]
+    )
+
+    enclose_section = enclose_right_down - enclose_left_up
+    enclose_area = enclose_section[..., 0] * enclose_section[..., 1]
+
+    giou = iou - tf.math.divide_no_nan(enclose_area - union_area, enclose_area)
+
+    return giou
+
+
+def bbox_ciou(bboxes1, bboxes2):
+    """
+    Complete IoU
+    @param bboxes1: (a, b, ..., 4)
+    @param bboxes2: (A, B, ..., 4)
+    x:X is 1:n or n:n or n:1
+    @return (max(a,A), max(b,B), ...)
+    ex) (4,):(3,4) -> (3,)
+        (2,1,4):(2,3,4) -> (2,3)
+    """
+    bboxes1_area = bboxes1[..., 2] * bboxes1[..., 3]
+    bboxes2_area = bboxes2[..., 2] * bboxes2[..., 3]
+
+    bboxes1_coor = tf.concat(
+        [
+            bboxes1[..., :2] - bboxes1[..., 2:] * 0.5,
+            bboxes1[..., :2] + bboxes1[..., 2:] * 0.5,
+        ],
+        axis=-1,
+    )
+    bboxes2_coor = tf.concat(
+        [
+            bboxes2[..., :2] - bboxes2[..., 2:] * 0.5,
+            bboxes2[..., :2] + bboxes2[..., 2:] * 0.5,
+        ],
+        axis=-1,
+    )
+
+    left_up = tf.maximum(bboxes1_coor[..., :2], bboxes2_coor[..., :2])
+    right_down = tf.minimum(bboxes1_coor[..., 2:], bboxes2_coor[..., 2:])
+
+    inter_section = tf.maximum(right_down - left_up, 0.0)
+    inter_area = inter_section[..., 0] * inter_section[..., 1]
+
+    union_area = bboxes1_area + bboxes2_area - inter_area
+
+    iou = tf.math.divide_no_nan(inter_area, union_area)
+
+    enclose_left_up = tf.minimum(bboxes1_coor[..., :2], bboxes2_coor[..., :2])
+    enclose_right_down = tf.maximum(
+        bboxes1_coor[..., 2:], bboxes2_coor[..., 2:]
+    )
+
+    enclose_section = enclose_right_down - enclose_left_up
+
+    c_2 = enclose_section[..., 0] ** 2 + enclose_section[..., 1] ** 2
+
+    center_diagonal = bboxes2[..., :2] - bboxes1[..., :2]
+
+    rho_2 = center_diagonal[..., 0] ** 2 + center_diagonal[..., 1] ** 2
+
+    diou = iou - tf.math.divide_no_nan(rho_2, c_2)
+
+    v = (
+        (
+            tf.math.atan(
+                tf.math.divide_no_nan(bboxes1[..., 2], bboxes1[..., 3])
+            )
+            - tf.math.atan(
+                tf.math.divide_no_nan(bboxes2[..., 2], bboxes2[..., 3])
+            )
+        )
+        * 2
+        / np.pi
+    ) ** 2
+
+    alpha = tf.math.divide_no_nan(v, 1 - iou + v)
+
+    ciou = diou - alpha * v
+
+    return ciou
+
+def nms(bboxes, iou_threshold, sigma=0.3, method='nms'):
+    """
+    :param bboxes: (xmin, ymin, xmax, ymax, score, class)
+
+    Note: soft-nms, https://arxiv.org/pdf/1704.04503.pdf
+          https://github.com/bharatsingh430/soft-nms
+    """
+    classes_in_img = list(set(bboxes[:, 5]))
+    best_bboxes = []
+
+    for cls in classes_in_img:
+        cls_mask = (bboxes[:, 5] == cls)
+        cls_bboxes = bboxes[cls_mask]
+
+        while len(cls_bboxes) > 0:
+            max_ind = np.argmax(cls_bboxes[:, 4])
+            best_bbox = cls_bboxes[max_ind]
+            best_bboxes.append(best_bbox)
+            cls_bboxes = np.concatenate([cls_bboxes[: max_ind], cls_bboxes[max_ind + 1:]])
+            # bbox_iou above works on (center_x, center_y, w, h) boxes, while
+            # the detections here are (xmin, ymin, xmax, ymax); convert first.
+            best_xywh = np.concatenate(
+                [(best_bbox[:2] + best_bbox[2:4]) * 0.5, best_bbox[2:4] - best_bbox[:2]]
+            )
+            cls_xywh = np.concatenate(
+                [(cls_bboxes[:, :2] + cls_bboxes[:, 2:4]) * 0.5,
+                 cls_bboxes[:, 2:4] - cls_bboxes[:, :2]], axis=-1
+            )
+            iou = np.asarray(bbox_iou(best_xywh[np.newaxis, :], cls_xywh))
+            weight = np.ones((len(iou),), dtype=np.float32)
+
+            assert method in ['nms', 'soft-nms']
+
+            if method == 'nms':
+                iou_mask = iou > iou_threshold
+                weight[iou_mask] = 0.0
+
+            if method == 'soft-nms':
+                weight = np.exp(-(1.0 * iou ** 2 / sigma))
+
+            cls_bboxes[:, 4] = cls_bboxes[:, 4] * weight
+            score_mask = cls_bboxes[:, 4] > 0.
+            cls_bboxes = cls_bboxes[score_mask]
+
+    return best_bboxes
+
+def freeze_all(model, frozen=True):
+    model.trainable = not frozen
+    if isinstance(model, tf.keras.Model):
+        for l in model.layers:
+            freeze_all(l, frozen)
+
+def unfreeze_all(model, frozen=False):
+    model.trainable = not frozen
+    if isinstance(model, tf.keras.Model):
+        for l in model.layers:
+            unfreeze_all(l, frozen)
+
+
+def draw_bbox(image, bboxes, classes=read_class_names(cfg.YOLO.CLASSES), allowed_classes=list(read_class_names(cfg.YOLO.CLASSES).values()), show_label=True):
+    num_classes = len(classes)
+    image_h, image_w, _ = image.shape
+    hsv_tuples = [(1.0 * x / num_classes, 1., 1.) for x in range(num_classes)]
+    colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
+    colors = list(map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), colors))
+
+    random.seed(0)
+    random.shuffle(colors)
+    random.seed(None)
+
+    out_boxes, out_scores, out_classes, num_boxes = bboxes
+    for i in range(num_boxes[0]):
+        # valid class indices are 0 .. num_classes - 1
+        if int(out_classes[0][i]) < 0 or int(out_classes[0][i]) >= num_classes: continue
+        coor = out_boxes[0][i]
+        coor[0] = int(coor[0] * image_h)
+        coor[2] = int(coor[2] * image_h)
+        coor[1] = int(coor[1] * image_w)
+        coor[3] = int(coor[3] * image_w)
+
+        fontScale = 0.5
+        score = out_scores[0][i]
+        class_ind = int(out_classes[0][i])
+        class_name = classes[class_ind]
+
+        # check if class is in allowed classes
+        if class_name not in allowed_classes:
+            continue
+        else:
+            bbox_color = colors[class_ind]
+            bbox_thick = int(0.6 * (image_h + image_w) / 600)
+            # OpenCV drawing functions expect integer pixel coordinates
+            c1, c2 = (int(coor[1]), int(coor[0])), (int(coor[3]), int(coor[2]))
+            cv2.rectangle(image, c1, c2, bbox_color, bbox_thick)
+
+            if show_label:
+                bbox_mess = '%s: %.2f' % (classes[class_ind], score)
+                t_size = cv2.getTextSize(bbox_mess, 0, fontScale, thickness=bbox_thick // 2)[0]
+                c3 = (c1[0] + t_size[0], c1[1] - t_size[1] - 3)
+                cv2.rectangle(image, c1, c3, bbox_color, -1)  # filled
+
+                cv2.putText(image, bbox_mess, (c1[0], c1[1] - 2), cv2.FONT_HERSHEY_SIMPLEX,
+                            fontScale, (0, 0, 0), bbox_thick // 2, lineType=cv2.LINE_AA)
+    return image
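+
+
+if __name__ == '__main__':
+    # Editorial sketch, not part of the original file: a minimal smoke test
+    # for nms() on hand-made (xmin, ymin, xmax, ymax, score, class) rows, as
+    # documented in its docstring. Run from the repo root with
+    # `python -m utils.utils` so that the `utils.config` import resolves.
+    dets = np.array([
+        [10., 10., 50., 50., 0.9, 0.],      # top-scoring box of class 0
+        [12., 12., 48., 48., 0.6, 0.],      # overlaps the first (IoU ~ 0.81)
+        [100., 100., 150., 150., 0.8, 1.],  # sole box of class 1
+    ], dtype=np.float32)
+    kept = nms(np.copy(dets), iou_threshold=0.5, method='nms')
+    print(len(kept))  # expect 2: the lower-scoring class-0 box is suppressed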