Select Git revision
yolov4.py 25.65 KiB
#================================================================
#
# File name : yolov4.py
# Author : PyLessons
# Created date: 2020-09-31
# Website : https://pylessons.com/
# GitHub : https://github.com/pythonlessons/TensorFlow-2.x-YOLOv3
# Description : main yolov3 & yolov4 functions
#
#================================================================
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, Input, LeakyReLU, ZeroPadding2D, BatchNormalization, MaxPool2D
from tensorflow.keras.regularizers import l2
from yolov3.configs import *
# Per-scale stride values taken from the config (YOLO_STRIDES comes from the
# wildcard import of yolov3.configs); indexed by detection-scale number.
STRIDES = np.array(YOLO_STRIDES)
# Anchors divided by the stride of their scale (the division is applied along
# the first axis via the double transpose). decode() multiplies by STRIDES[i]
# again, so these are presumably in grid-cell units - confirm against configs.
ANCHORS = (np.array(YOLO_ANCHORS).T/STRIDES).T
def read_class_names(class_file_name):
    """Load class names from a text file, one name per line.

    Args:
        class_file_name: Path of the class-list file.

    Returns:
        dict mapping the zero-based line index to that line's text with
        newline characters stripped from both ends.
    """
    with open(class_file_name, 'r') as class_file:
        # Only '\n' is stripped; any other surrounding whitespace is kept.
        return {index: line.strip('\n') for index, line in enumerate(class_file)}
class BatchNormalization(BatchNormalization):
    """Batch normalization that stays in inference mode while frozen.

    "Frozen state" and "inference mode" are two separate concepts.
    Setting `layer.trainable = False` freezes the layer; this subclass then
    also forces the `training` flag passed to the parent to False, so the
    stored moving mean/variance are used and `gamma`/`beta` are not updated.

    NOTE: this class deliberately shadows the imported Keras layer of the
    same name, so every later `BatchNormalization()` in this module uses it.
    """

    def call(self, x, training=False):
        # A falsy flag (False / None) becomes an explicit constant tensor.
        if not training:
            training = tf.constant(False)
        # Batch statistics are used only when training is requested AND the
        # layer itself is trainable.
        use_batch_stats = tf.logical_and(training, self.trainable)
        return super().call(x, use_batch_stats)
def convolutional(input_layer, filters_shape, downsample=False, activate=True, bn=True, activate_type='leaky'):
    """Build a Conv2D -> (BatchNorm) -> (activation) unit.

    Args:
        input_layer: Input tensor.
        filters_shape: Sequence whose first element is the kernel size and
            whose last element is the number of output filters. The middle
            entries are not read by this function.
        downsample: If true, pad one pixel on the top/left and convolve with
            stride 2 and 'valid' padding; otherwise stride 1 and 'same'.
        activate: Whether to apply an activation after the convolution.
        bn: Whether to apply batch normalization. The convolution has a bias
            only when this is false.
        activate_type: 'leaky' (LeakyReLU with alpha=0.1) or 'mish'.

    Returns:
        The output tensor of the unit.

    Raises:
        ValueError: If an activation is requested with an unsupported
            `activate_type`. Previously this silently produced a layer with
            no activation at all.
    """
    if activate and activate_type not in ("leaky", "mish"):
        raise ValueError(
            "Unsupported activate_type {!r}; expected 'leaky' or 'mish'".format(activate_type))

    if downsample:
        # Asymmetric zero padding so the stride-2 'valid' convolution halves
        # the spatial size.
        input_layer = ZeroPadding2D(((1, 0), (1, 0)))(input_layer)
        padding = 'valid'
        strides = 2
    else:
        strides = 1
        padding = 'same'

    conv = Conv2D(filters=filters_shape[-1], kernel_size=filters_shape[0], strides=strides,
                  padding=padding, use_bias=not bn, kernel_regularizer=l2(0.0005),
                  kernel_initializer=tf.random_normal_initializer(stddev=0.01),
                  bias_initializer=tf.constant_initializer(0.))(input_layer)
    if bn:
        conv = BatchNormalization()(conv)
    if activate:
        if activate_type == "leaky":
            conv = LeakyReLU(alpha=0.1)(conv)
        else:
            conv = mish(conv)
    return conv
def mish(x):
    """Mish activation: x * tanh(softplus(x))."""
    gate = tf.math.tanh(tf.math.softplus(x))
    return x * gate
def residual_block(input_layer, input_channel, filter_num1, filter_num2, activate_type='leaky'):
    """Residual unit: a 1x1 conv followed by a 3x3 conv, added to the input.

    Args:
        input_layer: Input tensor; also used as the skip connection.
        input_channel: Channel count passed in the 1x1 conv's filter shape.
        filter_num1: Output filters of the 1x1 convolution.
        filter_num2: Output filters of the 3x3 convolution.
        activate_type: Activation forwarded to both convolutions.

    Returns:
        Element-wise sum of the input and the two-convolution branch.
    """
    branch = convolutional(input_layer, filters_shape=(1, 1, input_channel, filter_num1),
                           activate_type=activate_type)
    branch = convolutional(branch, filters_shape=(3, 3, filter_num1, filter_num2),
                           activate_type=activate_type)
    return input_layer + branch
def upsample(input_layer):
    """Double the size of axes 1 and 2 using nearest-neighbour resizing.

    The target size is computed from the static shape, so both of those
    dimensions must be known when this is called.
    """
    height, width = input_layer.shape[1], input_layer.shape[2]
    return tf.image.resize(input_layer, (height * 2, width * 2), method='nearest')
def route_group(input_layer, groups, group_id):
    """Split the last axis into `groups` parts and return part `group_id`."""
    channel_groups = tf.split(input_layer, num_or_size_splits=groups, axis=-1)
    return channel_groups[group_id]
def darknet53(input_data):
    """Darknet-53 backbone.

    One 3x3 convolution followed by five stages; each stage is a
    downsampling 3x3 convolution plus a run of residual blocks.

    Returns:
        Tuple (route_1, route_2, output): the outputs of the third, fourth
        and fifth stage respectively.
    """
    # (input channels, output channels, number of residual blocks) per stage.
    stage_specs = (
        (32, 64, 1),
        (64, 128, 2),
        (128, 256, 8),
        (256, 512, 8),
        (512, 1024, 4),
    )

    features = convolutional(input_data, (3, 3, 3, 32))
    stage_outputs = []
    # Layers are created in the same order as a hand-unrolled version, which
    # keeps Keras' auto-generated layer numbering unchanged.
    for in_channels, out_channels, num_blocks in stage_specs:
        features = convolutional(features, (3, 3, in_channels, out_channels), downsample=True)
        for _ in range(num_blocks):
            features = residual_block(features, out_channels, in_channels, out_channels)
        stage_outputs.append(features)

    return stage_outputs[2], stage_outputs[3], stage_outputs[4]
def cspdarknet53(input_data):
    """CSPDarknet-53 backbone followed by a max-pooling pyramid block.

    Five cross-stage-partial stages with mish activations, then three
    leaky convolutions, a concatenation of 13/9/5 max-pools with their
    input, and three more leaky convolutions.

    Returns:
        Tuple (route_1, route_2, output): the outputs of the third and
        fourth CSP stages and the final tensor.
    """
    def mish_conv(tensor, shape, downsample=False):
        # Shorthand for a convolutional unit with mish activation.
        return convolutional(tensor, shape, downsample=downsample, activate_type="mish")

    # NOTE: statement order is kept exactly as in the unrolled original so
    # that layers are created (and auto-numbered) in the same sequence.

    # Stem + stage 1
    x = mish_conv(input_data, (3, 3, 3, 32))
    x = mish_conv(x, (3, 3, 32, 64), downsample=True)
    bypass = mish_conv(x, (1, 1, 64, 64))
    x = mish_conv(x, (1, 1, 64, 64))
    for _ in range(1):
        x = residual_block(x, 64, 32, 64, activate_type="mish")
    x = mish_conv(x, (1, 1, 64, 64))
    x = tf.concat([x, bypass], axis=-1)
    x = mish_conv(x, (1, 1, 128, 64))

    # Stage 2
    x = mish_conv(x, (3, 3, 64, 128), downsample=True)
    bypass = mish_conv(x, (1, 1, 128, 64))
    x = mish_conv(x, (1, 1, 128, 64))
    for _ in range(2):
        x = residual_block(x, 64, 64, 64, activate_type="mish")
    x = mish_conv(x, (1, 1, 64, 64))
    x = tf.concat([x, bypass], axis=-1)
    x = mish_conv(x, (1, 1, 128, 128))

    # Stage 3 -> route_1
    x = mish_conv(x, (3, 3, 128, 256), downsample=True)
    bypass = mish_conv(x, (1, 1, 256, 128))
    x = mish_conv(x, (1, 1, 256, 128))
    for _ in range(8):
        x = residual_block(x, 128, 128, 128, activate_type="mish")
    x = mish_conv(x, (1, 1, 128, 128))
    x = tf.concat([x, bypass], axis=-1)
    x = mish_conv(x, (1, 1, 256, 256))
    route_1 = x

    # Stage 4 -> route_2
    x = mish_conv(x, (3, 3, 256, 512), downsample=True)
    bypass = mish_conv(x, (1, 1, 512, 256))
    x = mish_conv(x, (1, 1, 512, 256))
    for _ in range(8):
        x = residual_block(x, 256, 256, 256, activate_type="mish")
    x = mish_conv(x, (1, 1, 256, 256))
    x = tf.concat([x, bypass], axis=-1)
    x = mish_conv(x, (1, 1, 512, 512))
    route_2 = x

    # Stage 5
    x = mish_conv(x, (3, 3, 512, 1024), downsample=True)
    bypass = mish_conv(x, (1, 1, 1024, 512))
    x = mish_conv(x, (1, 1, 1024, 512))
    for _ in range(4):
        x = residual_block(x, 512, 512, 512, activate_type="mish")
    x = mish_conv(x, (1, 1, 512, 512))
    x = tf.concat([x, bypass], axis=-1)
    x = mish_conv(x, (1, 1, 1024, 1024))

    # Leaky-activated convolutions before the pooling pyramid.
    x = convolutional(x, (1, 1, 1024, 512))
    x = convolutional(x, (3, 3, 512, 1024))
    x = convolutional(x, (1, 1, 1024, 512))

    # Stride-1 max-pools with kernels 13, 9 and 5, concatenated with the input.
    pooled = [
        tf.keras.layers.MaxPool2D(pool_size=kernel, padding='SAME', strides=1)(x)
        for kernel in (13, 9, 5)
    ]
    x = tf.concat(pooled + [x], axis=-1)

    x = convolutional(x, (1, 1, 2048, 512))
    x = convolutional(x, (3, 3, 512, 1024))
    x = convolutional(x, (1, 1, 1024, 512))

    return route_1, route_2, x
def darknet19_tiny(input_data):
    """Backbone of the tiny YOLOv3 variant.

    Alternates 3x3 convolutions with max-pooling.

    Returns:
        Tuple (route_1, output): the 256-filter feature map taken before
        the fifth pooling layer, and the final 1024-filter feature map.
    """
    features = input_data
    # Four conv + stride-2 max-pool pairs.
    for in_channels, out_channels in ((3, 16), (16, 32), (32, 64), (64, 128)):
        features = convolutional(features, (3, 3, in_channels, out_channels))
        features = MaxPool2D(2, 2, 'same')(features)

    route_1 = convolutional(features, (3, 3, 128, 256))
    features = MaxPool2D(2, 2, 'same')(route_1)
    features = convolutional(features, (3, 3, 256, 512))
    # Stride-1 pooling: spatial size is kept here.
    features = MaxPool2D(2, 1, 'same')(features)
    features = convolutional(features, (3, 3, 512, 1024))

    return route_1, features
def cspdarknet53_tiny(input_data): # not sure how this should be called
    """Backbone of the tiny YOLOv4 variant.

    Two downsampling convolutions, then three cross-stage-partial blocks,
    each followed by a stride-2 max-pool and a 3x3 convolution.

    Returns:
        Tuple (route_1, output): the 1x1-convolution output inside the
        third CSP block, and the final feature map.
    """
    def csp_block(features, half):
        # Take the second half of the channels, run two 3x3 convs, merge them,
        # fuse with a 1x1 conv and concatenate with the block input.
        part = route_group(features, 2, 1)
        part = convolutional(part, (3, 3, half, half))
        inner = part
        part = convolutional(part, (3, 3, half, half))
        part = tf.concat([part, inner], axis=-1)
        transition = convolutional(part, (1, 1, half, half * 2))
        return transition, tf.concat([features, transition], axis=-1)

    x = convolutional(input_data, (3, 3, 3, 32), downsample=True)
    x = convolutional(x, (3, 3, 32, 64), downsample=True)
    x = convolutional(x, (3, 3, 64, 64))

    _, x = csp_block(x, 32)
    x = MaxPool2D(2, 2, 'same')(x)
    x = convolutional(x, (3, 3, 64, 128))

    _, x = csp_block(x, 64)
    x = MaxPool2D(2, 2, 'same')(x)
    x = convolutional(x, (3, 3, 128, 256))

    # Only the last block's 1x1 output is exposed to the detection head.
    route_1, x = csp_block(x, 128)
    x = MaxPool2D(2, 2, 'same')(x)
    x = convolutional(x, (3, 3, 512, 512))

    return route_1, x
def YOLOv3(input_layer, NUM_CLASS):
    """Build the YOLOv3 detection head on top of Darknet-53.

    Args:
        input_layer: Network input tensor.
        NUM_CLASS: Number of object classes.

    Returns:
        List [conv_sbbox, conv_mbbox, conv_lbbox] of raw prediction maps,
        each with 3 * (NUM_CLASS + 5) channels (small, medium and large
        object scales).
    """
    def five_convs(tensor, in_channels, mid_channels):
        # Alternating 1x1 / 3x3 convolutions (five in total).
        wide = mid_channels * 2
        tensor = convolutional(tensor, (1, 1, in_channels, mid_channels))
        tensor = convolutional(tensor, (3, 3, mid_channels, wide))
        tensor = convolutional(tensor, (1, 1, wide, mid_channels))
        tensor = convolutional(tensor, (3, 3, mid_channels, wide))
        tensor = convolutional(tensor, (1, 1, wide, mid_channels))
        return tensor

    box_channels = 3 * (NUM_CLASS + 5)

    # The backbone yields three feature maps at different resolutions.
    route_1, route_2, conv = darknet53(input_layer)

    # Large-object scale (13x13 for a 416x416 input).
    conv = five_convs(conv, 1024, 512)
    large_branch = convolutional(conv, (3, 3, 512, 1024))
    conv_lbbox = convolutional(large_branch, (1, 1, 1024, box_channels), activate=False, bn=False)

    # Nearest-neighbour upsampling has no trainable parameters.
    conv = convolutional(conv, (1, 1, 512, 256))
    conv = upsample(conv)
    conv = tf.concat([conv, route_2], axis=-1)

    # Medium-object scale (26x26 for a 416x416 input).
    conv = five_convs(conv, 768, 256)
    medium_branch = convolutional(conv, (3, 3, 256, 512))
    conv_mbbox = convolutional(medium_branch, (1, 1, 512, box_channels), activate=False, bn=False)

    conv = convolutional(conv, (1, 1, 256, 128))
    conv = upsample(conv)
    conv = tf.concat([conv, route_1], axis=-1)

    # Small-object scale (52x52 for a 416x416 input).
    conv = five_convs(conv, 384, 128)
    small_branch = convolutional(conv, (3, 3, 128, 256))
    conv_sbbox = convolutional(small_branch, (1, 1, 256, box_channels), activate=False, bn=False)

    return [conv_sbbox, conv_mbbox, conv_lbbox]
def YOLOv4(input_layer, NUM_CLASS):
    """Build the YOLOv4 detection head on top of CSPDarknet-53.

    Features are first merged top-down (upsample + concat) and then
    bottom-up (downsampling conv + concat) before each prediction layer.

    Args:
        input_layer: Network input tensor.
        NUM_CLASS: Number of object classes.

    Returns:
        List [conv_sbbox, conv_mbbox, conv_lbbox] of raw prediction maps,
        each with 3 * (NUM_CLASS + 5) channels.
    """
    def five_convs(tensor, in_channels, mid_channels):
        # Alternating 1x1 / 3x3 convolutions (five in total).
        wide = mid_channels * 2
        tensor = convolutional(tensor, (1, 1, in_channels, mid_channels))
        tensor = convolutional(tensor, (3, 3, mid_channels, wide))
        tensor = convolutional(tensor, (1, 1, wide, mid_channels))
        tensor = convolutional(tensor, (3, 3, mid_channels, wide))
        tensor = convolutional(tensor, (1, 1, wide, mid_channels))
        return tensor

    box_channels = 3 * (NUM_CLASS + 5)

    route_1, route_2, conv = cspdarknet53(input_layer)
    deepest = conv

    # Top-down path: deepest -> medium resolution.
    conv = convolutional(conv, (1, 1, 512, 256))
    conv = upsample(conv)
    route_2 = convolutional(route_2, (1, 1, 512, 256))
    conv = tf.concat([route_2, conv], axis=-1)
    conv = five_convs(conv, 512, 256)
    medium = conv

    # Top-down path: medium -> highest resolution.
    conv = convolutional(conv, (1, 1, 256, 128))
    conv = upsample(conv)
    route_1 = convolutional(route_1, (1, 1, 256, 128))
    conv = tf.concat([route_1, conv], axis=-1)
    conv = five_convs(conv, 256, 128)
    small = conv

    # Small-object predictions.
    conv = convolutional(conv, (3, 3, 128, 256))
    conv_sbbox = convolutional(conv, (1, 1, 256, box_channels), activate=False, bn=False)

    # Bottom-up path: highest -> medium resolution.
    conv = convolutional(small, (3, 3, 128, 256), downsample=True)
    conv = tf.concat([conv, medium], axis=-1)
    conv = five_convs(conv, 512, 256)
    medium = conv

    # Medium-object predictions.
    conv = convolutional(conv, (3, 3, 256, 512))
    conv_mbbox = convolutional(conv, (1, 1, 512, box_channels), activate=False, bn=False)

    # Bottom-up path: medium -> deepest resolution.
    conv = convolutional(medium, (3, 3, 256, 512), downsample=True)
    conv = tf.concat([conv, deepest], axis=-1)
    conv = five_convs(conv, 1024, 512)

    # Large-object predictions.
    conv = convolutional(conv, (3, 3, 512, 1024))
    conv_lbbox = convolutional(conv, (1, 1, 1024, box_channels), activate=False, bn=False)

    return [conv_sbbox, conv_mbbox, conv_lbbox]
def YOLOv3_tiny(input_layer, NUM_CLASS):
    """Build the tiny YOLOv3 detection head on top of darknet19_tiny.

    Args:
        input_layer: Network input tensor.
        NUM_CLASS: Number of object classes.

    Returns:
        List [conv_mbbox, conv_lbbox]: raw prediction maps for the finer
        and the coarser grid, each with 3 * (NUM_CLASS + 5) channels.
    """
    box_channels = 3 * (NUM_CLASS + 5)

    # The tiny backbone yields two feature maps.
    route_1, features = darknet19_tiny(input_layer)

    features = convolutional(features, (1, 1, 1024, 256))
    coarse_branch = convolutional(features, (3, 3, 256, 512))
    # Coarse grid, used for large objects (presumably 13x13 for a 416 input).
    conv_lbbox = convolutional(coarse_branch, (1, 1, 512, box_channels), activate=False, bn=False)

    features = convolutional(features, (1, 1, 256, 128))
    # Nearest-neighbour upsampling has no trainable parameters.
    features = upsample(features)
    features = tf.concat([features, route_1], axis=-1)

    fine_branch = convolutional(features, (3, 3, 128, 256))
    # Finer grid, used for medium objects (presumably 26x26 for a 416 input).
    conv_mbbox = convolutional(fine_branch, (1, 1, 256, box_channels), activate=False, bn=False)

    return [conv_mbbox, conv_lbbox]
def YOLOv4_tiny(input_layer, NUM_CLASS):
    """Build the tiny YOLOv4 detection head on top of cspdarknet53_tiny.

    Args:
        input_layer: Network input tensor.
        NUM_CLASS: Number of object classes.

    Returns:
        List [conv_mbbox, conv_lbbox]: raw prediction maps for the finer
        and the coarser grid, each with 3 * (NUM_CLASS + 5) channels.
    """
    box_channels = 3 * (NUM_CLASS + 5)

    route_1, features = cspdarknet53_tiny(input_layer)

    # Coarse-grid predictions.
    features = convolutional(features, (1, 1, 512, 256))
    coarse_branch = convolutional(features, (3, 3, 256, 512))
    conv_lbbox = convolutional(coarse_branch, (1, 1, 512, box_channels), activate=False, bn=False)

    # Upsample and merge with the higher-resolution backbone feature map.
    features = convolutional(features, (1, 1, 256, 128))
    features = upsample(features)
    features = tf.concat([features, route_1], axis=-1)

    # Fine-grid predictions.
    fine_branch = convolutional(features, (3, 3, 128, 256))
    conv_mbbox = convolutional(fine_branch, (1, 1, 256, box_channels), activate=False, bn=False)

    return [conv_mbbox, conv_lbbox]
def Create_Yolo(input_size=416, channels=3, training=False, CLASSES=YOLO_COCO_CLASSES):
    """Create the Keras YOLO model selected by the configuration.

    The architecture is chosen from the config values TRAIN_YOLO_TINY and
    YOLO_TYPE.

    Args:
        input_size: Height and width of the (square) network input.
        channels: Number of input channels.
        training: If true, the raw convolution output of every scale is
            emitted immediately before its decoded prediction, so the
            outputs are [raw_0, pred_0, raw_1, pred_1, ...]. Otherwise only
            the decoded predictions are emitted.
        CLASSES: Path of the class-names file; its line count sets the
            number of classes.

    Returns:
        A tf.keras.Model mapping the input layer to the output tensors.

    Raises:
        ValueError: If YOLO_TYPE is neither "yolov3" nor "yolov4".
            Previously this case surfaced later as an unbound-variable error.
    """
    if YOLO_TYPE not in ("yolov3", "yolov4"):
        raise ValueError(
            "Unsupported YOLO_TYPE {!r}; expected 'yolov3' or 'yolov4'".format(YOLO_TYPE))

    NUM_CLASS = len(read_class_names(CLASSES))
    input_layer = Input([input_size, input_size, channels])

    if TRAIN_YOLO_TINY:
        build_head = YOLOv4_tiny if YOLO_TYPE == "yolov4" else YOLOv3_tiny
    else:
        build_head = YOLOv4 if YOLO_TYPE == "yolov4" else YOLOv3
    conv_tensors = build_head(input_layer, NUM_CLASS)

    output_tensors = []
    for i, conv_tensor in enumerate(conv_tensors):
        pred_tensor = decode(conv_tensor, NUM_CLASS, i)
        if training:
            output_tensors.append(conv_tensor)
        output_tensors.append(pred_tensor)

    return tf.keras.Model(input_layer, output_tensors)
def decode(conv_output, NUM_CLASS, i=0):
    """Turn a raw prediction map into boxes, confidences and class scores.

    Args:
        conv_output: Raw head output. It is reshaped to
            (batch, size, size, 3, 5 + NUM_CLASS), where `size` is read
            from axis 1, so a square grid is assumed.
        NUM_CLASS: Number of object classes.
        i: Index of the detection scale; selects STRIDES[i] and ANCHORS[i].

    Returns:
        Tensor of shape (batch, size, size, 3, 5 + NUM_CLASS) whose last
        axis holds [x, y, w, h, confidence, class scores...].
    """
    shape = tf.shape(conv_output)
    batch_size, output_size = shape[0], shape[1]

    conv_output = tf.reshape(conv_output, (batch_size, output_size, output_size, 3, 5 + NUM_CLASS))

    # Last axis layout: centre offset (2), size offset (2), confidence (1),
    # class scores (NUM_CLASS).
    raw_dxdy, raw_dwdh, raw_conf, raw_prob = tf.split(conv_output, (2, 2, 1, NUM_CLASS), axis=-1)

    # Grid of cell indices, one (x, y) pair per cell, repeated for every
    # batch element and each of the 3 anchors.
    grid = tf.meshgrid(tf.range(output_size), tf.range(output_size))
    grid = tf.stack(grid, axis=-1)
    grid = tf.expand_dims(grid, axis=2)
    grid = tf.expand_dims(grid, axis=0)
    grid = tf.tile(grid, [batch_size, 1, 1, 3, 1])
    grid = tf.cast(grid, tf.float32)

    # Box centre: sigmoid offset inside the cell plus the cell index,
    # scaled by the stride of this scale.
    pred_xy = (tf.sigmoid(raw_dxdy) + grid) * STRIDES[i]
    # Box size: exponential offset applied to the anchor, scaled likewise.
    pred_wh = (tf.exp(raw_dwdh) * ANCHORS[i]) * STRIDES[i]

    pred_xywh = tf.concat([pred_xy, pred_wh], axis=-1)
    pred_conf = tf.sigmoid(raw_conf)  # objectness confidence
    pred_prob = tf.sigmoid(raw_prob)  # per-class scores

    return tf.concat([pred_xywh, pred_conf, pred_prob], axis=-1)
def bbox_iou(boxes1, boxes2):
    """Intersection over union of boxes given as (cx, cy, w, h).

    Args:
        boxes1: Floating-point boxes; the last axis is (cx, cy, w, h).
        boxes2: Boxes in the same format, broadcastable against boxes1.

    Returns:
        Tensor of IoU values with the last axis removed. Where the union
        area is zero the result is 0 instead of NaN/inf.
    """
    boxes1_area = boxes1[..., 2] * boxes1[..., 3]
    boxes2_area = boxes2[..., 2] * boxes2[..., 3]

    # Convert centre/size to (x_min, y_min, x_max, y_max).
    boxes1 = tf.concat([boxes1[..., :2] - boxes1[..., 2:] * 0.5,
                        boxes1[..., :2] + boxes1[..., 2:] * 0.5], axis=-1)
    boxes2 = tf.concat([boxes2[..., :2] - boxes2[..., 2:] * 0.5,
                        boxes2[..., :2] + boxes2[..., 2:] * 0.5], axis=-1)

    left_up = tf.maximum(boxes1[..., :2], boxes2[..., :2])
    right_down = tf.minimum(boxes1[..., 2:], boxes2[..., 2:])

    # Clamp at zero so non-overlapping boxes have no intersection.
    inter_section = tf.maximum(right_down - left_up, 0.0)
    inter_area = inter_section[..., 0] * inter_section[..., 1]
    union_area = boxes1_area + boxes2_area - inter_area

    # divide_no_nan keeps degenerate (zero-area) pairs from injecting NaN
    # into downstream losses.
    return tf.math.divide_no_nan(inter_area, union_area)
def bbox_giou(boxes1, boxes2):
    """Generalized IoU of boxes given as (cx, cy, w, h).

    GIoU = IoU - (C - U) / C, where U is the union area and C the area of
    the smallest axis-aligned box enclosing both boxes.

    Args:
        boxes1: Floating-point boxes; the last axis is (cx, cy, w, h).
        boxes2: Boxes in the same format, broadcastable against boxes1.

    Returns:
        Tensor of GIoU values with the last axis removed. Ratios with a
        zero denominator evaluate to 0 instead of NaN/inf.
    """
    # Convert centre/size to corner coordinates.
    boxes1 = tf.concat([boxes1[..., :2] - boxes1[..., 2:] * 0.5,
                        boxes1[..., :2] + boxes1[..., 2:] * 0.5], axis=-1)
    boxes2 = tf.concat([boxes2[..., :2] - boxes2[..., 2:] * 0.5,
                        boxes2[..., :2] + boxes2[..., 2:] * 0.5], axis=-1)

    # Reorder corners so the first pair is the minimum and the second the
    # maximum, even if a width or height is negative.
    boxes1 = tf.concat([tf.minimum(boxes1[..., :2], boxes1[..., 2:]),
                        tf.maximum(boxes1[..., :2], boxes1[..., 2:])], axis=-1)
    boxes2 = tf.concat([tf.minimum(boxes2[..., :2], boxes2[..., 2:]),
                        tf.maximum(boxes2[..., :2], boxes2[..., 2:])], axis=-1)

    boxes1_area = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1])
    boxes2_area = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1])

    left_up = tf.maximum(boxes1[..., :2], boxes2[..., :2])
    right_down = tf.minimum(boxes1[..., 2:], boxes2[..., 2:])

    inter_section = tf.maximum(right_down - left_up, 0.0)
    inter_area = inter_section[..., 0] * inter_section[..., 1]
    union_area = boxes1_area + boxes2_area - inter_area

    # IoU between the two boxes (0 when the union is empty).
    iou = tf.math.divide_no_nan(inter_area, union_area)

    # Corners of the smallest box enclosing both boxes.
    enclose_left_up = tf.minimum(boxes1[..., :2], boxes2[..., :2])
    enclose_right_down = tf.maximum(boxes1[..., 2:], boxes2[..., 2:])
    enclose = tf.maximum(enclose_right_down - enclose_left_up, 0.0)
    enclose_area = enclose[..., 0] * enclose[..., 1]

    # Penalise the part of the enclosing box not covered by the union.
    giou = iou - tf.math.divide_no_nan(enclose_area - union_area, enclose_area)

    return giou
# testing (should be better than giou)
def bbox_ciou(boxes1, boxes2):
    """Complete IoU of boxes given as (cx, cy, w, h).

    CIoU = IoU - (d^2 / c^2 + alpha * v), where d is the distance between
    the box centres, c the diagonal of the smallest enclosing box and v an
    aspect-ratio consistency term.

    Args:
        boxes1: Predicted boxes; the last axis is (cx, cy, w, h).
        boxes2: Reference boxes in the same format.

    Returns:
        Tensor of CIoU values with the last axis removed.
    """
    boxes1_coor = tf.concat([boxes1[..., :2] - boxes1[..., 2:] * 0.5,
                             boxes1[..., :2] + boxes1[..., 2:] * 0.5], axis=-1)
    boxes2_coor = tf.concat([boxes2[..., :2] - boxes2[..., 2:] * 0.5,
                             boxes2[..., :2] + boxes2[..., 2:] * 0.5], axis=-1)

    # Smallest enclosing box: minimum of the left/top edges and maximum of
    # the right/bottom edges. (Taking the maximum for left/top, as before,
    # does not enclose both boxes and can make the diagonal zero.)
    left = tf.minimum(boxes1_coor[..., 0], boxes2_coor[..., 0])
    up = tf.minimum(boxes1_coor[..., 1], boxes2_coor[..., 1])
    right = tf.maximum(boxes1_coor[..., 2], boxes2_coor[..., 2])
    down = tf.maximum(boxes1_coor[..., 3], boxes2_coor[..., 3])

    # Squared diagonal of the enclosing box.
    c = (right - left) * (right - left) + (down - up) * (down - up)
    iou = bbox_iou(boxes1, boxes2)

    # Squared distance between the two box centres.
    u = (boxes1[..., 0] - boxes2[..., 0]) * (boxes1[..., 0] - boxes2[..., 0]) \
        + (boxes1[..., 1] - boxes2[..., 1]) * (boxes1[..., 1] - boxes2[..., 1])
    # Guarded so two coincident zero-size boxes give 0 rather than NaN.
    d = tf.math.divide_no_nan(u, c)

    # Aspect-ratio term; NOTE: a zero height still yields inf/NaN here.
    ar_gt = boxes2[..., 2] / boxes2[..., 3]
    ar_pred = boxes1[..., 2] / boxes1[..., 3]
    atan_diff = tf.atan(ar_gt) - tf.atan(ar_pred)
    ar_loss = 4 / (np.pi * np.pi) * atan_diff * atan_diff

    # Small epsilon keeps the trade-off weight finite when iou == 1.
    alpha = ar_loss / (1 - iou + ar_loss + 0.000001)
    ciou_term = d + alpha * ar_loss

    return iou - ciou_term
def compute_loss(pred, conv, label, bboxes, i=0, CLASSES=YOLO_COCO_CLASSES):
    """Compute the three YOLO loss terms for one detection scale.

    Args:
        pred: Decoded predictions; the last axis is read as
            [x, y, w, h, confidence, ...].
        conv: Raw head output for the same scale; reshaped here to
            (batch, size, size, 3, 5 + NUM_CLASS).
        label: Target tensor; the last axis is read as
            [x, y, w, h, objectness, class probabilities...].
        bboxes: Ground-truth boxes in (cx, cy, w, h) format, indexed as
            (batch, box, 4) - presumably padded per image; confirm with
            the data pipeline.
        i: Index of the detection scale; selects STRIDES[i].
        CLASSES: Path of the class-names file (read on every call).

    Returns:
        Tuple (giou_loss, conf_loss, prob_loss): each is summed over all
        non-batch axes and then averaged over the batch.
    """
    NUM_CLASS = len(read_class_names(CLASSES))

    shape = tf.shape(conv)
    batch_size, output_size = shape[0], shape[1]
    input_size = STRIDES[i] * output_size
    conv = tf.reshape(conv, (batch_size, output_size, output_size, 3, 5 + NUM_CLASS))

    # Raw logits are needed for the numerically stable cross-entropy ops.
    raw_conf = conv[:, :, :, :, 4:5]
    raw_prob = conv[:, :, :, :, 5:]

    pred_xywh = pred[:, :, :, :, 0:4]
    pred_conf = pred[:, :, :, :, 4:5]

    label_xywh = label[:, :, :, :, 0:4]
    respond_bbox = label[:, :, :, :, 4:5]
    label_prob = label[:, :, :, :, 5:]

    # --- box regression term ---
    giou = tf.expand_dims(bbox_giou(pred_xywh, label_xywh), axis=-1)
    input_size = tf.cast(input_size, tf.float32)

    # Weight in (1, 2]: smaller target boxes get a larger weight.
    bbox_loss_scale = 2.0 - 1.0 * label_xywh[:, :, :, :, 2:3] * label_xywh[:, :, :, :, 3:4] / (input_size ** 2)
    giou_loss = respond_bbox * bbox_loss_scale * (1 - giou)

    # --- confidence term ---
    # IoU of every predicted box against every ground-truth box.
    iou = bbox_iou(pred_xywh[:, :, :, :, np.newaxis, :], bboxes[:, np.newaxis, np.newaxis, np.newaxis, :, :])
    # Best IoU of each predicted box with any ground-truth box.
    max_iou = tf.expand_dims(tf.reduce_max(iou, axis=-1), axis=-1)

    # Background: no object assigned and best IoU below the threshold.
    respond_bgd = (1.0 - respond_bbox) * tf.cast(max_iou < YOLO_IOU_LOSS_THRESH, tf.float32)

    # Focal-style modulation by the squared confidence error.
    conf_focal = tf.pow(respond_bbox - pred_conf, 2)

    # Confidence should be 1 where an object is assigned and 0 on background;
    # cells that are neither contribute nothing.
    conf_bce = tf.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=raw_conf)
    conf_loss = conf_focal * (respond_bbox * conf_bce + respond_bgd * conf_bce)

    # --- classification term (object cells only) ---
    prob_loss = respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels=label_prob, logits=raw_prob)

    non_batch_axes = [1, 2, 3, 4]
    giou_loss = tf.reduce_mean(tf.reduce_sum(giou_loss, axis=non_batch_axes))
    conf_loss = tf.reduce_mean(tf.reduce_sum(conf_loss, axis=non_batch_axes))
    prob_loss = tf.reduce_mean(tf.reduce_sum(prob_loss, axis=non_batch_axes))

    return giou_loss, conf_loss, prob_loss