diff --git a/backup/checkpoint b/backup/checkpoint new file mode 100644 index 000000000..19bb6a253 --- /dev/null +++ b/backup/checkpoint @@ -0,0 +1,9 @@ +model_checkpoint_path: "model-400" +all_model_checkpoint_paths: "model-50" +all_model_checkpoint_paths: "model-100" +all_model_checkpoint_paths: "model-150" +all_model_checkpoint_paths: "model-200" +all_model_checkpoint_paths: "model-250" +all_model_checkpoint_paths: "model-300" +all_model_checkpoint_paths: "model-350" +all_model_checkpoint_paths: "model-400" diff --git a/clean.py b/clean.py index c9a45ddc3..d45353e8b 100644 --- a/clean.py +++ b/clean.py @@ -1,3 +1,20 @@ +""" +file: ./clean.py +includes: a script to parse Pascal VOC data +this script produces the binary file parsed.bin, which contains +a cPickle dump of a list. Each element in the list corresponds +to an image, the element in turn contains a list of parsed bounding +box coordinates and associated classes of each object defined +in labels.txt. If labels.txt is left blank, the default choice of +all twenty objects is used (see list labels20 below). + +The cPickle dump will be used mainly by ./data.py, inside function +shuffle(). shuffle() will shuffle and cut the dump into batches, +preprocess them so that they are ready to be fed into net. 
+ +WARNING: this script is messy, it hurts to read :( +""" + import os import numpy as np import cv2 @@ -15,9 +32,10 @@ "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"] +pick = list() with open('labels.txt', 'r') as f: - pick = f.readlines() - for i in range(len(pick)): pick[i] = pick[i].strip() + pick = [l.strip() for l in f.readlines()] +if pick == list(): pick = labels20 def pp(l): for i in l: print '{}: {}'.format(i,l[i]) @@ -50,10 +68,10 @@ def parse(line): w = h = int() all = current = list() + name = str() obj = False - noHuman = True - noPlant = True - for line in lines: + for i in range(len(lines)): + line = lines[i] if '' in line: w = parse(line) if '' in line: @@ -70,14 +88,13 @@ def parse(line): if '' in line: if current != list() and current[0] in pick: all += [current] - if current[0] == 'person': noHuman = False - if current[0] == 'pottedplant': noPlant = False current = list() name = parse(line) if name not in pick: obj = False continue current = [name,None,None,None,None] + if len(current) != 5: continue xn = '' in line xx = '' in line yn = '' in line @@ -89,12 +106,10 @@ def parse(line): if current != list() and current[0] in pick: all += [current] - if current[0] == 'person': noHuman = False - if current[0] == 'pottedplant': noPlant = False if all == list(): continue jpg = file.split('.')[0]+'.jpg' - add = [[jpg, [w, h, all]]] * (1 + noHuman* (15 + noPlant * 11)) + add = [[jpg, [w, h, all]]] dumps += add @@ -112,6 +127,6 @@ def parse(line): print 'Statistics:' pp(stat) print 'Dataset size: {}'.format(len(dumps)) -with open('parsed.yolotf', 'wb') as f: - pickle.dump([pick, dumps],f,protocol=-1) +with open('parsed.bin', 'wb') as f: + pickle.dump([dumps],f,protocol=-1) os.chdir(tempdir) \ No newline at end of file diff --git a/configs/process.py b/configs/process.py index 20bd162df..b749d2ec4 100644 --- a/configs/process.py +++ b/configs/process.py @@ -1,109 +1,112 @@ import numpy as np import os -def 
cfg_yielder(model, undiscovered = True): +def _parse(l): return l.split('=')[1].strip() + +def parser(model): """ - yielding each layer information, i.e. yielding type & size - of each layer of `model`. - Because of different reasons, it is not always be the ideal - case that following .cfg file will successfully match the - size of .weights file, so one would need to investigate the - .weights file if s/he is parsing the .cfg file for the first - time (undiscovered = True) in order to adjust the parsing - appropriately. + Read the .cfg file to extract layers into `s` + as well as model-specific parameters into `meta` """ - - # Step 1: parsing cfg file with open('configs/yolo-{}.cfg'.format(model), 'rb') as f: - lines = f.readlines() - - s = [] # contains layers' info - S = int() # the number of grid cell + lines = f.readlines() + + s = [] # will contains layers' info add = dict() for line in lines: line = line.strip() - if 'side' in line: - S = int(line.split('=')[1].strip()) + # deepnet general layers if '[' in line: - if add != {}: - s += [add] - add = dict() + if add != {}: s += [add] + add = {'type':line} else: try: - i = float(line.split('=')[1].strip()) + i = float(_parse(line)) if i == int(i): i = int(i) add[line.split('=')[0]] = i except: try: - if line.split('=')[1] == 'leaky' and 'output' in add: - add[line.split('=')[0]] = line.split('=')[1] + if _parse(line) == 'leaky': + add['activation'] = 'leaky' except: pass - yield S + add['model'] = model + return s, add + +def discoverer(weightf, s): + """ + discoverer returns: + 1. index of last convolutional layer + 2. 
the expected size of this conv layer's kernel + """ + allbytes = os.path.getsize(weightf) + allfloat = allbytes/4; allfloat -= 4 + last_convo = int() + for i, d in enumerate(s): + if len(d) >= 4: + last_convo = i + channel = 3; dense = False # flag for 1st dense layer + out = int() + for i, d in enumerate(s): + # ignore darknet specifications + if 'batch' in d: continue + if 'crop_width' in d: continue + if 'side' in d: continue + + if d['type'] == '[convolutional]': + kernel = d['size'] ** 2 * channel * d['filters'] + allfloat -= kernel + d['filters'] + channel = d['filters'] + if 'batch_normalize' in d: + allfloat -= 2 * d['filters'] + elif d['type'] == '[connected]': + if dense is False: + out = out1 = d['output'] + dense = True; continue + weight = out * d['output'] + allfloat -= weight + d['output'] + out = d['output'] + + allfloat -= out1 # substract the bias + if allfloat <= 0: + message = 'yolo-{}.cfg suggests a bigger size' + message += ' than yolo-{}.weights actually is' + exit('Error: {}'.format(message.format(model, model))) + + # expected size of last convolution kernel + size = (np.sqrt(1.*allfloat/out1/channel)) + print 'Last convolutional kernel size = {}'.format(size) + size = int(size) + n = last_convo + 1 + while 'output' not in s[n]: + size *= s[n].get('size',1) + n += 1 + return last_convo, size + +def cfg_yielder(model, undiscovered = True): + """ + yielding each layer information, if model is discovered + for the first time (undiscovered = True), discoverer + will be employed + """ + + layers, meta = parser(model); yield meta - # Step 2: investigate the weight file - weightf = 'yolo-{}.weights'.format(model) if undiscovered: - allbytes = os.path.getsize('yolo-{}.weights'.format(model)) - allbytes /= 4 # each float is 4 byte - allbytes -= 4 # the first 4 bytes are darknet specifications - last_convo = int() - for i, d in enumerate(s): - if len(d) == 4: - last_convo = i # the index of last convolution layer - flag = False - channel = 3 # 
initial number of channel in the tensor volume - out = int() - for i, d in enumerate(s): - # for each iteration in this loop - # allbytes will be gradually subtracted - # by the size of the corresponding layer (d) - # except for the 1st dense layer - # it should be what remains after subtracting - # all other layers - if len(d) == 4: - allbytes -= d['size'] ** 2 * channel * d['filters'] - allbytes -= d['filters'] - channel = d['filters'] - elif 'output' in d: # this is a dense layer - if flag is False: # this is the first dense layer - out = out1 = d['output'] # output unit of the 1st dense layer - flag = True # mark that the 1st dense layer is passed - continue # don't do anything with the 1st dense layer - allbytes -= out * d['output'] - allbytes -= d['output'] - out = d['output'] - allbytes -= out1 # substract the bias - if allbytes <= 0: - message = "Error: yolo-{}.cfg suggests a bigger size" - message += " than yolo-{}.weights actually is" - print message.format(model, model) - assert allbytes > 0 - # allbytes is now = I * out1 - # where I is the input size of the 1st dense layer - # I is also the volume of the last convolution layer - # I = size * size * channel - size = (np.sqrt(allbytes/out1/channel)) - size = int(size) - n = last_convo + 1 - while 'output' not in s[n]: - size *= s[n].get('size',1) - n += 1 - else: - last_convo = None - size = None + weightf = 'yolo-{}.weights'.format(model) + last_convo, size = discoverer(weightf, layers) + else: last_convo = None; size = None - # Step 3: Yielding config - w = 448 - h = 448 - c = 3 - l = w * h * c - flat = False - yield ['CROP'] - for i, d in enumerate(s): - #print w, h, c, l - flag = False - if len(d) == 4: + # Start yielding + w = 448; h = 448; c = 3; l = w * h * c + yield ['CROP']; flat = False # flag for 1st dense layer + for i, d in enumerate(layers): + # ignore darknet specifications + if 'batch' in d: continue + if 'crop_width' in d: continue + if 'side' in d: continue + + if d['type'] == 
'[convolutional]': mult = (d['size'] == 3) mult *= (d['stride'] != 2) + 1. if d['size'] == 1: d['pad'] = 0 @@ -111,6 +114,9 @@ def cfg_yielder(model, undiscovered = True): new /= d['stride'] new = int(np.floor(new + 1.)) if i == last_convo: + # signal tfnet to figure out the pad itself + # to achieve the desired `size`. Namely, to + # use the negative sign: d['pad'] = -size new = size yield ['conv', d['size'], c, d['filters'], @@ -118,31 +124,25 @@ def cfg_yielder(model, undiscovered = True): w = h = new c = d['filters'] l = w * h * c - #print w, h, c - if len(d) == 2: - if 'output' not in d: - yield ['pool', d['size'], 0, - 0, 0, 0, d['stride'], 0] - new = (w * 1.0 - d['size'])/d['stride'] + 1 - new = int(np.floor(new)) - w = h = new - l = w * h * c - else: - if not flat: - flat = True - yield ['FLATTEN'] - yield ['conn', 0, 0, - 0, 0, 0, l, d['output']] - l = d['output'] - if 'activation' in d: - yield ['LEAKY'] - if len(d) == 1: - if 'output' not in d: - yield ['DROPOUT'] - else: - if not flat: - flat = True - yield ['FLATTEN'] - yield ['conn', 0, 0, - 0, 0, 0, l, d['output']] - l = d['output'] \ No newline at end of file + if 'batch_normalize' in d: + yield['bnrm', 0, 0, c, 0, 0] + if 'activation' in d: yield ['leaky'] + + if d['type'] == '[maxpool]': + yield ['pool', d['size'], 0, + 0, 0, 0, d['stride'], 0] + new = (w * 1.0 - d['size'])/d['stride'] + 1 + new = int(np.floor(new)) + w = h = new + l = w * h * c + + if d['type'] == '[connected]': + if not flat: + yield ['flatten'] + flat = True + yield ['conn'] + [0] * 5 + [l, d['output']] + l = d['output'] + if 'activation' in d: yield ['leaky'] + + if d['type'] == '[dropout]': + yield ['drop', d['probability']] \ No newline at end of file diff --git a/configs/process_.py b/configs/process_.py new file mode 100644 index 000000000..3f5df526a --- /dev/null +++ b/configs/process_.py @@ -0,0 +1,152 @@ +import numpy as np +import os +import sys + +model = sys.argv[1] +undiscovered = True + +# Step 1: parsing cfg file 
+with open('yolo-{}.cfg'.format(model), 'rb') as f: + lines = f.readlines() + +s = [] # contains layers' info +S = int() # the number of grid cell +add = dict() +for line in lines: + line = line.strip() + if 'side' in line: + S = int(line.split('=')[1].strip()) + if '[' in line: + if add != {}: + s += [add] + add = dict() + else: + try: + i = float(line.split('=')[1].strip()) + if i == int(i): i = int(i) + add[line.split('=')[0]] = i + except: + try: + if line.split('=')[1] == 'leaky': + add[line.split('=')[0]] = 'leaky' + except: + pass + +# Step 2: investigate the weight file +weightf = '../yolo-{}.weights'.format(model) +if undiscovered: + allbytes = os.path.getsize(weightf.format(model)) + allbytes /= 4 # each float is 4 byte + allbytes -= 4 # the first 4 bytes are darknet specifications + last_convo = int() + for i, d in enumerate(s): + if len(d) == 4: + last_convo = i # the index of last convolution layer + flag = False + channel = 3 # initial number of channel in the tensor volume + out = int() + for i, d in enumerate(s): + if 'batch' in d: continue + if 'crop_width' in d: continue + if 'side' in d: continue + # for each iteration in this loop + # allbytes will be gradually subtracted + # by the size of the corresponding layer (d) + # except for the 1st dense layer + # it should be what remains after subtracting + # all other layers + if len(d) >= 4: + allbytes -= d['size'] ** 2 * channel * d['filters'] + allbytes -= d['filters'] + channel = d['filters'] + if 'batch_normalize' in d: + allbytes -= 2 * d['filters'] + elif 'output' in d: # this is a dense layer + if flag is False: # this is the first dense layer + out = out1 = d['output'] # output unit of the 1st dense layer + flag = True # mark that the 1st dense layer is passed + continue # don't do anything with the 1st dense layer + allbytes -= out * d['output'] + allbytes -= d['output'] + out = d['output'] + allbytes -= out1 # substract the bias + if allbytes <= 0: + message = "Error: yolo-{}.cfg suggests 
a bigger size" + message += " than yolo-{}.weights actually is" + print message.format(model, model) + assert allbytes > 0 + # allbytes is now = I * out1 + # where I is the input size of the 1st dense layer + # I is also the volume of the last convolution layer + # I = size * size * channel + size = (np.sqrt(allbytes/out1/channel)) + print size + size = int(size) + n = last_convo + 1 + while 'output' not in s[n]: + size *= s[n].get('size',1) + n += 1 +else: + last_convo = None + size = None + +# Step 3: printing config +w = 448 +h = 448 +c = 3 +l = w * h * c +flat = False + +for i, d in enumerate(s): + if 'batch' in d: continue + if 'crop_width' in d: continue + if 'side' in d: continue + + flag = False # flag for passing the 1st dense layer + if len(d) >= 4: + mult = (d['size'] == 3) + mult *= (d['stride'] != 2) + 1. + if d['size'] == 1: d['pad'] = 0 + new = (w + mult * d['pad'] - d['size']) + new /= d['stride'] + new = int(np.floor(new + 1.)) + if i == last_convo: + # yield the negative expected size + # instead of the indicated pad. 
+ d['pad'] = -size + new = size + batch_norm = d.get('batch_normalize', 0) + print ['conv', d['size'], c, d['filters'], + h, w, d['stride'], d['pad'], batch_norm] + w = h = new + c = d['filters'] + l = w * h * c + if 'activation' in d: + print ['LEAKY'] + if len(d) == 2: + if 'output' not in d: + print ['pool', d['size'], 0, + 0, 0, 0, d['stride'], 0] + new = (w * 1.0 - d['size'])/d['stride'] + 1 + new = int(np.floor(new)) + w = h = new + l = w * h * c + else: + if not flat: + flat = True + print ['FLATTEN'] + print ['conn', 0, 0, + 0, 0, 0, l, d['output']] + l = d['output'] + if 'activation' in d: + print ['LEAKY'] + if len(d) == 1: + if 'output' not in d: + print ['DROPOUT'] + else: + if not flat: + flat = True + print ['FLATTEN'] + print ['conn', 0, 0, + 0, 0, 0, l, d['output']] + l = d['output'] \ No newline at end of file diff --git a/configs/yolo-2c.cfg b/configs/yolo-2c.cfg new file mode 100644 index 000000000..b6951ac6d --- /dev/null +++ b/configs/yolo-2c.cfg @@ -0,0 +1,138 @@ +[net] +batch=64 +subdivisions=64 +height=448 +width=448 +channels=3 +momentum=0.9 +decay=0.0005 + +learning_rate=0.0001 +policy=steps +steps=20,40,60,80,20000,30000 +scales=5,5,2,2,.1,.1 +max_batches = 40000 + +[crop] +crop_width=448 +crop_height=448 +flip=0 +angle=0 +saturation = 1.5 +exposure = 1.5 + +[convolutional] +filters=16 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +filters=1024 +size=3 +stride=1 +pad=1 
+activation=leaky + +[convolutional] +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[connected] +output=256 +activation=linear + +[connected] +output=4096 +activation=leaky + +[dropout] +probability=.5 + +[connected] +output=1452 +activation=linear + +[detection] +classes=2 +coords=4 +rescore=1 +side=11 +num=2 +softmax=0 +sqrt=1 +jitter=.2 +object_scale=1 +noobject_scale=.5 +class_scale=1 +coord_scale=5 \ No newline at end of file diff --git a/configs/yolo-baby.cfg b/configs/yolo-baby.cfg new file mode 100644 index 000000000..75218c221 --- /dev/null +++ b/configs/yolo-baby.cfg @@ -0,0 +1,125 @@ +[net] +batch=64 +subdivisions=2 +height=448 +width=448 +channels=3 +momentum=0.9 +decay=0.0005 + +saturation=.75 +exposure=.75 +hue = .1 + +learning_rate=0.0005 +policy=steps +steps=200,400,600,800,20000,30000 +scales=2.5,2,2,2,.1,.1 +max_batches = 40000 + +[convolutional] +batch_normalize=1 +filters=16 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[connected] +output= 1470 +activation=linear + +[detection] +classes=20 +coords=4 +rescore=1 +side=7 +num=2 
+softmax=0 +sqrt=1 +jitter=.2 + +object_scale=1 +noobject_scale=.5 +class_scale=1 +coord_scale=5 \ No newline at end of file diff --git a/configs/yolo.cfg b/configs/yolo.cfg new file mode 100644 index 000000000..c4f415c11 --- /dev/null +++ b/configs/yolo.cfg @@ -0,0 +1,257 @@ +[net] +batch=1 +subdivisions=1 +height=448 +width=448 +channels=3 +momentum=0.9 +decay=0.0005 +saturation=1.5 +exposure=1.5 +hue=.1 + +learning_rate=0.0005 +policy=steps +steps=200,400,600,20000,30000 +scales=2.5,2,2,.1,.1 +max_batches = 40000 + +[convolutional] +batch_normalize=1 +filters=64 +size=7 +stride=2 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=192 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 
+pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +####### + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=2 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[local] +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[dropout] +probability=.5 + +[connected] +output= 1715 +activation=linear + +[detection] +classes=20 +coords=4 +rescore=1 +side=7 +num=3 +softmax=0 +sqrt=1 +jitter=.2 + +object_scale=1 +noobject_scale=.5 +class_scale=1 +coord_scale=5 + diff --git a/darknet.py b/darknet.py new file mode 100644 index 000000000..29e39899d --- /dev/null +++ b/darknet.py @@ -0,0 +1,156 @@ +""" +file: darknet.py +includes: definition of class Darknet +this class works with Darknet files: .cfg, .weights +and produces Darknet objects that are easy for TFNet +to use for building the corresponding tensorflow net. + +this class uses configs/process.py as a parser for .cfg +files to understand the structure of .weights file. 
It +will use this information to load all the weights into +its attribute .layers - a well structured list, with each +element being an object of class layer() defined right below +""" + +from configs.process import * +import tensorflow as tf +import numpy as np +import time +import os + +class layer: + def __init__(self, type, size = 0, + c = 0, n = 0, h = 0, w = 0): + self.type = type + self.size = size + self.c, self.n = (c, n) + self.h, self.w = (h, w) + +class dropout_layer(layer): + def __init__(self, p): + self.type = 'dropout' + self.prob = p + +class btchnrm_layer(layer): + def __init__(self, size, c, n, h, w ): # <- cryin' haha + layer.__init__(self, 'batchnorm', + size, c, n, h, w) + +class maxpool_layer(layer): + def __init__(self, size, c, n, h, w, + stride, pad ): + layer.__init__(self, 'maxpool', + size, c, n, h, w) + self.stride = stride + self.pad = pad + +class convolu_layer(layer): + def __init__(self, size, c, n, h, w, + stride, pad ): + layer.__init__(self, 'convolutional', + size, c, n, h, w) + self.stride = stride + self.pad = pad + +class connect_layer(layer): + def __init__(self, size, c, n, h, w, + input_size, output_size): + layer.__init__(self, 'connected', + size, c, n, h, w) + self.output_size = output_size + self.input_size = input_size + +class Darknet(object): + + layers = list() + model = str() + partial = bool() + + def __init__(self, model, partial = False): + self.partial = partial + self.model = model + self.parse(model) + + postfix = int('-' in model) * 'binaries/' + weight_file = postfix + 'yolo-{}.weights'.format(model) + print ('Loading {} ...'.format(weight_file)) + start = time.time() + self.loadWeights(weight_file) + stop = time.time() + print ('Finished in {}s'.format(stop - start)) + + def parse(self, model): + cfg = model.split('-')[0] + print ('Parsing yolo-{}.cfg'.format(cfg)) + layers = cfg_yielder(cfg) + for i, info in enumerate(layers): + if i == 0: self.meta = info; continue + if len(info) == 1: new = layer(type = 
info[0]) + if info[0] == 'bnrm': new = btchnrm_layer(*info[1:]) + if info[0] == 'drop': new = dropout_layer(*info[1:]) + if info[0] == 'conv': new = convolu_layer(*info[1:]) + if info[0] == 'pool': new = maxpool_layer(*info[1:]) + if info[0] == 'conn': new = connect_layer(*info[1:]) + self.layers.append(new) + + def loadWeights(self, weight_path): + file_len = os.path.getsize(weight_path); offset = 16 + + # Read byte arrays from file + for i in range(len(self.layers)): + l = self.layers[i] + if l.type == "convolutional": + weight_number = l.n * l.c * l.size * l.size + l.biases = np.memmap(weight_path, mode = 'r', + offset = offset, shape = (), + dtype = '({})float32,'.format(l.n)) + offset += 4 * l.n + l.weights = np.memmap(weight_path, mode = 'r', + offset = offset, shape = (), + dtype = '({})float32,'.format(weight_number)) + offset += 4 * weight_number + + elif l.type == "batchnorm": + l.biases = np.memmap(weight_path, mode = 'r', + offset = offset, shape = (), + dtype = '({})float32,'.format(l.n)) + offset += 4 * l.n + l.weights = np.memmap(weight_path, mode = 'r', + offset = offset, shape = (), + dtype = '({})float32,'.format(l.n)) + offset += 4 * l.n + + elif l.type == "connected": + bias_number = l.output_size + weight_number = l.output_size * l.input_size + l.biases = np.memmap(weight_path, mode = 'r', + offset = offset, shape = (), + dtype = '({})float32,'.format(bias_number)) + offset += bias_number * 4 + l.weights = np.memmap(weight_path, mode = 'r', + offset = offset, shape = (), + dtype = '({})float32,'.format(weight_number)) + offset += weight_number * 4 + + # Defensive python right here bietch. 
+ if offset == file_len: + print 'Successfully identified all {} bytes'.format(offset) + else: + exit('Error: expect {} bytes, found {}'.format(offset, file_len)) + + # Reshape + for i in range(len(self.layers)): + l = self.layers[i] + + if l.type == 'convolutional': + weight_array = l.weights + weight_array = np.reshape(weight_array, + [l.n, l.c, l.size, l.size]) + weight_array = weight_array.transpose([2,3,1,0]) + l.weights = weight_array + + if l.type == 'connected': + weight_array = l.weights + weight_array = np.reshape(weight_array, + [l.input_size, l.output_size]) + l.weights = weight_array \ No newline at end of file diff --git a/data.py b/data.py index 810b4ea7d..56c5a0b53 100644 --- a/data.py +++ b/data.py @@ -1,209 +1,46 @@ -from drawer import * +""" +file: ./data.py +includes: shuffle() +shuffle will load the cPickle dump parsed.bin inside +""" + import cPickle as pickle -from copy import deepcopy -import subprocess -mult = 1. +from yolo.train import * + +off_bound_msg = 'Random scale/translate sends obj off bound' -def shuffle(train_path, file, expectC, S, batch, epoch): - with open(file,'rb') as f: - pick, data = pickle.load(f) - C = len(pick) - if C != expectC: - exit("There is a mismatch between the model and the parsed annotations") +def shuffle(train_path, parsed, batch, epoch, meta): + with open(parsed, 'rb') as f: data = pickle.load(f)[0] size = len(data) - print 'Dataset of total {}'.format(size) + print 'Dataset of {} instance(s)'.format(size) + if batch > size: exit('Error: batch size is too big') batch_per_epoch = int(size / batch) + total = epoch * batch_per_epoch + yield total for i in range(epoch): print 'EPOCH {}'.format(i+1) # Shuffle data shuffle_idx = np.random.permutation(np.arange(size)) for b in range(batch_per_epoch): - for r in range(1): - start_idx = b * batch - end_idx = (b+1) * batch - - datum = list() - x_batch = list() - jpgs = list() - try: - # if True: - for j in range(start_idx,end_idx): - real_idx = shuffle_idx[j] - this 
= data[real_idx] - jpg = this[0] - w, h, allobj_ = this[1] - allobj = deepcopy(allobj_) - flip = (r / 2) + (r % 2) * (j % 2) - flip = flip % 2 - - path = '{}{}'.format(train_path, jpg) - img, allobj = crop(path, allobj) - - if flip == 1: - img = img[:,:,::-1,:] - - img = [img] - jpgs += [path] - - cellx = 1. * w / S - celly = 1. * h / S - for x in allobj: - # cv2.rectangle(img[0], (x[1], x[2]), (x[3], x[4]), (0,0,255), 2) - centerx = .5*(x[1]+x[3]) #xmin, xmax - centery = .5*(x[2]+x[4]) #ymin, ymax - if flip == 1: - centerx = w - centerx - cx = centerx / cellx - cy = centery / celly - x[3] = float(x[3]-x[1]) / w - x[4] = float(x[4]-x[2]) / h - x[3] = np.sqrt(x[3]) - x[4] = np.sqrt(x[4]) - x[1] = cx - np.floor(cx) - x[2] = cy - np.floor(cy) - x += [np.floor(cx)] - x += [np.floor(cy)] - - # if False: - # for x in allobj: - # cx = x[5] + x[1] - # cy = x[6] + x[2] - # centerx = cx * cellx - # centery = cy * celly - # ww = x[3] * x[3] * w - # hh = x[4] * x[4] * h - # cv2.rectangle(im, - # (int(centerx - ww/2), int(centery - hh/2)), - # (int(centerx + ww/2), int(centery + hh/2)), - # (0,0,255), 2) - - # cv2.imshow("result", im) - # cv2.waitKey() - # cv2.destroyAllWindows() - - """ - YOLO formulates the problem as a regression problem. Normally from the - annotation, we can directly produce a target tensor to calculate the L2 - loss as (network_output - target)^2. But YOLO's L2 loss formulation is not - that straightforward, namely the complication comes from its loss is selective: - not penalizes all entries in the network_output, depending on what network_output - looks like during training, moreover the loss also weights each term in the loss - differently, e.g. coordinate term is weighted more than confidence terms, etc. - - To resolve this complication, I came up with a procedure that can calculate YOLO's - loss function in two parts, all the operation in each part are tensor operations. 
The - first part is done here during minibatch yielding, tensor operations are done on numpy - tensors, the second part is done in decode() method inside tfnet.py, as tensorflow tensors. - Why the seperation? I believe there are three reasons: 1. tensorflow tensors - does not support member assignment, so any operation involving member assignment must be - done as numpy tensors. 2. Efficiency: some operation are best to be done here than there. - 3. Inherent constraints in YOLO's formulation of the loss, please read the comming text - for details. - - The following text explains the next 11 tensors that I'll define - They will be passed as placeholders into the network and serve as - materials for calculating YOLO's loss. I look forward to suggestions - on improving this (my) current approach. - ----------------------------------------------------------------- - - probs is the target class probability tensor - confs1 and confs2 are confidence score of boxes 1 and boxes 2 - upleft are upper left corner coordinates of bounding boxes - botright are bottom right corner coordinates of bounding boxes - So far, probs, confs1, confs2, upleft, botright constitutes the target - of regression, why do we need the ___id tensors? - - You know from the paper that only grid cells that are responsible for - correct prediction are penalized (by an L2 loss), so not all entries in - the above tensors should take part in the loss calculation, furthermore - according to the paper, coordinates terms in the loss should be weighted more - than the other terms, and of two boxes that each grid cell predicts, one with better - IOU should be weighted differently than the other. - - These __id tensors are meant to solve the above complication. They act as weights - and will be set to appropriate value either in data.py (as numpy tensors, during the - batch generating phase (this file)) or in tfnet.py (as tensorflow tensors, during the - loss calculation phase). 
For example, if an entry should not affect the loss, its - corresponding weight will be set to zero, if an entry correspond to coordinate loss, - the weight should be 5.0, so on. - - proid will weight probs, and its final value is set here in data.py - conid1 weights confs1 - conid2 weights confs2 - cooid1 weights coordinate of box1 - cooid2 weights coordinate of box2 - - conid1, conid2, cooid1, cooid2's values are initialised in data.py and set to correct value - in tfnet.py. Why? because we only know their correct value when IOU of each predicted box - with the target are calculated, i.e. the forward pass must be done before this. - """ - probs = np.zeros([S*S,C]) - confs = np.zeros([S*S,2]) - coord = np.zeros([S*S,2,4]) - proid = np.zeros([S*S,C]) - conid = np.zeros([S*S,2]) - cooid1 = cooid2 = np.zeros([S*S,1,4]) - prear = np.zeros([S*S,4]) - for x in allobj: - at = int(x[6] * S + x[5]) - probs[at, :] = [0.] * C - probs[at, pick.index(x[0])] = 1. - proid[at, :] = [1] * C - coord[at, 0, :] = x[1:5] - coord[at, 1, :] = x[1:5] - scale = .5 * S - prear[at,0] = x[1] - x[3]**2 * scale # xleft - prear[at,1] = x[2] - x[4]**2 * scale # yup - prear[at,2] = x[1] + x[3]**2 * scale # xright - prear[at,3] = x[2] + x[4]**2 * scale # ybot - confs[at, :] = [1.] * 2 - conid[at, :] = [1.] * 2 - cooid1[at, 0, :] = [1.] * 4 - cooid2[at, 0, :] = [1.] 
* 4 - upleft = np.expand_dims(prear[:,0:2], 1) # 49 x 1 - botright = np.expand_dims(prear[:,2:4], 1) - - # Finalise the placeholders' values - probs = probs.reshape([-1]) # true_class - confs1 = confs[:,0] - confs2 = confs[:,1] - coord = coord.reshape([-1]) # true_coo - upleft = np.concatenate([upleft]*2,1) - botright = np.concatenate([botright]*2,1) - proid = proid.reshape([-1]) # class_idtf - conid1 = conid[:,0] - conid2 = conid[:,1] - cooid1 = cooid1 - cooid2 = cooid2 - - # Assemble the placeholders' value - new = [ - [probs], [confs1], [confs2], [coord], - [upleft], [botright], - [proid], [conid1], [conid2], [cooid1], [cooid2] - ] - if datum == list(): - datum = new - x_batch = img - else: - x_batch += img - for i in range(len(datum)): - datum[i] = np.concatenate([datum[i], new[i]]) - - if False: - here = 0 - names = list() - while here + C < S*S*C: - consider = probs[here:here+C] - if (np.sum(consider) > 0.5): - names += [pick[np.argmax(consider)]] - here += C - print '{} : {}'.format(jpg, names) - - - x_batch = np.concatenate(x_batch, 0) - yield (x_batch, datum) - except: - print 'Random scale/translate sends object(s) out of bound' - continue + start_idx = b * batch + end_idx = (b+1) * batch + + datum = list() + x_batch = list() + offbound = False + for j in range(start_idx,end_idx): + real_idx = shuffle_idx[j] + this = data[real_idx] + img, tensors = yolo_batch(train_path, this, meta) + if img is None: offbound = True; break + x_batch += [img] + if datum == list(): datum = tensors + else: + for i in range(len(datum)): + new_datum_i = [datum[i], tensors[i]] + datum[i] = np.concatenate(new_datum_i) + + if offbound: print off_bound_msg; continue + x_batch = np.concatenate(x_batch, 0) + yield (x_batch, datum) diff --git a/drawer.py b/drawer.py deleted file mode 100644 index 64b3dd7b7..000000000 --- a/drawer.py +++ /dev/null @@ -1,131 +0,0 @@ -from box import * -from PIL import Image, ImageFile -ImageFile.LOAD_TRUNCATED_IMAGES = True -import cv2 - -def fix(x,c): 
- return max(min(x,c),0) - -def crop(imPath, allobj = None): - - im = cv2.imread(imPath) - if allobj is not None: - h, w, _ = im.shape - scale = np.random.uniform()/3. + 1. - max_offx = (scale-1.) * w - max_offy = (scale-1.) * h - offx = int(np.random.uniform() * max_offx) - offy = int(np.random.uniform() * max_offy) - im = cv2.resize(im, (0,0), fx = scale, fy = scale) - im = im[offy : (offy + h), offx : (offx + w)] - #--------------- - # (x,y) --> (scale*x, scale*y) - # (scale*x - offx, scale*y - offy) - #-------------- - for obj in allobj: - obj[1] = int(obj[1]*scale-offx) - obj[3] = int(obj[3]*scale-offx) - obj[2] = int(obj[2]*scale-offy) - obj[4] = int(obj[4]*scale-offy) - obj[1] = fix(obj[1], w) - obj[3] = fix(obj[3], w) - obj[2] = fix(obj[2], h) - obj[4] = fix(obj[4], h) - #print obj, w, h - - # return im - im_ = cv2.resize(im, (448, 448)) - image_array = np.array(im_) - image_array = image_array / 255. - image_array = image_array * 2. - 1. - image_array = np.expand_dims(image_array, 0) # 1, height, width, 3 - - if allobj is not None: - return image_array, allobj - else: - return image_array - -def to_color(indx, base): - base2 = base * base - b = indx / base2 - r = (indx % base2) / base - g = (indx % base2) % base - return (b * 127, r * 127, g * 127) - -def draw_predictions(predictions, - img_path, flip, threshold, - C, S, labels, colors): - - B = 2 - boxes = [] - SS = S * S # number of grid cells - prob_size = SS * C # class probabilities - conf_size = SS * B # confidences for each grid cell - probs = predictions[0 : prob_size] - confs = predictions[prob_size : (prob_size + conf_size)] - cords = predictions[(prob_size + conf_size) : ] - probs = probs.reshape([SS, C]) - confs = confs.reshape([SS, B]) - cords = cords.reshape([SS, B, 4]) - - for grid in range(SS): - for b in range(B): - new_box = BoundBox(C) - new_box.c = confs[grid, b] - new_box.x = (cords[grid, b, 0] + grid % S) / S - new_box.y = (cords[grid, b, 1] + grid // S) / S - new_box.w = cords[grid, 
b, 2] ** 2 - new_box.h = cords[grid, b, 3] ** 2 - new_box.id = '{}-{}'.format(grid, b) - for c in range(C): - new_box.probs[c] = new_box.c * probs[grid, c] - boxes.append(new_box) - - # non max suppress boxes - if True: - for c in range(C): - for i in range(len(boxes)): boxes[i].class_num = c - boxes = sorted(boxes, cmp=prob_compare) - for i in range(len(boxes)): - boxi = boxes[i] - if boxi.probs[c] == 0: continue - for j in range(i + 1, len(boxes)): - boxj = boxes[j] - boxij = box_intersection(boxi, boxj) - boxja = boxj.w * boxj.h - apart = boxij / boxja - if apart >= .5: - if boxi.probs[c] > boxj.probs[c]: - boxes[j].probs[c] = 0. - else: - boxes[i].probs[c] = 0. - - imgcv = cv2.imread(img_path) - if flip: imgcv = cv2.flip(imgcv, 1) - print img_path - h, w, _ = imgcv.shape - for b in boxes: - max_indx = np.argmax(b.probs) - max_prob = b.probs[max_indx] - label = 'object' * int(C < 2) - label += labels[max_indx] * int(C > 1) - if (max_prob > threshold): - left = int ((b.x - b.w/2.) * w) - right = int ((b.x + b.w/2.) * w) - top = int ((b.y - b.h/2.) * h) - bot = int ((b.y + b.h/2.) 
* h) - if left < 0 : left = 0 - if right > w - 1: right = w - 1 - if top < 0 : top = 0 - if bot > h - 1: bot = h - 1 - thick = int((h+w)/300) - cv2.rectangle(imgcv, - (left, top), (right, bot), - colors[max_indx], thick) - mess = '{}:{:.3f}'.format(label, max_prob) - cv2.putText(imgcv, mess, (left, top - 12), - 0, 1e-3 * h, colors[max_indx],thick/5) - - img_name = 'results/{}'.format( - img_path.split('/')[-1].split('.')[0]) - cv2.imwrite(img_name + flip * '_' + '.jpg', imgcv) \ No newline at end of file diff --git a/genw.py b/genw.py index 2bbaf679d..8a7b50675 100644 --- a/genw.py +++ b/genw.py @@ -1,32 +1,33 @@ -from configs.process import cfg_yielder -from yolo import * +from configs.process import * +from yolo.train import * +from tensorflow import flags +from darknet import * import numpy as np import os import sys +flags.DEFINE_string("src", "", "source of recollection: model name if source is complete, file name if source is partial, blank if no source") +flags.DEFINE_string("des", "", "name of new model") +flags.DEFINE_float("std", 1e-2, "standard deviation of random initialization") +FLAGS = flags.FLAGS +src = FLAGS.src +des = FLAGS.des -src = sys.argv[1] -try: - des = sys.argv[2] -except: - des = src - src = str() - -wlayer = ['CONVOLUTIONAL', 'CONNECTED'] +wlayer = ['convolutional', 'connected'] class collector(object): - def __init__(self, yolo): + def __init__(self, net): self.i = 0 - self.yolo = yolo + self.net = net def inc(self): - while self.yolo.layers[self.i].type not in wlayer: + while self.net.layers[self.i].type not in wlayer: self.i += 1 - if self.i == len(self.yolo.layers): + if self.i == len(self.net.layers): break def give(self): self.inc() - l = self.yolo.layers[self.i] + l = self.net.layers[self.i] w = l.weights - if l.type == 'CONVOLUTIONAL': + if l.type == 'convolutional': w = w.transpose([3,2,0,1]) w = w.reshape([-1]) w = np.concatenate((l.biases, w)) @@ -40,8 +41,12 @@ def give(self): offset = int(16) if src != str(): - yolo = 
YOLO(src) - col = collector(yolo) + partial = False + if ".weights" in src: + partial = True + src = des # same structure + net = Darknet(src, partial) + col = collector(net) flag = True # PHASE 01: recollect @@ -61,8 +66,6 @@ def give(self): elif not flag: mark = i break - if mark == i: - print 'none' else: flag = False @@ -75,11 +78,11 @@ def give(self): print k if k[0] == 'conv': w = np.random.normal( - scale = .05, + scale = FLAGS.std, size = (k[1]*k[1]*k[2]*k[3]+k[3],)) else: w = np.random.normal( - scale = .05, + scale = FLAGS.std, size = (k[6]*k[7]+k[7],)) w = np.float32(w) writer.write(w.tobytes()) diff --git a/labels.txt b/labels.txt index 95d1ed6c7..871a65b4e 100644 --- a/labels.txt +++ b/labels.txt @@ -1,3 +1,2 @@ tvmonitor -pottedplant -person \ No newline at end of file +pottedplant \ No newline at end of file diff --git a/main.py b/main.py index 3a6847b27..5d807d146 100644 --- a/main.py +++ b/main.py @@ -1,32 +1,28 @@ -from yolo import * -from box import * +from darknet import * from tfnet import * from tensorflow import flags -import sys -import time -import os -flags.DEFINE_string("test", "data", "path to testing folder") -flags.DEFINE_string("pascal", "../pascal/VOCdevkit", "path to training set") +flags.DEFINE_string("testset", "test", "path to testing directory") +flags.DEFINE_string("dataset", "../pascal/VOCdevkit/IMG/", "path to dataset directory") +flags.DEFINE_string("annotation", "../pascal/VOCdevkit/ANN/", "path to annotation directory") flags.DEFINE_float("threshold", 0.1, "detection threshold") -flags.DEFINE_string("model", "3c", "yolo configuration of choice") +flags.DEFINE_string("model", "3c", "configuration of choice") flags.DEFINE_boolean("train", False, "training mode or not?") -flags.DEFINE_boolean("load", False, "load the newest train in backup/checkpoint") +flags.DEFINE_integer("load", 0, "load a saved backup/checkpoint, -1 for newest") flags.DEFINE_boolean("savepb", False, "save net and weight to a .pb file") 
flags.DEFINE_float("gpu", 0.0, "How much gpu (from 0.0 to 1.0)") flags.DEFINE_float("lr", 1e-5, "Learning rate") -flags.DEFINE_string("scale", "1,1,.5,5.", - "Comma-separated scaling for probability, confidence, noobj, coordinate terms in the loss") flags.DEFINE_integer("keep",20,"Number of most recent training results to save") flags.DEFINE_integer("batch", 12, "Batch size") flags.DEFINE_integer("epoch", 1000, "Number of epoch") flags.DEFINE_integer("save", 2000, "Save checkpoint every ? training examples") + FLAGS = flags.FLAGS -image = FLAGS.pascal + '/IMG/' -annot = FLAGS.pascal + '/ANN/' + 'parsed.yolotf' +image = FLAGS.dataset +annot = FLAGS.annotation + 'parsed.bin' step = int() -if FLAGS.load: +if FLAGS.load < 0: try: with open('backup/checkpoint','r') as f: lines = f.readlines() @@ -34,20 +30,23 @@ sys.exit('Seems like there is no recent training in backup/') name = lines[-1].split(' ')[1].split('"')[1] step = int(name.split('-')[1]) -yoloNet = YOLO(FLAGS.model + int(step > 0) * '-{}'.format(step)) +else: step = FLAGS.load +yoloNet = Darknet(FLAGS.model + int(step > 0) * '-{}'.format(step)) -print ('Compiling net & initialise parameters...') +print ('\nCompiling net & fill in parameters...') start = time.time() if FLAGS.gpu <= 0.: with tf.device('cpu:0'): - model = SimpleNet(yoloNet, FLAGS) + tfnet = TFNet(yoloNet, FLAGS) else: - model = SimpleNet(yoloNet, FLAGS) -model.step = step -model.setup_meta_ops(FLAGS) + tfnet = TFNet(yoloNet, FLAGS) +tfnet.step = step +tfnet.setup_meta_ops() print ('Finished in {}s'.format(time.time() - start)) if FLAGS.train: - print 'training mode' - model.train(image, annot, FLAGS.batch, FLAGS.epoch) -model.predict(FLAGS) \ No newline at end of file + print '\nEnter training ...' 
+ tfnet.train(image, annot, FLAGS.batch, FLAGS.epoch) + +print +tfnet.predict() \ No newline at end of file diff --git a/ops.py b/ops.py new file mode 100644 index 000000000..7a0205e7f --- /dev/null +++ b/ops.py @@ -0,0 +1,42 @@ +from yolo.train import * + +def convl(l, x, name): + if l.pad < 0: # figure the pad out + size = np.int(x.get_shape()[1]) + expect = -(l.pad + 1) * l.stride + expect += l.size - size + padding = [expect / 2, expect - expect / 2] + if padding[0] < 0: padding[0] = 0 + if padding[1] < 0: padding[1] = 0 + else: + padding = [l.pad, l.pad] + l.pad = 'VALID' + x = tf.pad(x, [[0, 0], padding, padding, [0, 0]]) + x = tf.nn.conv2d(x, l.weights, + padding = l.pad, name = name, + strides=[1, l.stride, l.stride, 1]) + # if l.batch_norm == 1: x = slim.batch_norm(x) + # else: x = tf.nn.bias_add(x, l.b) + return tf.nn.bias_add(x, l.biases) + +def bnorm(l, x, name): + return x + +def dense(l, x, name): + return tf.nn.xw_plus_b(x, l.weights, l.biases, name = name) + +def maxpool(l, x, name): + l.pad = 'VALID' + return tf.nn.max_pool(x, padding = l.pad, + ksize = [1,l.size,l.size,1], name = name, + strides = [1,l.stride,l.stride,1]) + +def flatten(x, name): + x = tf.transpose(x, [0,3,1,2]) + return slim.flatten(x, scope = name) + +def leaky(x, name): + return tf.maximum(.1*x, x, name = name) + +def dropout(x, drop, name): + return tf.nn.dropout(x, drop, name = name) diff --git a/tfnet.py b/tfnet.py index dd96a4a38..7c426b6a7 100644 --- a/tfnet.py +++ b/tfnet.py @@ -1,125 +1,84 @@ -import tensorflow as tf -import numpy as np -import os -import time -from drawer import * -from data import shuffle -from yolo import * -import subprocess -import sys - -class SimpleNet(object): +""" +file: tfnet.py +includes: definition of class TFNet +this class initializes by building the forward pass +its methods include train, predict and savepb - saving +the current model to a protobuf file (no variable included) +""" - labels = list() - colors = list() - C = int() - model = 
str() - step = int() - learning_rate = float() - scale_prob = float() - scale_conf = float() - scale_noobj = float() - scale_coor = float() - save_every = int() +import sys +from yolo.drawer import * +from darknet import * +from ops import * +from data import * - def __init__(self, yolo, FLAGS): - self.model = yolo.model - self.S = yolo.S - self.labels = yolo.labels - self.C = len(self.labels) +const_layer = ['leaky', 'dropout'] +var_layer = ['convolutional', 'connected', 'batchnorm'] - base = int(np.ceil(pow(self.C, 1./3))) - for x in range(len(self.labels)): - self.colors += [to_color(x, base)] +class TFNet(object): + def __init__(self, darknet, FLAGS): + # Attach model's hyper params to the tfnet + self.meta = yolo_metaprocess(darknet.meta) + self.FLAGS = FLAGS + # Placeholders self.inp = tf.placeholder(tf.float32, [None, 448, 448, 3], name = 'input') - self.drop = tf.placeholder(tf.float32, name = 'dropout') - + self.drop = dict() + self.feed = dict() + + # Iterate through darknet layers now = self.inp - for i in range(yolo.layer_number): - print now.get_shape() - l = yolo.layers[i] - if l.type == 'CONVOLUTIONAL': - if l.pad < 0: - size = np.int(now.get_shape()[1]) - expect = -(l.pad + 1) * l.stride # there you go bietche - expect += l.size - size - padding = [expect / 2, expect - expect / 2] - if padding[0] < 0: padding[0] = 0 - if padding[1] < 0: padding[1] = 0 - else: - padding = [l.pad, l.pad] - l.pad = 'VALID' - now = tf.pad(now, [[0, 0], padding, padding, [0, 0]]) - if FLAGS.savepb: - b = tf.constant(l.biases) - w = tf.constant(l.weights) - else: - b = tf.Variable(l.biases) - w = tf.Variable(l.weights) - now = tf.nn.conv2d(now, w, - strides=[1, l.stride, l.stride, 1], - padding=l.pad) - now = tf.nn.bias_add(now, b) - now = tf.maximum(0.1 * now, now) - elif l.type == 'MAXPOOL': - l.pad = 'VALID' - now = tf.nn.max_pool(now, - padding = l.pad, - ksize = [1,l.size,l.size,1], - strides = [1,l.stride,l.stride,1]) - elif l.type == 'FLATTEN': - now = 
tf.transpose(now, [0,3,1,2]) - now = tf.reshape(now, - [-1, int(np.prod(now.get_shape()[1:]))]) - elif l.type == 'CONNECTED': - name = str() - if i == yolo.layer_number - 1: name = 'output' - else: name = 'conn' - if FLAGS.savepb: - b = tf.constant(l.biases) - w = tf.constant(l.weights) - else: - b = tf.Variable(l.biases) - w = tf.Variable(l.weights) - now = tf.nn.xw_plus_b(now, w, b, name = name) - elif l.type == 'LEAKY': - now = tf.maximum(0.1 * now, now) - elif l.type == 'DROPOUT': - if not FLAGS.savepb: - print ('dropout') - now = tf.nn.dropout(now, keep_prob = self.drop) - print now.get_shape() + for i, l in enumerate(darknet.layers): + if i == len(darknet.layers)-1: name = 'output' + else: name = l.type+'-{}'.format(i) + # no variable when saving to .pb file + if l.type in var_layer and not FLAGS.savepb: + l.biases = tf.Variable(l.biases) + l.weights = tf.Variable(l.weights) + arg = [l, now, name] + if l.type=='convolutional': now = convl(*arg) + elif l.type == 'connected': now = dense(*arg) + elif l.type == 'batchnorm': now = bnorm(*arg) + elif l.type == 'maxpool': now = maxpool(*arg) + elif l.type == 'flatten': now = flatten(*arg[1:]) + elif l.type == 'leaky' : now = leaky(*arg[1:]) + # Dropout + elif l.type == 'dropout' and not FLAGS.savepb: + self.drop[name] = tf.placeholder(tf.float32) + self.drop[name + '_'] = l.prob + self.feed[self.drop[name]] = self.drop[name+'_'] + print 'Dropout p = {}'.format(l.prob) + now = dropout(now, self.drop[name], name) + if l.type not in const_layer: print now.get_shape() + + # Attach the output to this tfnet self.out = now - def setup_meta_ops(self, FLAGS): - self.save_every = FLAGS.save - self.learning_rate = FLAGS.lr - scales = [float(f) for i, f in enumerate(FLAGS.scale.split(','))] - self.scale_prob, self.scale_conf, self.scale_noobj, self.scale_coor = scales - if FLAGS.gpu > 0: + def setup_meta_ops(self): + if self.FLAGS.gpu > 0: percentage = min(FLAGS.gpu, 1.) 
- print 'gpu mode {} usage'.format(percentage) + print 'GPU mode with {} usage'.format(percentage) gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=percentage) self.sess = tf.Session(config = tf.ConfigProto( allow_soft_placement = True, log_device_placement = False, gpu_options = gpu_options)) else: - print 'cpu mode' + print 'CPU mode' self.sess = tf.Session(config = tf.ConfigProto( allow_soft_placement = False, log_device_placement = False)) - if FLAGS.train: self.decode() - if FLAGS.savepb: - self.savepb('graph-{}.pb'.format(self.model)) + if self.FLAGS.train: yolo_loss(self) + if self.FLAGS.savepb: + self.savepb('graph-{}.pb'.format(self.meta['model'])) sys.exit() - else: self.saver = tf.train.Saver(tf.all_variables(), max_to_keep = FLAGS.keep) + else: self.saver = tf.train.Saver(tf.all_variables(), + max_to_keep = self.FLAGS.keep) self.sess.run(tf.initialize_all_variables()) - if FLAGS.load: + if self.FLAGS.load > 0: load_point = 'backup/model-{}'.format(self.step) - print 'loading from {}'.format(load_point) + print 'Loading from {}'.format(load_point) self.saver.restore(self.sess, load_point) def savepb(self, name): @@ -128,7 +87,7 @@ def savepb(self, name): def to_constant(self, inc = 0): with open('binaries/yolo-{}-{}.weights'.format( - self.model.split('-')[0], self.step + inc), 'w') as f: + self.meta['model'].split('-')[0], self.step + inc), 'w') as f: f.write(np.array([1]*4, dtype=np.int32).tobytes()) for i, variable in enumerate(tf.trainable_variables()): val = variable.eval(self.sess) @@ -137,183 +96,61 @@ def to_constant(self, inc = 0): val = val.reshape([-1]) f.write(val.tobytes()) - def decode(self): - """ - Please refer to the comment section inside data.py - to understand the below placeholders. 
I look forward - to receiving comments/improvements on my current - implementation of YOLO's loss calculation - """ - - print ('Set up loss and train ops (may cause lag)...') - SS = self.S * self.S - self.true_class = tf.placeholder(tf.float32, # - [None, SS * self.C]) - self.true_coo = tf.placeholder(tf.float32, # - [None, SS * 2 * 4]) - self.class_idtf = tf.placeholder(tf.float32, # - [None, SS * self.C]) - self.cooid1 = tf.placeholder(tf.float32, # - [None, SS, 1, 4]) - self.cooid2 = tf.placeholder(tf.float32, # - [None, SS, 1, 4]) - self.confs1 = tf.placeholder(tf.float32, # - [None, SS]) - self.confs2 = tf.placeholder(tf.float32, # - [None, SS]) - self.conid1 = tf.placeholder(tf.float32, # - [None, SS]) - self.conid2 = tf.placeholder(tf.float32, # - [None, SS]) - self.upleft = tf.placeholder(tf.float32, # - [None, SS, 2, 2]) - self.botright = tf.placeholder(tf.float32, # - [None, SS, 2, 2]) - - # Extract the coordinate prediction from - # output of YOLO's net - coords = self.out[:, SS * (self.C + 2):] - coords = tf.reshape(coords, [-1, SS, 2, 4]) - - wh = tf.pow(coords[:,:,:,2:4], 2) * (.5 * self.S); # weight & height of each box - xy = coords[:,:,:,0:2] # the center coordinates of each box - floor = xy - wh - ceil = xy + wh - - # calculate the coordinates of the intersection - # between predicted boxes and correct boxes - intersect_upleft = tf.maximum(floor, self.upleft) - intersect_botright = tf.minimum(ceil, self.botright) - intersect_wh = intersect_botright - intersect_upleft - intersect_wh = tf.maximum(intersect_wh, 0.0) - - # calculate the areas of intersection - intersect_area1 = tf.mul(intersect_wh[:,:,0,0], intersect_wh[:,:,0,1]) - intersect_area2 = tf.mul(intersect_wh[:,:,1,0], intersect_wh[:,:,1,1]) - # determine which box has worse & which box has better IOU to ground truth - inferior_cell = intersect_area1 > intersect_area2 - inferior_cell = tf.to_float(inferior_cell) - - # since the initial value of confs is 1.0 throughout - # now we know which 
box of each pair has worse IOU - # its value should be set to 0.0 - confs1 = tf.mul(inferior_cell, self.confs1) - confs2 = tf.mul((1.-inferior_cell), self.confs2) - confs1 = tf.expand_dims(confs1, -1) - confs2 = tf.expand_dims(confs2, -1) - confs = tf.concat(2, [confs1, confs2]) - - # Again, since now we know which box of each pair has worse IOU - # it should not contribute to the loss value - # hence the corresponding conid is set to 0.0 - mult = inferior_cell - conid1 = tf.mul(mult, self.conid1) - conid2 = tf.mul((1. - mult), self.conid2) - conid1 = tf.expand_dims(conid1, -1) - conid2 = tf.expand_dims(conid2, -1) - conid = tf.concat(2, [conid1, conid2]) + def train(self, train_set, parsed_annota, batch, epoch): + batches = shuffle(train_set, parsed_annota, batch, epoch, self.meta) + + print 'Training statistics:' + print ' Learning rate : {}'.format(self.FLAGS.lr) + print ' Batch size : {}'.format(batch) + print ' Epoch number : {}'.format(epoch) + print ' Backup every : {}'.format(self.FLAGS.save) + + total = int() # total number of batches + for i, packet in enumerate(batches): + if i == 0: total = packet; continue + x_batch, datum = packet + feed_dict = yolo_feed_dict(self, x_batch, datum) + feed_dict[self.inp] = x_batch + for k in self.feed: feed_dict[k] = self.feed[k] - # Again, since now we know which box of each pair has worse IOU - # it should not contribute to the loss value, - # hence the corresponding cooid is set to 0.0 - times = tf.expand_dims(inferior_cell, -1) # [batch, 49, 1] - times = tf.expand_dims(times, 2) # [batch, 49, 1, 1] - times = tf.concat(3, [times]*4) # [batch, 49, 1, 4] - cooid1 = tf.mul(times, self.cooid1) - cooid2 = (1. 
- times) * self.cooid2 - cooid = tf.concat(2, [cooid1, cooid2]) # [batch, 49, 2, 4] - - # reshape - confs = tf.reshape(confs, - [-1, int(np.prod(confs.get_shape()[1:]))]) - conid = tf.reshape(conid, - [-1, int(np.prod(conid.get_shape()[1:]))]) - cooid = tf.reshape(cooid, - [-1, int(np.prod(cooid.get_shape()[1:]))]) - - conid = conid + tf.to_float(conid > .5) * (self.scale_conf - 1.) - conid = conid + tf.to_float(conid < .5) * self.scale_noobj - - # true is the regression target - # idtf is the weight - # the L2 loss of YOLO is then: tf.mul(idtf, (self.out - true)**2) - true = tf.concat(1,[self.true_class, confs, self.true_coo]) - idtf = tf.concat(1,[self.class_idtf * self.scale_prob, conid, - cooid * self.scale_coor]) - - self.loss = tf.pow(self.out - true, 2) - self.loss = tf.mul(self.loss, idtf) - self.loss = tf.reduce_sum(self.loss, 1) - self.loss = .5 * tf.reduce_mean(self.loss) - - optimizer = tf.train.RMSPropOptimizer(self.learning_rate) - gradients = optimizer.compute_gradients(self.loss) - self.train_op = optimizer.apply_gradients(gradients) - - def train(self, train_set, annotate, batch_size, epoch): - batches = shuffle(train_set, annotate, self.C, self.S, batch_size, epoch) - for i, batch in enumerate(batches): - x_batch, datum = batch - feed_dict = { - self.inp : x_batch, - self.drop : .5, - self.true_class : datum[0], - self.confs1 : datum[1], - self.confs2 : datum[2], - self.true_coo : datum[3], - self.upleft : datum[4], - self.botright : datum[5], - self.class_idtf : datum[6], - self.conid1 : datum[7], - self.conid2 : datum[8], - self.cooid1 : datum[9], - self.cooid2 : datum[10], - } _, loss = self.sess.run([self.train_op, self.loss], feed_dict) - print 'step {} - batch {} - loss {}'.format(1+i+self.step, 1+i, loss) - if (i+1) % (self.save_every/batch_size) == 0: - print 'save checkpoint and binaries at step {}'.format(self.step+i+1) - self.saver.save(self.sess, 'backup/model-{}'.format(self.step+i+1)) - self.to_constant(inc = i+1) - - print 'save 
checkpoint and binaries at step {}'.format(self.step+i+1) - self.saver.save(self.sess, 'backup/model-{}'.format(self.step+i+1)) - self.to_constant(inc = i+1) - - def predict(self, FLAGS): - img_path = FLAGS.test - threshold = FLAGS.threshold - all_img_ = os.listdir(img_path) - batch = min(FLAGS.batch, len(all_img_)) - for j in range(len(all_img_)/batch): - img_feed = list() - all_img = all_img_[j*batch: (j*batch+batch)] + print 'step {} - batch {} - loss {}'.format(i+self.step, i, loss) + if i % (self.FLAGS.save/batch) == 0 or i == total: + print 'save checkpoint and binaries at step {}'.format(self.step+i) + self.saver.save(self.sess, 'backup/model-{}'.format(self.step+i)) + self.to_constant(inc = i) + + def predict(self): + inp_path = self.FLAGS.testset + all_inp_ = os.listdir(inp_path) + all_inp_ = [i for i in all_inp_ if is_yolo_inp(i)] + batch = min(self.FLAGS.batch, len(all_inp_)) + + for j in range(len(all_inp_)/batch): + inp_feed = list() + all_inp = all_inp_[j*batch: (j*batch+batch)] new_all = list() - for img in all_img: - if '.jpg' not in img: continue - new_all += [img] - this_img = '{}/{}'.format(img_path, img) - this_img = crop(this_img) - img_feed.append(this_img) - img_feed.append(this_img[:,:,::-1,:]) - all_img = new_all - - feed_dict = { - self.inp : np.concatenate(img_feed, 0), - self.drop : 1.0 - } + for inp in all_inp: + new_all += [inp] + this_inp = '{}/{}'.format(inp_path, inp) + this_inp = yolo_preprocess(this_inp) + inp_feed.append(this_inp) + all_inp = new_all + + feed_dict = {self.inp : np.concatenate(inp_feed, 0)} + for k in self.feed: feed_dict[k] = 1.0 - print ('Forwarding {} images ...'.format(len(img_feed))) + print ('Forwarding {} inputs ...'.format(len(inp_feed))) start = time.time() out = self.sess.run([self.out], feed_dict) stop = time.time() last = stop - start - print ('Total time = {}s / {} imgs = {} fps'.format( - last, len(img_feed), len(img_feed) / last)) + print ('Total time = {}s / {} inps = {} ips'.format( + last, 
len(inp_feed), len(inp_feed) / last)) + for i, prediction in enumerate(out[0]): - draw_predictions( - prediction, - '{}/{}'.format(img_path, all_img[i/2]), - i % 2, threshold, - self.C, self.S, self.labels, self.colors) - print ('Results stored in results/') + yolo_postprocess( + prediction, '{}/{}'.format(inp_path, all_inp[i]), + self.FLAGS, self.meta) + + print ('Results stored in results/') diff --git a/yolo.py b/yolo.py deleted file mode 100644 index 4dd7a1a3c..000000000 --- a/yolo.py +++ /dev/null @@ -1,146 +0,0 @@ -import numpy as np -import os -import tensorflow as tf -import time -from configs.process import cfg_yielder - -labels20 = ["aeroplane", "bicycle", "bird", "boat", "bottle", - "bus", "car", "cat", "chair", "cow", "diningtable", "dog", - "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", - "train", "tvmonitor"] -default_models = ['full', 'small', 'tiny'] - -class layer: - def __init__(self, type, size = 0, - c = 0, n = 0, h = 0, w = 0): - self.type = type - self.size = size - self.c, self.n = (c, n) - self.h, self.w = (h, w) - -class maxpool_layer(layer): - def __init__(self, size, c, n, h, w, stride, pad): - layer.__init__(self, 'MAXPOOL', - size, c, n, h, w) - self.stride = stride - self.pad = pad - -class convolu_layer(layer): - def __init__(self, size, c, n, h, w, stride, pad): - layer.__init__(self, 'CONVOLUTIONAL', - size, c, n, h, w) - self.stride = stride - self.pad = pad - -class connect_layer(layer): - def __init__(self, size, c, n, h, w, - input_size, output_size): - layer.__init__(self, 'CONNECTED', - size, c, n, h, w) - self.output_size = output_size - self.input_size = input_size - -class YOLO(object): - - layers = [] - S = int() - model = str() - - def __init__(self, model): - with open('labels.txt', 'r') as f: - pick = f.readlines() - for i in range(len(pick)): pick[i] = pick[i].strip() - if model in default_models: pick = labels20 - self.labels = pick - self.model = model - self.layers = [] - self.build(model) - 
self.layer_number = len(self.layers) - postfix = int('-' in model) * 'binaries/' - weight_file = postfix + 'yolo-{}.weights'.format(model) - print ('Loading {} ...'.format(weight_file)) - start = time.time() - self.loadWeights(weight_file) - stop = time.time() - print ('Finished in {}s'.format(stop - start)) - - def build(self, model): - cfg = model.split('-')[0] - print ('parsing yolo-{}.cfg'.format(cfg)) - layers = cfg_yielder(cfg) - for i, info in enumerate(layers): - if i == 0: - self.S = info - continue - if len(info) == 1: new = layer(type = info[0]) - if info[0] == 'conv': new = convolu_layer(*info[1:]) - if info[0] == 'pool': new = maxpool_layer(*info[1:]) - if info[0] == 'conn': new = connect_layer(*info[1:]) - self.layers.append(new) - - def loadWeights(self, weight_path): - self.startwith = np.array( - np.memmap(weight_path, mode = 'r', - offset = 0, shape = (), - dtype = '(4)i4,')) - #self.startwith = np.array(self.startwith) - offset = 16 - chunkMB = 1000 - chunk = int(chunkMB * 2**18) - - # Read byte arrays from file - for i in range(self.layer_number): - l = self.layers[i] - if l.type == "CONVOLUTIONAL": - weight_number = l.n * l.c * l.size * l.size - l.biases = np.memmap(weight_path, mode = 'r', - offset = offset, shape = (), - dtype = '({})float32,'.format(l.n)) - offset += 4 * l.n - l.weights = np.memmap(weight_path, mode = 'r', - offset = offset, shape = (), - dtype = '({})float32,'.format(weight_number)) - offset += 4 * weight_number - - elif l.type == "CONNECTED": - bias_number = l.output_size - weight_number = l.output_size * l.input_size - l.biases = np.memmap(weight_path, mode = 'r', - offset = offset, shape = (), - dtype = '({})float32,'.format(bias_number)) - offset += bias_number * 4 - - chunks = [chunk] * (weight_number / chunk) - chunks += [weight_number % chunk] - l.weights = np.array([], dtype = np.float32) - for c in chunks: - l.weights = np.concatenate((l.weights, - np.memmap(weight_path, mode = 'r', - offset = offset, shape = (), - 
dtype = '({})float32,'.format(c)))) - offset += c * 4 - - # Defensive python right here bietch. - if offset == os.path.getsize(weight_path): - print ('Successfully identified all {} bytes'.format( - offset)) - else: - print 'expect ', offset, ' bytes, found ', os.path.getsize(weight_path) - exit() - - # Reshape - for i in range(self.layer_number): - l = self.layers[i] - - if l.type == 'CONVOLUTIONAL': - weight_array = l.weights - weight_array = np.reshape(weight_array, - [l.n, l.c, l.size, l.size]) - weight_array = weight_array.transpose([2,3,1,0]) - l.weights = weight_array - - if l.type == 'CONNECTED': - weight_array = l.weights - weight_array = np.reshape(weight_array, - [l.input_size, l.output_size]) - l.weights = weight_array diff --git a/yolo/__init__.py b/yolo/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/box.py b/yolo/box.py similarity index 100% rename from box.py rename to yolo/box.py diff --git a/yolo/drawer.py b/yolo/drawer.py new file mode 100644 index 000000000..7582f0090 --- /dev/null +++ b/yolo/drawer.py @@ -0,0 +1,191 @@ +""" +file: yolo/drawer.py +includes: yolo_metaprocess(), yolo_preprocess() and yolo_postprocess() +together they add yolo framework's specificities into the general framework: + 0. what to do with the net's hyper-parameters? + 1. what to do before flowing the net? + 2. what to do with the net's output? +""" + +from box import * +from PIL import Image, ImageFile +ImageFile.LOAD_TRUNCATED_IMAGES = True +import cv2 + +labels20 = ["aeroplane", "bicycle", "bird", "boat", "bottle", + "bus", "car", "cat", "chair", "cow", "diningtable", "dog", + "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", + "train", "tvmonitor"] +default_models = ['full', 'small', 'tiny'] + +def yolo_metaprocess(meta): + """ + Add to meta (a dict) `labels` correspond to that model and + `colors` correspond to these labels, for drawing predictions. 
def yolo_metaprocess(meta):
    """
    Add `labels` and `colors` to `meta` (a dict) and return it.

    `labels` is the built-in 20-class list when meta['model'] is one
    of the default models, otherwise the stripped lines of labels.txt
    in the working directory. `colors` holds one (b, r, g) tuple per
    label, used when drawing predictions.

    Raises SystemExit when the number of labels differs from
    meta['classes'].
    """
    def to_color(indx, base):
        # Read `indx` as a three-digit number in radix `base`; every
        # digit drives one colour channel. Floor division keeps the
        # digits integral whether or not true division is in effect.
        base2 = base * base
        b = indx // base2
        r = (indx % base2) // base
        g = (indx % base2) % base
        # 127 per digit is the historical spacing; it is reduced once
        # base exceeds 3 so that no channel leaves the 0..255 range.
        step = min(127, 255 // max(base - 1, 1))
        return (b * step, r * step, g * step)

    if meta['model'] in default_models:
        meta['labels'] = labels20
    else:
        with open('labels.txt', 'r') as f:
            meta['labels'] = [l.strip() for l in f.readlines()]

    # The label count must agree with the class count of the config,
    # otherwise predictions would be drawn with the wrong names.
    if len(meta['labels']) != meta['classes']:
        msg = 'labels.txt and configs/yolo-{}.cfg '
        msg += 'indicate different class number'
        # Same exception the `exit()` builtin raises, without relying
        # on the `site` module having installed that builtin.
        raise SystemExit('Error: {}'.format(msg.format(meta['model'])))

    # Smallest radix whose cube covers every class index.
    base = int(np.ceil(pow(meta['classes'], 1./3)))
    meta['colors'] = [to_color(x, base)
                      for x in range(len(meta['labels']))]
    return meta
def yolo_preprocess(imPath, allobj = None):
    """
    Read the image at `imPath` and return it as a numpy tensor ready
    to be fed into the net: resized to 448 x 448, values mapped from
    [0, 255] to [-1, 1], with a leading batch axis of size 1.

    If `allobj` (parsed annotation, a list of
    [name, xmin, ymin, xmax, ymax]) is given, this call serves the
    training process: the image is randomly scaled, translated and
    flipped, and the boxes in `allobj` are modified IN PLACE to match.
    In that case the pair (tensor, allobj) is returned, otherwise only
    the tensor.

    Raises IOError when the image cannot be read.
    """
    def recolor(im):
        # `im` is a cv2 image python object
        # recolor `im` by adding in random
        # intensity transformations, DO NOT
        # perform shift/scale or rotate here
        # ADD YOUR CODE BELOW:
        return im

    def fix(x, c): # fit x inside [0,c]
        return max(min(x, c), 0)

    im = cv2.imread(imPath)
    if im is None:
        # cv2.imread signals failure by returning None; fail here with
        # the offending path instead of a later, unrelated error.
        raise IOError('Cannot read image: {}'.format(imPath))

    if allobj is not None: # in training mode
        h, w, _ = im.shape
        # Scale up by a random factor in [1, 1.2), then crop a window
        # of the original size at a random offset (= translation).
        scale = np.random.uniform() / 5. + 1.
        max_offx = (scale - 1.) * w
        max_offy = (scale - 1.) * h
        offx = int(np.random.uniform() * max_offx)
        offy = int(np.random.uniform() * max_offy)
        im = cv2.resize(im, (0, 0), fx = scale, fy = scale)
        im = im[offy : (offy + h), offx : (offx + w)]
        flip = np.random.binomial(1, .5)
        for obj in allobj:
            # Apply the same scale/translation to the box corners and
            # clamp them to the cropped window.
            obj[1] = fix(int(obj[1] * scale - offx), w) # xmin
            obj[3] = fix(int(obj[3] * scale - offx), w) # xmax
            obj[2] = fix(int(obj[2] * scale - offy), h) # ymin
            obj[4] = fix(int(obj[4] * scale - offy), h) # ymax
            if flip:
                # Horizontal mirror: xmin and xmax swap roles.
                obj[1], obj[3] = w - obj[3], w - obj[1]

        if flip: im = cv2.flip(im, 1)
        im = recolor(im)

    # return np array input to YOLO
    im_ = cv2.resize(im, (448, 448))
    image_array = np.array(im_)
    image_array = image_array / 255.
    image_array = image_array * 2. - 1.
    image_array = np.expand_dims(image_array, 0)
    if allobj is not None: return image_array, allobj
    else: return image_array
def yolo_postprocess(predictions, img_path, FLAGS, meta):
    """
    Takes net output, draw predictions, save to results/

    predictions: flat numpy vector - net's output for one image,
        laid out as [class probs | box confidences | box coordinates]
    img_path: path of the image the predictions belong to
    FLAGS: supplies `threshold`, the minimum score for a box to be drawn
    meta: supplies `classes`, `num`, `side`, `labels` and `colors`

    The annotated image is written to results/<image name>.jpg.
    Raises IOError when the image cannot be read.
    """
    # Local imports: the enclosing module does not import these.
    import os
    from functools import cmp_to_key

    # meta
    threshold = FLAGS.threshold
    C, B, S = meta['classes'], meta['num'], meta['side']
    colors, labels = meta['colors'], meta['labels']

    SS = S * S # number of grid cells
    prob_size = SS * C # class probabilities
    conf_size = SS * B # confidences for each grid cell
    probs = predictions[0 : prob_size].reshape([SS, C])
    confs = predictions[prob_size : (prob_size + conf_size)]
    confs = confs.reshape([SS, B])
    cords = predictions[(prob_size + conf_size) : ]
    cords = cords.reshape([SS, B, 4])

    boxes = []
    for grid in range(SS):
        for b in range(B):
            new_box = BoundBox(C)
            new_box.c = confs[grid, b]
            # centre: offset inside the cell + cell index, as a
            # fraction of the image side
            new_box.x = (cords[grid, b, 0] + grid % S) / S
            new_box.y = (cords[grid, b, 1] + grid // S) / S
            # the net predicts square roots of width / height
            new_box.w = cords[grid, b, 2] ** 2
            new_box.h = cords[grid, b, 3] ** 2
            new_box.id = '{}-{}'.format(grid, b)
            for c in range(C):
                new_box.probs[c] = new_box.c * probs[grid, c]
            boxes.append(new_box)

    # non max suppress boxes, one class at a time
    for c in range(C):
        for box in boxes: box.class_num = c
        # cmp_to_key keeps the project's comparator while avoiding the
        # `cmp=` keyword, which only Python 2 accepts.
        boxes = sorted(boxes, key = cmp_to_key(prob_compare))
        for i in range(len(boxes)):
            boxi = boxes[i]
            if boxi.probs[c] == 0: continue
            for j in range(i + 1, len(boxes)):
                boxj = boxes[j]
                boxja = boxj.w * boxj.h
                # A zero-area box cannot overlap anything; skipping it
                # avoids a 0 / 0 division.
                if boxja <= 0: continue
                apart = box_intersection(boxi, boxj) / boxja
                if apart >= .5:
                    if boxi.probs[c] > boxj.probs[c]:
                        boxes[j].probs[c] = 0.
                    else:
                        boxes[i].probs[c] = 0.

    imgcv = cv2.imread(img_path)
    if imgcv is None:
        # cv2.imread returns None instead of raising on failure.
        raise IOError('Cannot read image: {}'.format(img_path))
    print(img_path)
    h, w, _ = imgcv.shape
    for b in boxes:
        max_indx = np.argmax(b.probs)
        max_prob = b.probs[max_indx]
        label = 'object' * int(C < 2)
        label += labels[max_indx] * int(C > 1)
        if (max_prob > threshold):
            left  = int ((b.x - b.w/2.) * w)
            right = int ((b.x + b.w/2.) * w)
            top   = int ((b.y - b.h/2.) * h)
            bot   = int ((b.y + b.h/2.) * h)
            # clamp the rectangle to the image
            if left  < 0    :  left = 0
            if right > w - 1: right = w - 1
            if top   < 0    :   top = 0
            if bot   > h - 1:   bot = h - 1
            # cv2 needs integer thickness: use explicit floor division
            thick = (h + w) // 300
            cv2.rectangle(imgcv,
                (left, top), (right, bot),
                colors[max_indx], thick)
            mess = '{}:{:.3f}'.format(label, max_prob)
            cv2.putText(imgcv, mess, (left, top - 12),
                0, 1e-3 * h, colors[max_indx], thick // 5)

    # Strip only the final extension so that names containing dots
    # ("a.1.jpg", "a.2.jpg") do not collapse onto the same output file.
    stem = os.path.splitext(os.path.basename(img_path))[0]
    img_name = 'results/{}'.format(stem)
    cv2.imwrite(img_name + '.jpg', imgcv)
# ignore this function
def show(im, allobj, S, w, h, cellx, celly):
    """
    Debug helper: draw every regression target of `allobj` on `im`
    as a red rectangle and display the result until a key is pressed.

    Each obj is read as [name, x offset in cell, y offset in cell,
    width, height, cell index]; width and height are multiplied by
    `w` and `h`, cell coordinates by `cellx` and `celly`.
    """
    for obj in allobj:
        col = obj[5] % S
        # Floor division: the row index must stay integral even when
        # true division is in effect.
        row = obj[5] // S
        centerx = (col + obj[1]) * cellx
        centery = (row + obj[2]) * celly
        ww = obj[3] * w
        hh = obj[4] * h
        cv2.rectangle(im,
            (int(centerx - ww/2), int(centery - hh/2)),
            (int(centerx + ww/2), int(centery + hh/2)),
            (0,0,255), 2)
    cv2.imshow("result", im)
    cv2.waitKey()
    cv2.destroyAllWindows()
def yolo_batch(train_path, chunk, meta):
    """
    Turn one parsed annotation (`chunk` = [jpg, [w, h, objects]]) into
    the preprocessed image and the list of values for the loss
    placeholders, in the order
    probs, confs, coord, proid, conid, cooid, areas, upleft, botright.

    Returns (None, None) when an object's centre falls outside the
    S x S grid after preprocessing.
    """
    # meta
    side, per_cell = meta['side'], meta['num']
    n_class, labels = meta['classes'], meta['labels']
    n_cell = side * side

    # preprocess (on a copy, the parsed annotation is left untouched)
    jpg = chunk[0]
    w, h, parsed = chunk[1]
    allobj = deepcopy(parsed)
    img, allobj = yolo_preprocess(
        '{}{}'.format(train_path, jpg), allobj)

    # Calculate regression target; each obj becomes
    # [name, x offset, y offset, sqrt(width), sqrt(height), cell]
    cell_w = 1. * w / side
    cell_h = 1. * h / side
    for obj in allobj:
        cx = .5 * (obj[1] + obj[3]) / cell_w # xmin, xmax
        cy = .5 * (obj[2] + obj[4]) / cell_h # ymin, ymax
        if cx >= side or cy >= side: return None, None
        col, row = np.floor(cx), np.floor(cy)
        rel_w = float(obj[3] - obj[1]) / w
        rel_h = float(obj[4] - obj[2]) / h
        obj[1] = cx - col
        obj[2] = cy - row
        obj[3] = np.sqrt(rel_w)
        obj[4] = np.sqrt(rel_h)
        obj.append(int(row * side + col))

    # Calculate placeholders' values
    probs = np.zeros([n_cell, n_class])
    confs = np.zeros([n_cell, per_cell])
    coord = np.zeros([n_cell, per_cell, 4])
    proid = np.zeros([n_cell, n_class])
    conid = np.zeros([n_cell, per_cell])
    cooid = np.zeros([n_cell, per_cell, 4])
    prear = np.zeros([n_cell, 4])
    for obj in allobj:
        cell = obj[5]
        # half extent of the box, in grid-cell units
        half_w = obj[3]**2 * .5 * side
        half_h = obj[4]**2 * .5 * side
        probs[cell, :] = [0.] * n_class
        probs[cell, labels.index(obj[0])] = 1.
        proid[cell, :] = [1] * n_class
        coord[cell, :, :] = [obj[1:5]] * per_cell
        prear[cell, 0] = obj[1] - half_w # xleft
        prear[cell, 1] = obj[2] - half_h # yup
        prear[cell, 2] = obj[1] + half_w # xright
        prear[cell, 3] = obj[2] + half_h # ybot
        confs[cell, :] = [1.] * per_cell
        conid[cell, :] = [1.] * per_cell
        cooid[cell, :, :] = [[1.] * 4] * per_cell

    # Finalise the placeholders' values: replicate the per-cell
    # corners and area once per predicted box
    corner_ul = np.expand_dims(prear[:, 0:2], 1)
    corner_br = np.expand_dims(prear[:, 2:4], 1)
    extent = corner_br - corner_ul
    area = extent[:, :, 0] * extent[:, :, 1]
    upleft = np.concatenate([corner_ul] * per_cell, 1)
    botright = np.concatenate([corner_br] * per_cell, 1)
    areas = np.concatenate([area] * per_cell, 1)

    # Assemble the placeholders' value
    tensors = [[probs], [confs] , [coord],
               [proid], [conid] , [cooid],
               [areas], [upleft], [botright]]

    return img, tensors
def yolo_feed_dict(net, x_batch, datum):
    """
    Pair the loss placeholders of `net` with the entries of `datum`.

    `datum` is indexed 0..8 in the order
    probs, confs, coord, proid, conid, cooid, areas, upleft, botright.
    `x_batch` is accepted but not used here.
    """
    placeholders = (
        net.probs, net.confs, net.coord,
        net.proid, net.conid, net.cooid,
        net.areas, net.upleft, net.botright,
    )
    return {holder: datum[slot]
            for slot, holder in enumerate(placeholders)}
def yolo_loss(net):
    """
    Build the training graph on top of net.out.

    Creates the nine placeholders consumed by the feed dict
    (net.probs, net.confs, net.coord, net.proid, net.conid,
    net.cooid, net.upleft, net.botright, net.areas), then defines
    net.loss as a weighted L2 loss and net.train_op as one RMSProp
    step with learning rate net.FLAGS.lr.
    """
    # meta
    m = net.meta
    sprob, sconf = m['class_scale'], m['object_scale']
    snoob, scoor = m['noobject_scale'], m['coord_scale']
    S, B, C = m['side'], m['num'], m['classes']
    SS = S * S # number of grid cells

    print('Loss hyper-parameters:')
    print('\tside = {}'.format(m['side']))
    print('\tbox = {}'.format(m['num']))
    print('\tclasses = {}'.format(m['classes']))
    print('\tscales = {}'.format([sprob, sconf, snoob, scoor]))

    per_class = [None, SS, C]
    per_box = [None, SS, B]
    # target of regression
    net.probs = tf.placeholder(tf.float32, per_class)
    net.confs = tf.placeholder(tf.float32, per_box)
    net.coord = tf.placeholder(tf.float32, per_box + [4])
    # weights term for L2 loss
    net.proid = tf.placeholder(tf.float32, per_class)
    net.conid = tf.placeholder(tf.float32, per_box)
    net.cooid = tf.placeholder(tf.float32, per_box + [4])
    # material for loss calculation
    net.upleft = tf.placeholder(tf.float32, per_box + [2])
    net.botright = tf.placeholder(tf.float32, per_box + [2])
    net.areas = tf.placeholder(tf.float32, per_box)

    # Extract the coordinate prediction from net.out: everything
    # after the SS * (C + B) class / confidence entries
    pred = tf.reshape(net.out[:, SS * (C + B):], [-1, SS, B, 4])
    pred_wh = tf.pow(pred[:,:,:,2:4], 2) * S # unit: grid cell
    pred_area = pred_wh[:,:,:,0] * pred_wh[:,:,:,1] # unit: grid cell^2
    pred_xy = pred[:,:,:,0:2] # [batch, SS, B, 2]
    pred_ul = pred_xy - (pred_wh * .5) # [batch, SS, B, 2]
    pred_br = pred_xy + (pred_wh * .5) # [batch, SS, B, 2]

    # calculate the intersection areas
    inter_ul = tf.maximum(pred_ul, net.upleft)
    inter_br = tf.minimum(pred_br, net.botright)
    inter_wh = tf.maximum(inter_br - inter_ul, 0.0)
    inter = tf.mul(inter_wh[:,:,:,0], inter_wh[:,:,:,1])

    # calculate the best IOU, set 0.0 confidence for worse boxes
    iou = tf.div(inter, net.areas + pred_area - inter)
    is_best = tf.to_float(
        tf.equal(iou, tf.reduce_max(iou, [2], True)))
    confs = tf.mul(is_best, net.confs)

    # take care of the weight terms
    # NOTE(review): the confidence weight is multiplied by the fed
    # net.conid mask, so wherever that mask is 0 the confidence term
    # contributes nothing, whatever noobject_scale is - confirm that
    # this is intended.
    conf_weight = snoob*(1.-is_best) + sconf*is_best
    conid = tf.mul(net.conid, conf_weight)
    coor_weight = tf.concat(3, 4 * [tf.expand_dims(is_best, -1)])
    cooid = tf.mul(net.cooid, scoor * coor_weight)
    proid = sprob * net.proid

    # flatten 'em all
    probs = slim.flatten(net.probs)
    proid = slim.flatten(proid)
    confs = slim.flatten(confs)
    conid = slim.flatten(conid)
    coord = slim.flatten(net.coord)
    cooid = slim.flatten(cooid)
    target = tf.concat(1, [probs, confs, coord])
    weight = tf.concat(1, [proid, conid, cooid])

    print('Building net.loss')
    squared = tf.pow(net.out - target, 2)
    per_sample = tf.reduce_sum(tf.mul(squared, weight), 1)
    net.loss = .5 * tf.reduce_mean(per_sample)

    print('Building net.train_op')
    optimizer = tf.train.RMSPropOptimizer(net.FLAGS.lr)
    gradients = optimizer.compute_gradients(net.loss)
    net.train_op = optimizer.apply_gradients(gradients)