Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added performance flags for better inference performance on Xeon #760

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
12 changes: 11 additions & 1 deletion darkflow/defaults.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from .platform_util import platform

class argHandler(dict):
#A super duper fancy custom made CLI argument handler!!
__getattr__ = dict.get
__setattr__ = dict.__setitem__
__delattr__ = dict.__delitem__
_descriptions = {'help, --h, -h': 'show this super helpful message and exit'}

def setDefaults(self):
p = platform()

self.define('imgdir', './sample_img/', 'path to testing directory with images')
self.define('binary', './bin/', 'path to .weights directory')
self.define('config', './cfg/', 'path to .cfg directory')
Expand Down Expand Up @@ -35,6 +39,12 @@ def setDefaults(self):
self.define('saveVideo', False, 'Records video from input video or camera')
self.define('pbLoad', '', 'path to .pb protobuf file (metaLoad must also be specified)')
self.define('metaLoad', '', 'path to .meta file generated during --savepb that corresponds to .pb file')
self.define('inter_op', 2, 'Maximum number of ops to run in parallel');
self.define('intra_op', p.num_cores_per_socket() * p.num_cpu_sockets(), 'Number of threads to use for each CPU op')
self.define('KMP_BLOCKTIME', 0, 'Time (in ms) a thread should wait after a parallel region before sleeping')
self.define('KMP_SETTINGS', 0, 'Enables printing of OpenMP environment variables')
self.define('KMP_AFFINITY', 'granularity=fine,compact,1,0', 'Enables binding of threads to physical processing units')
self.define('timeline_enabled', False, 'Run 20 batches and then dump the timeline in JSON format');

def define(self, argName, default, description):
self[argName] = default
Expand Down
20 changes: 19 additions & 1 deletion darkflow/net/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,24 @@ def setup_meta_ops(self):
self.say('Running entirely on CPU')
cfg['device_count'] = {'GPU': 0}

# Set CPU-specific parallelism flags
cfg['inter_op_parallelism_threads'] = self.FLAGS.inter_op
cfg['intra_op_parallelism_threads'] = self.FLAGS.intra_op
os.environ["KMP_BLOCKTIME"] = str(self.FLAGS.KMP_BLOCKTIME)
os.environ["KMP_SETTINGS"] = str(self.FLAGS.KMP_SETTINGS)
os.environ["KMP_AFFINITY"] = str(self.FLAGS.KMP_AFFINITY)
os.environ["OMP_NUM_THREADS"]= str(self.FLAGS.intra_op)

print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print ('CPU parallelism settings - tweak defaults for better performance:')
print ('See https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel_mkl_dnn for details')
print ('inter_op', self.FLAGS.inter_op)
print ('intra_op', self.FLAGS.intra_op)
print ('KMP_BLOCKTIME', self.FLAGS.KMP_BLOCKTIME)
print ('KMP_AFFINITY', self.FLAGS.KMP_AFFINITY)
print ('KMP_SETTINGS', self.FLAGS.KMP_SETTINGS)
print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

if self.FLAGS.train: self.build_train_op()

if self.FLAGS.summary:
Expand Down Expand Up @@ -174,4 +192,4 @@ def savepb(self):
json.dump(self.meta, fp)
self.say('Saving const graph def to {}'.format(name))
graph_def = tfnet_pb.sess.graph_def
tf.train.write_graph(graph_def,'./', name, False)
tf.train.write_graph(graph_def,'./', name, False)
31 changes: 30 additions & 1 deletion darkflow/net/flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import tensorflow as tf
import pickle
from multiprocessing.pool import ThreadPool
from tensorflow.python.client import timeline

train_stats = (
'Training statistics: \n'
Expand Down Expand Up @@ -115,6 +116,9 @@ def predict(self):

# predict in batches
n_batch = int(math.ceil(len(all_inps) / batch))
total_time = 0
num_imgs = 0

for j in range(n_batch):
from_idx = j * batch
to_idx = min(from_idx + batch, len(all_inps))
Expand All @@ -128,9 +132,21 @@ def predict(self):
# Feed to the net
feed_dict = {self.inp : np.concatenate(inp_feed, 0)}
self.say('Forwarding {} inputs ...'.format(len(inp_feed)))
num_imgs += len(inp_feed)

run_meta = tf.RunMetadata()
start = time.time()
out = self.sess.run(self.out, feed_dict)
if self.FLAGS.timeline_enabled:
out = self.sess.run(self.out, feed_dict,
options=tf.RunOptions(
trace_level=tf.RunOptions.FULL_TRACE),
run_metadata=run_meta)
else:
out = self.sess.run(self.out, feed_dict)

stop = time.time(); last = stop - start

total_time += last
self.say('Total time = {}s / {} inps = {} ips'.format(
last, len(inp_feed), len(inp_feed) / last))

Expand All @@ -146,3 +162,16 @@ def predict(self):
# Timing
self.say('Total time = {}s / {} inps = {} ips'.format(
last, len(inp_feed), len(inp_feed) / last))

if self.FLAGS.timeline_enabled:
# Let performance stabilize before taking the timeline
if j is 20:
# Create the Timeline object, and write it to a json file
fetched_timeline = timeline.Timeline(run_meta.step_stats)
chrome_trace = fetched_timeline.generate_chrome_trace_format()
with open('timeline.json', 'w') as f:
f.write(chrome_trace)
return

self.say('\nFinal time = {}s / {} inps = {} ips'.format(total_time,
num_imgs, num_imgs / total_time))
79 changes: 79 additions & 0 deletions darkflow/platform_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import os
import subprocess
import sys
from sys import exit

'''This module implements a platform utility that exposes functions that detect platform information.'''

# Field labels matched against the start of each line of `lscpu` output.
# They are byte strings because subprocess stdout is read as bytes.
# NOTE(review): b"CPU(s)" is deliberately the least specific label; the
# parser below checks it last so the more specific labels win.
NUMA_NODES_STR_ = b"NUMA node(s)"
CPU_SOCKETS_STR_ = b"Socket(s)"
CORES_PER_SOCKET_STR_ = b"Core(s) per socket"
THREADS_PER_CORE_STR_ = b"Thread(s) per core"
LOGICAL_CPUS_STR_ = b"CPU(s)"

class platform:
    '''CPU topology information gathered by parsing `lscpu` output.

    Every count defaults to 0 when `lscpu` is unavailable (e.g. on a
    non-Linux host) or its output cannot be parsed, so the accessor
    methods never raise and callers get a best-effort answer instead of
    a crashed process.
    '''

    # Class-level defaults keep the accessors safe even when detection
    # fails part-way through __init__.
    cpu_sockets_ = 0
    cores_per_socket_ = 0
    threads_per_core_ = 0
    logical_cpus_ = 0
    numa_nodes_ = 0

    def num_cpu_sockets(self):
        '''Number of physical CPU sockets (lscpu "Socket(s)").'''
        return self.cpu_sockets_

    def num_cores_per_socket(self):
        '''Number of physical cores per socket (lscpu "Core(s) per socket").'''
        return self.cores_per_socket_

    def num_threads_per_core(self):
        '''Number of hardware threads per core (lscpu "Thread(s) per core").'''
        return self.threads_per_core_

    def num_logical_cpus(self):
        '''Total number of logical CPUs (lscpu "CPU(s)").'''
        return self.logical_cpus_

    def num_numa_nodes(self):
        '''Number of NUMA nodes (lscpu "NUMA node(s)").'''
        return self.numa_nodes_

    def __init__(self):
        # Locate lscpu with the stdlib instead of spawning `which`.
        # The old code called exit(1) whenever `which` wrote anything to
        # stderr, killing the whole program; a missing lscpu now simply
        # leaves every count at its default of 0.
        import shutil
        lscpu_path = shutil.which("lscpu")
        if not lscpu_path:
            print("Error attempting to locate lscpu: not found on PATH")
            return

        # Capture lscpu output (bytes).
        cpu_info = []
        try:
            process = subprocess.Popen([lscpu_path],
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            stdout, stderr = process.communicate()
            cpu_info = stdout.split(b"\n")
        except OSError:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            print("Error running lscpu: ", sys.exc_info()[0])
            return

        # Parse "Label: value" lines. startswith() replaces the old
        # find(...) == 0 idiom; the generic b"CPU(s)" label is checked
        # last so the more specific labels take precedence.
        for line in cpu_info:
            try:
                if line.startswith(NUMA_NODES_STR_):
                    self.numa_nodes_ = int(line.split(b":")[1].strip())
                elif line.startswith(CPU_SOCKETS_STR_):
                    self.cpu_sockets_ = int(line.split(b":")[1].strip())
                elif line.startswith(CORES_PER_SOCKET_STR_):
                    self.cores_per_socket_ = int(line.split(b":")[1].strip())
                elif line.startswith(THREADS_PER_CORE_STR_):
                    self.threads_per_core_ = int(line.split(b":")[1].strip())
                elif line.startswith(LOGICAL_CPUS_STR_):
                    self.logical_cpus_ = int(line.split(b":")[1].strip())
            except (IndexError, ValueError):
                # Skip malformed lines rather than abort detection.
                pass