From 08c7380016f77d96af3027fe35d38ed2d2d7a212 Mon Sep 17 00:00:00 2001
From: Vivek Rane
Date: Tue, 15 May 2018 14:21:23 -0700
Subject: [PATCH 1/3] Added flags for getting better performance during
 inference when running on TensorFlow built with MKL-DNN

---
 darkflow/defaults.py      | 12 +++++-
 darkflow/net/build.py     | 18 +++++++++
 darkflow/net/flow.py      | 31 +++++++++++++++-
 darkflow/platform_util.py | 78 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 137 insertions(+), 2 deletions(-)
 create mode 100644 darkflow/platform_util.py

diff --git a/darkflow/defaults.py b/darkflow/defaults.py
index a54b2ec27..f297436ed 100644
--- a/darkflow/defaults.py
+++ b/darkflow/defaults.py
@@ -1,11 +1,15 @@
+from platform_util import platform
+
 class argHandler(dict):
     #A super duper fancy custom made CLI argument handler!!
     __getattr__ = dict.get
     __setattr__ = dict.__setitem__
     __delattr__ = dict.__delitem__
     _descriptions = {'help, --h, -h': 'show this super helpful message and exit'}
-    
+
     def setDefaults(self):
+        p = platform()
+
         self.define('imgdir', './sample_img/', 'path to testing directory with images')
         self.define('binary', './bin/', 'path to .weights directory')
         self.define('config', './cfg/', 'path to .cfg directory')
@@ -35,6 +39,12 @@ def setDefaults(self):
         self.define('saveVideo', False, 'Records video from input video or camera')
         self.define('pbLoad', '', 'path to .pb protobuf file (metaLoad must also be specified)')
         self.define('metaLoad', '', 'path to .meta file generated during --savepb that corresponds to .pb file')
+        self.define('inter_op', 2, 'Maximum number of ops to run in parallel');
+        self.define('intra_op', p.num_cores_per_socket() * p.num_cpu_sockets(), 'Number of threads to use for each CPU op')
+        self.define('KMP_BLOCKTIME', 0, 'Time (in ms) a thread should wait after a parallel region before sleeping')
+        self.define('KMP_SETTINGS', 0, 'Enables printing of OpenMP environment variables')
+        self.define('KMP_AFFINITY', 'granularity=fine,compact,1,0', 'Enables binding of threads to physical processing units')
+        self.define('timeline_enabled', False, 'Run 20 batches and then dump the timeline in JSON format');
 
     def define(self, argName, default, description):
         self[argName] = default
diff --git a/darkflow/net/build.py b/darkflow/net/build.py
index 1359f9f12..7f4401e6c 100644
--- a/darkflow/net/build.py
+++ b/darkflow/net/build.py
@@ -136,6 +136,24 @@ def setup_meta_ops(self):
 			self.say('Running entirely on CPU')
 			cfg['device_count'] = {'GPU': 0}
 
+        # Set CPU-specific parallelism flags
+        cfg['inter_op_parallelism_threads'] = self.FLAGS.inter_op
+        cfg['intra_op_parallelism_threads'] = self.FLAGS.intra_op
+        os.environ["KMP_BLOCKTIME"] = str(self.FLAGS.KMP_BLOCKTIME)
+        os.environ["KMP_SETTINGS"] = str(self.FLAGS.KMP_SETTINGS)
+        os.environ["KMP_AFFINITY"] = str(self.FLAGS.KMP_AFFINITY)
+        os.environ["OMP_NUM_THREADS"]= str(self.FLAGS.intra_op)
+
+        print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
+        print 'CPU parallelism settings - tweak defaults for better performance:'
+        print 'See https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel_mkl_dnn for details'
+        print 'inter_op', self.FLAGS.inter_op
+        print 'intra_op', self.FLAGS.intra_op
+        print 'KMP_BLOCKTIME', self.FLAGS.KMP_BLOCKTIME
+        print 'KMP_AFFINITY', self.FLAGS.KMP_AFFINITY
+        print 'KMP_SETTINGS', self.FLAGS.KMP_SETTINGS
+        print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
+
 		if self.FLAGS.train: self.build_train_op()
 
 		if self.FLAGS.summary:
diff --git a/darkflow/net/flow.py b/darkflow/net/flow.py
index 1f25fda21..1e9b96d7e 100644
--- a/darkflow/net/flow.py
+++ b/darkflow/net/flow.py
@@ -4,6 +4,7 @@
 import tensorflow as tf
 import pickle
 from multiprocessing.pool import ThreadPool
+from tensorflow.python.client import timeline
 
 train_stats = (
 	'Training statistics: \n'
@@ -115,6 +116,9 @@ def predict(self):
 	# predict in batches
 	n_batch = int(math.ceil(len(all_inps) / batch))
 
+	total_time = 0
+	num_imgs = 0
+
 	for j in range(n_batch):
 		from_idx = j * batch
 		to_idx = min(from_idx + batch, len(all_inps))
@@ -128,9 +132,21 @@ def predict(self):
 		# Feed to the net
 		feed_dict = {self.inp : np.concatenate(inp_feed, 0)}
 		self.say('Forwarding {} inputs ...'.format(len(inp_feed)))
+		num_imgs += len(inp_feed)
+
+		run_meta = tf.RunMetadata()
 		start = time.time()
-		out = self.sess.run(self.out, feed_dict)
+		if self.FLAGS.timeline_enabled:
+			out = self.sess.run(self.out, feed_dict,
+				options=tf.RunOptions(
+					trace_level=tf.RunOptions.FULL_TRACE),
+				run_metadata=run_meta)
+		else:
+			out = self.sess.run(self.out, feed_dict)
+
 		stop = time.time(); last = stop - start
+
+		total_time += last
 		self.say('Total time = {}s / {} inps = {} ips'.format(
 			last, len(inp_feed), len(inp_feed) / last))
 
@@ -146,3 +162,16 @@ def predict(self):
 		# Timing
 		self.say('Total time = {}s / {} inps = {} ips'.format(
 			last, len(inp_feed), len(inp_feed) / last))
+
+		if self.FLAGS.timeline_enabled:
+			# Let performance stabilize before taking the timeline
+			if j is 20:
+				# Create the Timeline object, and write it to a json file
+				fetched_timeline = timeline.Timeline(run_meta.step_stats)
+				chrome_trace = fetched_timeline.generate_chrome_trace_format()
+				with open('timeline.json', 'w') as f:
+					f.write(chrome_trace)
+				return
+
+	self.say('\nFinal time = {}s / {} inps = {} ips'.format(total_time,
+		num_imgs, num_imgs / total_time))
diff --git a/darkflow/platform_util.py b/darkflow/platform_util.py
new file mode 100644
index 000000000..f4d059729
--- /dev/null
+++ b/darkflow/platform_util.py
@@ -0,0 +1,78 @@
+import os
+import subprocess
+from sys import exit
+
+'''This module implements a platform utility that exposes functions that detect platform information.'''
+
+NUMA_NODES_STR_ = "NUMA node(s)"
+CPU_SOCKETS_STR_ = "Socket(s)"
+CORES_PER_SOCKET_STR_ = "Core(s) per socket"
+THREADS_PER_CORE_STR_ = "Thread(s) per core"
+LOGICAL_CPUS_STR_ = "CPU(s)"
+
+class platform:
+    cpu_sockets_ = 0
+    cores_per_socket_ = 0
+    threads_per_core_ = 0
+    logical_cpus_ = 0
+    numa_nodes_ = 0
+
+    def num_cpu_sockets(self):
+        return self.cpu_sockets_
+
+    def num_cores_per_socket(self):
+        return self.cores_per_socket_
+
+    def num_threads_per_core(self):
+        return self.threads_per_core_
+
+    def num_logical_cpus(self):
+        return self.logical_cpus_
+
+    def num_numa_nodes(self):
+        return self.numa_nodes_
+
+    def __init__(self):
+        #check to see if the lscpu command is present
+        lscpu_path = ''
+        try:
+            process = subprocess.Popen(["which", "lscpu"],
+                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            stdout,stderr = process.communicate()
+            if stderr:
+                print "Error: {}".format(stderr)
+                exit(1)
+            else:
+                lscpu_path = stdout.strip()
+        except:
+            print "Error!"
+
+        #get the lscpu output
+        cpu_info = ''
+        if lscpu_path:
+            try:
+                process = subprocess.Popen(lscpu_path,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE)
+                stdout,stderr = process.communicate()
+                cpu_info = stdout.split('\n')
+            except:
+                print "Error!@"
+
+        #parse it
+        for line in cpu_info:
+# NUMA_NODES_STR_ = "NUMA node(s)"
+            if line.find(NUMA_NODES_STR_) == 0:
+                self.numa_nodes_ = int(line.split(":")[1].strip())
+# CPU_SOCKETS_STR_ = "Socket(s)"
+            elif line.find(CPU_SOCKETS_STR_) == 0:
+                self.cpu_sockets_ = int(line.split(":")[1].strip())
+# CORES_PER_SOCKET_STR_ = "Core(s) per socket"
+            elif line.find(CORES_PER_SOCKET_STR_) == 0:
+                self.cores_per_socket_ = int(line.split(":")[1].strip())
+# THREADS_PER_CORE_STR_ = "Thread(s) per core"
+            elif line.find(THREADS_PER_CORE_STR_) == 0:
+                self.threads_per_core_ = int(line.split(":")[1].strip())
+# LOGICAL_CPUS_STR_ = "CPU(s)"
+            elif line.find(LOGICAL_CPUS_STR_) == 0:
+                self.logical_cpus_ = int(line.split(":")[1].strip())

From 98f3c63d42493328688602803e6c22bddeb7e02d Mon Sep 17 00:00:00 2001
From: Vivek Rane
Date: Tue, 15 May 2018 14:41:42 -0700
Subject: [PATCH 2/3] Changed spaces to tabs to be consistent with the rest of
 the file

---
 darkflow/net/build.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/darkflow/net/build.py b/darkflow/net/build.py
index 7f4401e6c..11671d250 100644
--- a/darkflow/net/build.py
+++ b/darkflow/net/build.py
@@ -136,23 +136,23 @@ def setup_meta_ops(self):
 			self.say('Running entirely on CPU')
 			cfg['device_count'] = {'GPU': 0}
 
-        # Set CPU-specific parallelism flags
-        cfg['inter_op_parallelism_threads'] = self.FLAGS.inter_op
-        cfg['intra_op_parallelism_threads'] = self.FLAGS.intra_op
-        os.environ["KMP_BLOCKTIME"] = str(self.FLAGS.KMP_BLOCKTIME)
-        os.environ["KMP_SETTINGS"] = str(self.FLAGS.KMP_SETTINGS)
-        os.environ["KMP_AFFINITY"] = str(self.FLAGS.KMP_AFFINITY)
-        os.environ["OMP_NUM_THREADS"]= str(self.FLAGS.intra_op)
-
-        print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
-        print 'CPU parallelism settings - tweak defaults for better performance:'
-        print 'See https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel_mkl_dnn for details'
-        print 'inter_op', self.FLAGS.inter_op
-        print 'intra_op', self.FLAGS.intra_op
-        print 'KMP_BLOCKTIME', self.FLAGS.KMP_BLOCKTIME
-        print 'KMP_AFFINITY', self.FLAGS.KMP_AFFINITY
-        print 'KMP_SETTINGS', self.FLAGS.KMP_SETTINGS
-        print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
+		# Set CPU-specific parallelism flags
+		cfg['inter_op_parallelism_threads'] = self.FLAGS.inter_op
+		cfg['intra_op_parallelism_threads'] = self.FLAGS.intra_op
+		os.environ["KMP_BLOCKTIME"] = str(self.FLAGS.KMP_BLOCKTIME)
+		os.environ["KMP_SETTINGS"] = str(self.FLAGS.KMP_SETTINGS)
+		os.environ["KMP_AFFINITY"] = str(self.FLAGS.KMP_AFFINITY)
+		os.environ["OMP_NUM_THREADS"]= str(self.FLAGS.intra_op)
+
+		print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
+		print 'CPU parallelism settings - tweak defaults for better performance:'
+		print 'See https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel_mkl_dnn for details'
+		print 'inter_op', self.FLAGS.inter_op
+		print 'intra_op', self.FLAGS.intra_op
+		print 'KMP_BLOCKTIME', self.FLAGS.KMP_BLOCKTIME
+		print 'KMP_AFFINITY', self.FLAGS.KMP_AFFINITY
+		print 'KMP_SETTINGS', self.FLAGS.KMP_SETTINGS
+		print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
 
 		if self.FLAGS.train: self.build_train_op()
 
@@ -192,4 +192,4 @@ def savepb(self):
 		json.dump(self.meta, fp)
 		self.say('Saving const graph def to {}'.format(name))
 		graph_def = tfnet_pb.sess.graph_def
-		tf.train.write_graph(graph_def,'./', name, False)
\ No newline at end of file
+		tf.train.write_graph(graph_def,'./', name, False)

From 5bb204ef8b782f38afaedecd37a8e3bd89a5758d Mon Sep 17 00:00:00 2001
From: Vivek Rane
Date: Tue, 15 May 2018 15:39:25 -0700
Subject: [PATCH 3/3] Updated code for Python3.5

---
 darkflow/defaults.py      |  2 +-
 darkflow/net/build.py     | 18 +++++++++---------
 darkflow/platform_util.py | 29 +++++++++++++++--------------
 3 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/darkflow/defaults.py b/darkflow/defaults.py
index f297436ed..5548ae301 100644
--- a/darkflow/defaults.py
+++ b/darkflow/defaults.py
@@ -1,4 +1,4 @@
-from platform_util import platform
+from .platform_util import platform
 
 class argHandler(dict):
     #A super duper fancy custom made CLI argument handler!!
diff --git a/darkflow/net/build.py b/darkflow/net/build.py
index 11671d250..0aaa92cf0 100644
--- a/darkflow/net/build.py
+++ b/darkflow/net/build.py
@@ -144,15 +144,15 @@ def setup_meta_ops(self):
 		os.environ["KMP_AFFINITY"] = str(self.FLAGS.KMP_AFFINITY)
 		os.environ["OMP_NUM_THREADS"]= str(self.FLAGS.intra_op)
 
-		print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
-		print 'CPU parallelism settings - tweak defaults for better performance:'
-		print 'See https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel_mkl_dnn for details'
-		print 'inter_op', self.FLAGS.inter_op
-		print 'intra_op', self.FLAGS.intra_op
-		print 'KMP_BLOCKTIME', self.FLAGS.KMP_BLOCKTIME
-		print 'KMP_AFFINITY', self.FLAGS.KMP_AFFINITY
-		print 'KMP_SETTINGS', self.FLAGS.KMP_SETTINGS
-		print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
+		print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
+		print ('CPU parallelism settings - tweak defaults for better performance:')
+		print ('See https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel_mkl_dnn for details')
+		print ('inter_op', self.FLAGS.inter_op)
+		print ('intra_op', self.FLAGS.intra_op)
+		print ('KMP_BLOCKTIME', self.FLAGS.KMP_BLOCKTIME)
+		print ('KMP_AFFINITY', self.FLAGS.KMP_AFFINITY)
+		print ('KMP_SETTINGS', self.FLAGS.KMP_SETTINGS)
+		print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
 
 		if self.FLAGS.train: self.build_train_op()
 
diff --git a/darkflow/platform_util.py b/darkflow/platform_util.py
index f4d059729..d73bd2465 100644
--- a/darkflow/platform_util.py
+++ b/darkflow/platform_util.py
@@ -1,14 +1,15 @@
 import os
 import subprocess
+import sys
 from sys import exit
 
 '''This module implements a platform utility that exposes functions that detect platform information.'''
 
-NUMA_NODES_STR_ = "NUMA node(s)"
-CPU_SOCKETS_STR_ = "Socket(s)"
-CORES_PER_SOCKET_STR_ = "Core(s) per socket"
-THREADS_PER_CORE_STR_ = "Thread(s) per core"
-LOGICAL_CPUS_STR_ = "CPU(s)"
+NUMA_NODES_STR_ = b"NUMA node(s)"
+CPU_SOCKETS_STR_ = b"Socket(s)"
+CORES_PER_SOCKET_STR_ = b"Core(s) per socket"
+THREADS_PER_CORE_STR_ = b"Thread(s) per core"
+LOGICAL_CPUS_STR_ = b"CPU(s)"
 
 class platform:
     cpu_sockets_ = 0
@@ -40,12 +41,12 @@ def __init__(self):
                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             stdout,stderr = process.communicate()
             if stderr:
-                print "Error: {}".format(stderr)
+                print ("Error: ", stderr)
                 exit(1)
             else:
                 lscpu_path = stdout.strip()
         except:
-            print "Error!"
+            print ("Error attempting to locate lscpu: ", sys.exc_info()[0])
 
         #get the lscpu output
         cpu_info = ''
         if lscpu_path:
@@ -55,24 +56,24 @@ def __init__(self):
                     stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE)
                 stdout,stderr = process.communicate()
-                cpu_info = stdout.split('\n')
+                cpu_info = stdout.split(b"\n")
             except:
-                print "Error!@"
+                print ("Error running lscpu: ", sys.exc_info()[0])
 
         #parse it
         for line in cpu_info:
 # NUMA_NODES_STR_ = "NUMA node(s)"
             if line.find(NUMA_NODES_STR_) == 0:
-                self.numa_nodes_ = int(line.split(":")[1].strip())
+                self.numa_nodes_ = int(line.split(b":")[1].strip())
 # CPU_SOCKETS_STR_ = "Socket(s)"
             elif line.find(CPU_SOCKETS_STR_) == 0:
-                self.cpu_sockets_ = int(line.split(":")[1].strip())
+                self.cpu_sockets_ = int(line.split(b":")[1].strip())
 # CORES_PER_SOCKET_STR_ = "Core(s) per socket"
             elif line.find(CORES_PER_SOCKET_STR_) == 0:
-                self.cores_per_socket_ = int(line.split(":")[1].strip())
+                self.cores_per_socket_ = int(line.split(b":")[1].strip())
 # THREADS_PER_CORE_STR_ = "Thread(s) per core"
             elif line.find(THREADS_PER_CORE_STR_) == 0:
-                self.threads_per_core_ = int(line.split(":")[1].strip())
+                self.threads_per_core_ = int(line.split(b":")[1].strip())
 # LOGICAL_CPUS_STR_ = "CPU(s)"
             elif line.find(LOGICAL_CPUS_STR_) == 0:
-                self.logical_cpus_ = int(line.split(":")[1].strip())
+                self.logical_cpus_ = int(line.split(b":")[1].strip())
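
A note for anyone trying these settings outside of darkflow: the sketch below is a rough, standalone illustration (not one of the patches above) of how the new flags map onto a TensorFlow 1.x session config plus the OpenMP environment variables. The literal values mirror the defaults added in defaults.py; the variable names are only illustrative.

    import os
    import tensorflow as tf  # TF 1.x API, which darkflow used when this series was written

    # OpenMP / MKL-DNN runtime knobs, exported the same way build.py does via os.environ
    os.environ["KMP_BLOCKTIME"] = "0"
    os.environ["KMP_SETTINGS"] = "0"
    os.environ["KMP_AFFINITY"] = "granularity=fine,compact,1,0"
    os.environ["OMP_NUM_THREADS"] = "8"    # illustrative; the patch defaults this to the physical core count

    # TensorFlow threading knobs, matching the cfg keys written in setup_meta_ops()
    config = tf.ConfigProto(
        inter_op_parallelism_threads=2,    # independent ops that may run concurrently
        intra_op_parallelism_threads=8,    # threads available to a single op
        device_count={'GPU': 0})           # CPU-only, as in darkflow's --gpu 0.0 path

    sess = tf.Session(config=config)       # inference then proceeds with sess.run(...) as usual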
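The intra_op default itself comes from the new platform_util module. A minimal usage sketch, assuming lscpu is available on the PATH (the printed numbers are machine-dependent):

    from darkflow.platform_util import platform

    p = platform()  # __init__ runs lscpu once and caches the parsed counts
    physical_cores = p.num_cores_per_socket() * p.num_cpu_sockets()
    print('sockets          :', p.num_cpu_sockets())
    print('cores per socket :', p.num_cores_per_socket())
    print('threads per core :', p.num_threads_per_core())
    print('logical CPUs     :', p.num_logical_cpus())
    print('NUMA nodes       :', p.num_numa_nodes())
    print('intra_op default :', physical_cores)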
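Finally, when --timeline_enabled is set, flow.py writes timeline.json (a Chrome trace) after 20 warm-up batches, and the file is meant to be opened in chrome://tracing. A quick sanity check of the dump, assuming the standard Chrome trace layout produced by TensorFlow's timeline module:

    import json

    with open('timeline.json') as f:
        trace = json.load(f)
    print(len(trace['traceEvents']), 'trace events recorded')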