Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added performance flags for better inference performance on Xeon #760

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
12 changes: 11 additions & 1 deletion darkflow/defaults.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from .platform_util import platform

class argHandler(dict):
#A super duper fancy custom made CLI argument handler!!
__getattr__ = dict.get
__setattr__ = dict.__setitem__
__delattr__ = dict.__delitem__
_descriptions = {'help, --h, -h': 'show this super helpful message and exit'}

def setDefaults(self):
p = platform()

self.define('imgdir', './sample_img/', 'path to testing directory with images')
self.define('binary', './bin/', 'path to .weights directory')
self.define('config', './cfg/', 'path to .cfg directory')
Expand Down Expand Up @@ -35,6 +39,12 @@ def setDefaults(self):
self.define('saveVideo', False, 'Records video from input video or camera')
self.define('pbLoad', '', 'path to .pb protobuf file (metaLoad must also be specified)')
self.define('metaLoad', '', 'path to .meta file generated during --savepb that corresponds to .pb file')
self.define('inter_op', 2, 'Maximum number of ops to run in parallel');
self.define('intra_op', p.num_cores_per_socket() * p.num_cpu_sockets(), 'Number of threads to use for each CPU op')
self.define('KMP_BLOCKTIME', 0, 'Time (in ms) a thread should wait after a parallel region before sleeping')
self.define('KMP_SETTINGS', 0, 'Enables printing of OpenMP environment variables')
self.define('KMP_AFFINITY', 'granularity=fine,compact,1,0', 'Enables binding of threads to physical processing units')
self.define('timeline_enabled', False, 'Run 20 batches and then dump the timeline in JSON format');

def define(self, argName, default, description):
self[argName] = default
Expand Down
20 changes: 19 additions & 1 deletion darkflow/net/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,24 @@ def setup_meta_ops(self):
self.say('Running entirely on CPU')
cfg['device_count'] = {'GPU': 0}

# Set CPU-specific parallelism flags
cfg['inter_op_parallelism_threads'] = self.FLAGS.inter_op
cfg['intra_op_parallelism_threads'] = self.FLAGS.intra_op
os.environ["KMP_BLOCKTIME"] = str(self.FLAGS.KMP_BLOCKTIME)
os.environ["KMP_SETTINGS"] = str(self.FLAGS.KMP_SETTINGS)
os.environ["KMP_AFFINITY"] = str(self.FLAGS.KMP_AFFINITY)
os.environ["OMP_NUM_THREADS"]= str(self.FLAGS.intra_op)

print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print ('CPU parallelism settings - tweak defaults for better performance:')
print ('See https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel_mkl_dnn for details')
print ('inter_op', self.FLAGS.inter_op)
print ('intra_op', self.FLAGS.intra_op)
print ('KMP_BLOCKTIME', self.FLAGS.KMP_BLOCKTIME)
print ('KMP_AFFINITY', self.FLAGS.KMP_AFFINITY)
print ('KMP_SETTINGS', self.FLAGS.KMP_SETTINGS)
print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

if self.FLAGS.train: self.build_train_op()

if self.FLAGS.summary:
Expand Down Expand Up @@ -174,4 +192,4 @@ def savepb(self):
json.dump(self.meta, fp)
self.say('Saving const graph def to {}'.format(name))
graph_def = tfnet_pb.sess.graph_def
tf.train.write_graph(graph_def,'./', name, False)
tf.train.write_graph(graph_def,'./', name, False)
31 changes: 30 additions & 1 deletion darkflow/net/flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import tensorflow as tf
import pickle
from multiprocessing.pool import ThreadPool
from tensorflow.python.client import timeline

train_stats = (
'Training statistics: \n'
Expand Down Expand Up @@ -115,6 +116,9 @@ def predict(self):

# predict in batches
n_batch = int(math.ceil(len(all_inps) / batch))
total_time = 0
num_imgs = 0

for j in range(n_batch):
from_idx = j * batch
to_idx = min(from_idx + batch, len(all_inps))
Expand All @@ -128,9 +132,21 @@ def predict(self):
# Feed to the net
feed_dict = {self.inp : np.concatenate(inp_feed, 0)}
self.say('Forwarding {} inputs ...'.format(len(inp_feed)))
num_imgs += len(inp_feed)

run_meta = tf.RunMetadata()
start = time.time()
out = self.sess.run(self.out, feed_dict)
if self.FLAGS.timeline_enabled:
out = self.sess.run(self.out, feed_dict,
options=tf.RunOptions(
trace_level=tf.RunOptions.FULL_TRACE),
run_metadata=run_meta)
else:
out = self.sess.run(self.out, feed_dict)

stop = time.time(); last = stop - start

total_time += last
self.say('Total time = {}s / {} inps = {} ips'.format(
last, len(inp_feed), len(inp_feed) / last))

Expand All @@ -146,3 +162,16 @@ def predict(self):
# Timing
self.say('Total time = {}s / {} inps = {} ips'.format(
last, len(inp_feed), len(inp_feed) / last))

if self.FLAGS.timeline_enabled:
# Let performance stabilize before taking the timeline
if j is 20:
# Create the Timeline object, and write it to a json file
fetched_timeline = timeline.Timeline(run_meta.step_stats)
chrome_trace = fetched_timeline.generate_chrome_trace_format()
with open('timeline.json', 'w') as f:
f.write(chrome_trace)
return

self.say('\nFinal time = {}s / {} inps = {} ips'.format(total_time,
num_imgs, num_imgs / total_time))
79 changes: 79 additions & 0 deletions darkflow/platform_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import os
import subprocess
import sys
from sys import exit

'''This module implements a platform utility that exposes functions that detect platform information.'''

# Field labels matched against the start of each line of `lscpu` output.
# They are byte strings because subprocess stdout is read as bytes.
# NOTE(review): b"CPU(s)" is deliberately the least specific label; the
# parser below checks it last so the more specific labels win.
NUMA_NODES_STR_ = b"NUMA node(s)"
CPU_SOCKETS_STR_ = b"Socket(s)"
CORES_PER_SOCKET_STR_ = b"Core(s) per socket"
THREADS_PER_CORE_STR_ = b"Thread(s) per core"
LOGICAL_CPUS_STR_ = b"CPU(s)"

class platform:
    '''CPU topology information gathered by parsing `lscpu` output.

    Every count defaults to 0 when `lscpu` is unavailable (e.g. on a
    non-Linux host) or its output cannot be parsed, so the accessor
    methods never raise and callers get a best-effort answer instead of
    a crashed process.
    '''

    # Class-level defaults keep the accessors safe even when detection
    # fails part-way through __init__.
    cpu_sockets_ = 0
    cores_per_socket_ = 0
    threads_per_core_ = 0
    logical_cpus_ = 0
    numa_nodes_ = 0

    def num_cpu_sockets(self):
        '''Number of physical CPU sockets (lscpu "Socket(s)").'''
        return self.cpu_sockets_

    def num_cores_per_socket(self):
        '''Number of physical cores per socket (lscpu "Core(s) per socket").'''
        return self.cores_per_socket_

    def num_threads_per_core(self):
        '''Number of hardware threads per core (lscpu "Thread(s) per core").'''
        return self.threads_per_core_

    def num_logical_cpus(self):
        '''Total number of logical CPUs (lscpu "CPU(s)").'''
        return self.logical_cpus_

    def num_numa_nodes(self):
        '''Number of NUMA nodes (lscpu "NUMA node(s)").'''
        return self.numa_nodes_

    def __init__(self):
        # Locate lscpu with the stdlib instead of spawning `which`.
        # The old code called exit(1) whenever `which` wrote anything to
        # stderr, killing the whole program; a missing lscpu now simply
        # leaves every count at its default of 0.
        import shutil
        lscpu_path = shutil.which("lscpu")
        if not lscpu_path:
            print("Error attempting to locate lscpu: not found on PATH")
            return

        # Capture lscpu output (bytes).
        cpu_info = []
        try:
            process = subprocess.Popen([lscpu_path],
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            stdout, stderr = process.communicate()
            cpu_info = stdout.split(b"\n")
        except OSError:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            print("Error running lscpu: ", sys.exc_info()[0])
            return

        # Parse "Label: value" lines. startswith() replaces the old
        # find(...) == 0 idiom; the generic b"CPU(s)" label is checked
        # last so the more specific labels take precedence.
        for line in cpu_info:
            try:
                if line.startswith(NUMA_NODES_STR_):
                    self.numa_nodes_ = int(line.split(b":")[1].strip())
                elif line.startswith(CPU_SOCKETS_STR_):
                    self.cpu_sockets_ = int(line.split(b":")[1].strip())
                elif line.startswith(CORES_PER_SOCKET_STR_):
                    self.cores_per_socket_ = int(line.split(b":")[1].strip())
                elif line.startswith(THREADS_PER_CORE_STR_):
                    self.threads_per_core_ = int(line.split(b":")[1].strip())
                elif line.startswith(LOGICAL_CPUS_STR_):
                    self.logical_cpus_ = int(line.split(b":")[1].strip())
            except (IndexError, ValueError):
                # Skip malformed lines rather than abort detection.
                pass