From 08c7380016f77d96af3027fe35d38ed2d2d7a212 Mon Sep 17 00:00:00 2001
From: Vivek Rane
Date: Tue, 15 May 2018 14:21:23 -0700
Subject: [PATCH 1/3] Added flags for getting better performance during
 inference when running on TensorFlow built with MKL-DNN

---
 darkflow/defaults.py      | 12 +++++-
 darkflow/net/build.py     | 18 +++++++++
 darkflow/net/flow.py      | 31 +++++++++++++++-
 darkflow/platform_util.py | 78 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 137 insertions(+), 2 deletions(-)
 create mode 100644 darkflow/platform_util.py

diff --git a/darkflow/defaults.py b/darkflow/defaults.py
index a54b2ec27..f297436ed 100644
--- a/darkflow/defaults.py
+++ b/darkflow/defaults.py
@@ -1,11 +1,15 @@
+from platform_util import platform
+
 class argHandler(dict):
     #A super duper fancy custom made CLI argument handler!!
     __getattr__ = dict.get
     __setattr__ = dict.__setitem__
     __delattr__ = dict.__delitem__
     _descriptions = {'help, --h, -h': 'show this super helpful message and exit'}
-    
+
     def setDefaults(self):
+        p = platform()
+
         self.define('imgdir', './sample_img/', 'path to testing directory with images')
         self.define('binary', './bin/', 'path to .weights directory')
         self.define('config', './cfg/', 'path to .cfg directory')
@@ -35,6 +39,12 @@ def setDefaults(self):
         self.define('saveVideo', False, 'Records video from input video or camera')
         self.define('pbLoad', '', 'path to .pb protobuf file (metaLoad must also be specified)')
         self.define('metaLoad', '', 'path to .meta file generated during --savepb that corresponds to .pb file')
+        self.define('inter_op', 2, 'Maximum number of ops to run in parallel');
+        self.define('intra_op', p.num_cores_per_socket() * p.num_cpu_sockets(), 'Number of threads to use for each CPU op')
+        self.define('KMP_BLOCKTIME', 0, 'Time (in ms) a thread should wait after a parallel region before sleeping')
+        self.define('KMP_SETTINGS', 0, 'Enables printing of OpenMP environment variables')
+        self.define('KMP_AFFINITY', 'granularity=fine,compact,1,0', 'Enables binding of threads to physical processing units')
+        self.define('timeline_enabled', False, 'Run 20 batches and then dump the timeline in JSON format');
 
     def define(self, argName, default, description):
         self[argName] = default
diff --git a/darkflow/net/build.py b/darkflow/net/build.py
index 1359f9f12..7f4401e6c 100644
--- a/darkflow/net/build.py
+++ b/darkflow/net/build.py
@@ -136,6 +136,24 @@ def setup_meta_ops(self):
 			self.say('Running entirely on CPU')
 			cfg['device_count'] = {'GPU': 0}
 
+        # Set CPU-specific parallelism flags
+        cfg['inter_op_parallelism_threads'] = self.FLAGS.inter_op
+        cfg['intra_op_parallelism_threads'] = self.FLAGS.intra_op
+        os.environ["KMP_BLOCKTIME"] = str(self.FLAGS.KMP_BLOCKTIME)
+        os.environ["KMP_SETTINGS"] = str(self.FLAGS.KMP_SETTINGS)
+        os.environ["KMP_AFFINITY"] = str(self.FLAGS.KMP_AFFINITY)
+        os.environ["OMP_NUM_THREADS"]= str(self.FLAGS.intra_op)
+
+        print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
+        print 'CPU parallelism settings - tweak defaults for better performance:'
+        print 'See https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel_mkl_dnn for details'
+        print 'inter_op', self.FLAGS.inter_op
+        print 'intra_op', self.FLAGS.intra_op
+        print 'KMP_BLOCKTIME', self.FLAGS.KMP_BLOCKTIME
+        print 'KMP_AFFINITY', self.FLAGS.KMP_AFFINITY
+        print 'KMP_SETTINGS', self.FLAGS.KMP_SETTINGS
+        print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
+
 		if self.FLAGS.train: self.build_train_op()
 
 		if self.FLAGS.summary:
diff --git a/darkflow/net/flow.py b/darkflow/net/flow.py
index 1f25fda21..1e9b96d7e 100644
--- a/darkflow/net/flow.py
+++ b/darkflow/net/flow.py
@@ -4,6 +4,7 @@
 import tensorflow as tf
 import pickle
 from multiprocessing.pool import ThreadPool
+from tensorflow.python.client import timeline
 
 train_stats = (
 	'Training statistics: \n'
@@ -115,6 +116,9 @@ def predict(self):
 	# predict in batches
 	n_batch = int(math.ceil(len(all_inps) / batch))
 
+	total_time = 0
+	num_imgs = 0
+
 	for j in range(n_batch):
 		from_idx = j * batch
 		to_idx = min(from_idx + batch, len(all_inps))
@@ -128,9 +132,21 @@ def predict(self):
 		# Feed to the net
 		feed_dict = {self.inp : np.concatenate(inp_feed, 0)}
 		self.say('Forwarding {} inputs ...'.format(len(inp_feed)))
+		num_imgs += len(inp_feed)
+
+		run_meta = tf.RunMetadata()
 		start = time.time()
-		out = self.sess.run(self.out, feed_dict)
+		if self.FLAGS.timeline_enabled:
+			out = self.sess.run(self.out, feed_dict,
+				options=tf.RunOptions(
+					trace_level=tf.RunOptions.FULL_TRACE),
+				run_metadata=run_meta)
+		else:
+			out = self.sess.run(self.out, feed_dict)
+
 		stop = time.time(); last = stop - start
+
+		total_time += last
 		self.say('Total time = {}s / {} inps = {} ips'.format(
 			last, len(inp_feed), len(inp_feed) / last))
 
@@ -146,3 +162,16 @@ def predict(self):
 		# Timing
 		self.say('Total time = {}s / {} inps = {} ips'.format(
 			last, len(inp_feed), len(inp_feed) / last))
+
+		if self.FLAGS.timeline_enabled:
+			# Let performance stabilize before taking the timeline
+			if j is 20:
+				# Create the Timeline object, and write it to a json file
+				fetched_timeline = timeline.Timeline(run_meta.step_stats)
+				chrome_trace = fetched_timeline.generate_chrome_trace_format()
+				with open('timeline.json', 'w') as f:
+					f.write(chrome_trace)
+				return
+
+	self.say('\nFinal time = {}s / {} inps = {} ips'.format(total_time,
+		num_imgs, num_imgs / total_time))
diff --git a/darkflow/platform_util.py b/darkflow/platform_util.py
new file mode 100644
index 000000000..f4d059729
--- /dev/null
+++ b/darkflow/platform_util.py
@@ -0,0 +1,78 @@
+import os
+import subprocess
+from sys import exit
+
+'''This module implements a platform utility that exposes functions that detect platform information.'''
+
+NUMA_NODES_STR_ = "NUMA node(s)"
+CPU_SOCKETS_STR_ = "Socket(s)"
+CORES_PER_SOCKET_STR_ = "Core(s) per socket"
+THREADS_PER_CORE_STR_ = "Thread(s) per core"
+LOGICAL_CPUS_STR_ = "CPU(s)"
+
+class platform:
+    cpu_sockets_ = 0
+    cores_per_socket_ = 0
+    threads_per_core_ = 0
+    logical_cpus_ = 0
+    numa_nodes_ = 0
+
+    def num_cpu_sockets(self):
+        return self.cpu_sockets_
+
+    def num_cores_per_socket(self):
+        return self.cores_per_socket_
+
+    def num_threads_per_core(self):
+        return self.threads_per_core_
+
+    def num_logical_cpus(self):
+        return self.logical_cpus_
+
+    def num_numa_nodes(self):
+        return self.numa_nodes_
+
+    def __init__(self):
+        #check to see if the lscpu command is present
+        lscpu_path = ''
+        try:
+            process = subprocess.Popen(["which", "lscpu"],
+                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            stdout,stderr = process.communicate()
+            if stderr:
+                print "Error: {}".format(stderr)
+                exit(1)
+            else:
+                lscpu_path = stdout.strip()
+        except:
+            print "Error!"
+
+        #get the lscpu output
+        cpu_info = ''
+        if lscpu_path:
+            try:
+                process = subprocess.Popen(lscpu_path,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE)
+                stdout,stderr = process.communicate()
+                cpu_info = stdout.split('\n')
+            except:
+                print "Error!@"
+
+        #parse it
+        for line in cpu_info:
+# NUMA_NODES_STR_ = "NUMA node(s)"
+            if line.find(NUMA_NODES_STR_) == 0:
+                self.numa_nodes_ = int(line.split(":")[1].strip())
+# CPU_SOCKETS_STR_ = "Socket(s)"
+            elif line.find(CPU_SOCKETS_STR_) == 0:
+                self.cpu_sockets_ = int(line.split(":")[1].strip())
+# CORES_PER_SOCKET_STR_ = "Core(s) per socket"
+            elif line.find(CORES_PER_SOCKET_STR_) == 0:
+                self.cores_per_socket_ = int(line.split(":")[1].strip())
+# THREADS_PER_CORE_STR_ = "Thread(s) per core"
+            elif line.find(THREADS_PER_CORE_STR_) == 0:
+                self.threads_per_core_ = int(line.split(":")[1].strip())
+# LOGICAL_CPUS_STR_ = "CPU(s)"
+            elif line.find(LOGICAL_CPUS_STR_) == 0:
+                self.logical_cpus_ = int(line.split(":")[1].strip())

From 98f3c63d42493328688602803e6c22bddeb7e02d Mon Sep 17 00:00:00 2001
From: Vivek Rane
Date: Tue, 15 May 2018 14:41:42 -0700
Subject: [PATCH 2/3] Changed spaces to tabs to be consistent with the rest of
 the file

---
 darkflow/net/build.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/darkflow/net/build.py b/darkflow/net/build.py
index 7f4401e6c..11671d250 100644
--- a/darkflow/net/build.py
+++ b/darkflow/net/build.py
@@ -136,23 +136,23 @@ def setup_meta_ops(self):
 			self.say('Running entirely on CPU')
 			cfg['device_count'] = {'GPU': 0}
 
-        # Set CPU-specific parallelism flags
-        cfg['inter_op_parallelism_threads'] = self.FLAGS.inter_op
-        cfg['intra_op_parallelism_threads'] = self.FLAGS.intra_op
-        os.environ["KMP_BLOCKTIME"] = str(self.FLAGS.KMP_BLOCKTIME)
-        os.environ["KMP_SETTINGS"] = str(self.FLAGS.KMP_SETTINGS)
-        os.environ["KMP_AFFINITY"] = str(self.FLAGS.KMP_AFFINITY)
-        os.environ["OMP_NUM_THREADS"]= str(self.FLAGS.intra_op)
-
-        print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
-        print 'CPU parallelism settings - tweak defaults for better performance:'
-        print 'See https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel_mkl_dnn for details'
-        print 'inter_op', self.FLAGS.inter_op
-        print 'intra_op', self.FLAGS.intra_op
-        print 'KMP_BLOCKTIME', self.FLAGS.KMP_BLOCKTIME
-        print 'KMP_AFFINITY', self.FLAGS.KMP_AFFINITY
-        print 'KMP_SETTINGS', self.FLAGS.KMP_SETTINGS
-        print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
+		# Set CPU-specific parallelism flags
+		cfg['inter_op_parallelism_threads'] = self.FLAGS.inter_op
+		cfg['intra_op_parallelism_threads'] = self.FLAGS.intra_op
+		os.environ["KMP_BLOCKTIME"] = str(self.FLAGS.KMP_BLOCKTIME)
+		os.environ["KMP_SETTINGS"] = str(self.FLAGS.KMP_SETTINGS)
+		os.environ["KMP_AFFINITY"] = str(self.FLAGS.KMP_AFFINITY)
+		os.environ["OMP_NUM_THREADS"]= str(self.FLAGS.intra_op)
+
+		print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
+		print 'CPU parallelism settings - tweak defaults for better performance:'
+		print 'See https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel_mkl_dnn for details'
+		print 'inter_op', self.FLAGS.inter_op
+		print 'intra_op', self.FLAGS.intra_op
+		print 'KMP_BLOCKTIME', self.FLAGS.KMP_BLOCKTIME
+		print 'KMP_AFFINITY', self.FLAGS.KMP_AFFINITY
+		print 'KMP_SETTINGS', self.FLAGS.KMP_SETTINGS
+		print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
 
 		if self.FLAGS.train: self.build_train_op()
 
@@ -192,4 +192,4 @@ def savepb(self):
 		json.dump(self.meta, fp)
 		self.say('Saving const graph def to {}'.format(name))
 		graph_def = tfnet_pb.sess.graph_def
-		tf.train.write_graph(graph_def,'./', name, False)
\ No newline at end of file
+		tf.train.write_graph(graph_def,'./', name, False)

From 5bb204ef8b782f38afaedecd37a8e3bd89a5758d Mon Sep 17 00:00:00 2001
From: Vivek Rane
Date: Tue, 15 May 2018 15:39:25 -0700
Subject: [PATCH 3/3] Updated code for Python3.5

---
 darkflow/defaults.py      |  2 +-
 darkflow/net/build.py     | 18 +++++++++---------
 darkflow/platform_util.py | 29 +++++++++++++++--------------
 3 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/darkflow/defaults.py b/darkflow/defaults.py
index f297436ed..5548ae301 100644
--- a/darkflow/defaults.py
+++ b/darkflow/defaults.py
@@ -1,4 +1,4 @@
-from platform_util import platform
+from .platform_util import platform
 
 class argHandler(dict):
     #A super duper fancy custom made CLI argument handler!!
diff --git a/darkflow/net/build.py b/darkflow/net/build.py
index 11671d250..0aaa92cf0 100644
--- a/darkflow/net/build.py
+++ b/darkflow/net/build.py
@@ -144,15 +144,15 @@ def setup_meta_ops(self):
 		os.environ["KMP_AFFINITY"] = str(self.FLAGS.KMP_AFFINITY)
 		os.environ["OMP_NUM_THREADS"]= str(self.FLAGS.intra_op)
 
-		print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
-		print 'CPU parallelism settings - tweak defaults for better performance:'
-		print 'See https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel_mkl_dnn for details'
-		print 'inter_op', self.FLAGS.inter_op
-		print 'intra_op', self.FLAGS.intra_op
-		print 'KMP_BLOCKTIME', self.FLAGS.KMP_BLOCKTIME
-		print 'KMP_AFFINITY', self.FLAGS.KMP_AFFINITY
-		print 'KMP_SETTINGS', self.FLAGS.KMP_SETTINGS
-		print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
+		print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
+		print ('CPU parallelism settings - tweak defaults for better performance:')
+		print ('See https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel_mkl_dnn for details')
+		print ('inter_op', self.FLAGS.inter_op)
+		print ('intra_op', self.FLAGS.intra_op)
+		print ('KMP_BLOCKTIME', self.FLAGS.KMP_BLOCKTIME)
+		print ('KMP_AFFINITY', self.FLAGS.KMP_AFFINITY)
+		print ('KMP_SETTINGS', self.FLAGS.KMP_SETTINGS)
+		print ('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
 
 		if self.FLAGS.train: self.build_train_op()
 
diff --git a/darkflow/platform_util.py b/darkflow/platform_util.py
index f4d059729..d73bd2465 100644
--- a/darkflow/platform_util.py
+++ b/darkflow/platform_util.py
@@ -1,14 +1,15 @@
 import os
 import subprocess
+import sys
 from sys import exit
 
 '''This module implements a platform utility that exposes functions that detect platform information.'''
 
-NUMA_NODES_STR_ = "NUMA node(s)"
-CPU_SOCKETS_STR_ = "Socket(s)"
-CORES_PER_SOCKET_STR_ = "Core(s) per socket"
-THREADS_PER_CORE_STR_ = "Thread(s) per core"
-LOGICAL_CPUS_STR_ = "CPU(s)"
+NUMA_NODES_STR_ = b"NUMA node(s)"
+CPU_SOCKETS_STR_ = b"Socket(s)"
+CORES_PER_SOCKET_STR_ = b"Core(s) per socket"
+THREADS_PER_CORE_STR_ = b"Thread(s) per core"
+LOGICAL_CPUS_STR_ = b"CPU(s)"
 
 class platform:
     cpu_sockets_ = 0
@@ -40,12 +41,12 @@ def __init__(self):
                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             stdout,stderr = process.communicate()
             if stderr:
-                print "Error: {}".format(stderr)
+                print ("Error: ", stderr)
                 exit(1)
             else:
                 lscpu_path = stdout.strip()
         except:
-            print "Error!"
+            print ("Error attempting to locate lscpu: ", sys.exc_info()[0])
 
         #get the lscpu output
         cpu_info = ''
         if lscpu_path:
@@ -55,24 +56,24 @@ def __init__(self):
                     stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE)
                 stdout,stderr = process.communicate()
-                cpu_info = stdout.split('\n')
+                cpu_info = stdout.split(b"\n")
             except:
-                print "Error!@"
+                print ("Error running lscpu: ", sys.exc_info()[0])
 
         #parse it
         for line in cpu_info:
 # NUMA_NODES_STR_ = "NUMA node(s)"
             if line.find(NUMA_NODES_STR_) == 0:
-                self.numa_nodes_ = int(line.split(":")[1].strip())
+                self.numa_nodes_ = int(line.split(b":")[1].strip())
 # CPU_SOCKETS_STR_ = "Socket(s)"
             elif line.find(CPU_SOCKETS_STR_) == 0:
-                self.cpu_sockets_ = int(line.split(":")[1].strip())
+                self.cpu_sockets_ = int(line.split(b":")[1].strip())
 # CORES_PER_SOCKET_STR_ = "Core(s) per socket"
             elif line.find(CORES_PER_SOCKET_STR_) == 0:
-                self.cores_per_socket_ = int(line.split(":")[1].strip())
+                self.cores_per_socket_ = int(line.split(b":")[1].strip())
 # THREADS_PER_CORE_STR_ = "Thread(s) per core"
             elif line.find(THREADS_PER_CORE_STR_) == 0:
-                self.threads_per_core_ = int(line.split(":")[1].strip())
+                self.threads_per_core_ = int(line.split(b":")[1].strip())
 # LOGICAL_CPUS_STR_ = "CPU(s)"
             elif line.find(LOGICAL_CPUS_STR_) == 0:
-                self.logical_cpus_ = int(line.split(":")[1].strip())
+                self.logical_cpus_ = int(line.split(b":")[1].strip())
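
A note for anyone trying these settings outside of darkflow: the sketch below is a rough, standalone illustration (not one of the patches above) of how the new flags map onto a TensorFlow 1.x session config plus the OpenMP environment variables. The literal values mirror the defaults added in defaults.py; the variable names are only illustrative.

    import os
    import tensorflow as tf  # TF 1.x API, which darkflow used when this series was written

    # OpenMP / MKL-DNN runtime knobs, exported the same way build.py does via os.environ
    os.environ["KMP_BLOCKTIME"] = "0"
    os.environ["KMP_SETTINGS"] = "0"
    os.environ["KMP_AFFINITY"] = "granularity=fine,compact,1,0"
    os.environ["OMP_NUM_THREADS"] = "8"    # illustrative; the patch defaults this to the physical core count

    # TensorFlow threading knobs, matching the cfg keys written in setup_meta_ops()
    config = tf.ConfigProto(
        inter_op_parallelism_threads=2,    # independent ops that may run concurrently
        intra_op_parallelism_threads=8,    # threads available to a single op
        device_count={'GPU': 0})           # CPU-only, as in darkflow's --gpu 0.0 path

    sess = tf.Session(config=config)       # inference then proceeds with sess.run(...) as usual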
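The intra_op default itself comes from the new platform_util module. A minimal usage sketch, assuming lscpu is available on the PATH (the printed numbers are machine-dependent):

    from darkflow.platform_util import platform

    p = platform()  # __init__ runs lscpu once and caches the parsed counts
    physical_cores = p.num_cores_per_socket() * p.num_cpu_sockets()
    print('sockets          :', p.num_cpu_sockets())
    print('cores per socket :', p.num_cores_per_socket())
    print('threads per core :', p.num_threads_per_core())
    print('logical CPUs     :', p.num_logical_cpus())
    print('NUMA nodes       :', p.num_numa_nodes())
    print('intra_op default :', physical_cores)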
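Finally, when --timeline_enabled is set, flow.py writes timeline.json (a Chrome trace) after 20 warm-up batches, and the file is meant to be opened in chrome://tracing. A quick sanity check of the dump, assuming the standard Chrome trace layout produced by TensorFlow's timeline module:

    import json

    with open('timeline.json') as f:
        trace = json.load(f)
    print(len(trace['traceEvents']), 'trace events recorded')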