diff --git a/dory/Hardware_targets/PULP/GAP9/Templates/network_c_template.c b/dory/Hardware_targets/PULP/GAP9/Templates/network_c_template.c
index 34199ba6..230ac9c2 100644
--- a/dory/Hardware_targets/PULP/GAP9/Templates/network_c_template.c
+++ b/dory/Hardware_targets/PULP/GAP9/Templates/network_c_template.c
@@ -119,7 +119,7 @@ void ${prefix}execute_layer_fork(void *args) {
 #endif
 }
 
-void ${prefix}network_run(void *l2_buffer, size_t l2_buffer_size, void *l2_final_output, int exec${", void *L2_input_h" if not l3_supported else ""})
+struct ${prefix}network_run_token ${prefix}network_run_async(void *l2_buffer, size_t l2_buffer_size, void *l2_final_output, int exec, int initial_dir${", void *L2_input_h" if not l3_supported else ""})
 {
   struct pi_device cluster_dev = {0};
   struct pi_cluster_conf conf;
@@ -135,8 +135,9 @@ void ${prefix}network_run(void *l2_buffer, size_t l2_buffer_size, void *l2_final
   args[1] = (unsigned int) l2_buffer_size;
   args[2] = (unsigned int) l2_final_output;
   args[3] = (unsigned int) exec;
+  args[4] = (unsigned int) initial_dir;
   % if not l3_supported:
-  args[4] = (unsigned int) L2_input_h;
+  args[5] = (unsigned int) L2_input_h;
   % endif
   // open cluster...
   pi_cluster_task(&cluster_task, ${prefix}network_run_cluster, args);
@@ -149,20 +150,33 @@ void ${prefix}network_run(void *l2_buffer, size_t l2_buffer_size, void *l2_final
 #endif
   cluster_task.slave_stack_size = ${slave_stack};
   pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task);
-  pi_cluster_close(&cluster_dev);
+  return (struct ${prefix}network_run_token) {
+    .cluster_dev = cluster_dev
+  };
+}
+
+void ${prefix}network_run_wait(struct ${prefix}network_run_token token)  // takes the token returned by ${prefix}network_run_async
+{
+  pi_cluster_close(&token.cluster_dev);  // closes the cluster device that network_run_async copied into the token
   % if 'Perf_final' in verbose_level:
   print_perf("Final", ${prefix}cycle_network_execution, ${MACs});
   % endif
 }
 
+void ${prefix}network_run(void *l2_buffer, size_t l2_buffer_size, void *l2_final_output, int exec, int initial_dir${", void *L2_input_h" if not l3_supported else ""})
+{ // Convenience wrapper: starts the run with ${prefix}network_run_async and immediately hands its token to ${prefix}network_run_wait.
+  ${prefix}network_run_wait(${prefix}network_run_async(l2_buffer, l2_buffer_size, l2_final_output, exec, initial_dir${", L2_input_h" if not l3_supported else ""}));
+}
+
 void ${prefix}network_run_cluster(void *args) {
   unsigned int * real_args = (unsigned int *) args;
   void * l2_buffer = (void *) real_args[0];
   size_t l2_buffer_size = (size_t) real_args[1];
   void * l2_final_output = (void *) real_args[2];
   int exec = (int) real_args[3];
+  int dir = (int) real_args[4];
   % if not l3_supported:
-  void * L2_input_h = (void *)real_args[4];
+  void * L2_input_h = (void *)real_args[5];
   % endif
 /*
   - initial buffer allocation L2 and L1
@@ -177,7 +191,6 @@ void ${prefix}network_run_cluster(void *args) {
   void *L3_weights_curr = L3_weights;
   void *bypass_activations = NULL;
 
-  int dir = 1;
   int residual_number = 0;
   int bypass_dimension = 0;
   % if not l3_supported:
diff --git a/dory/Hardware_targets/PULP/GAP9/Templates/network_h_template.h b/dory/Hardware_targets/PULP/GAP9/Templates/network_h_template.h
deleted file mode 100644
index 31eb3c5c..00000000
--- a/dory/Hardware_targets/PULP/GAP9/Templates/network_h_template.h
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * network.h
- * Alessio Burrello <alessio.burrello@unibo.it>
- *
- * Copyright (C) 2019-2020 University of Bologna
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. 
- */
-
-#ifndef __${prefix.upper()}NETWORK_H__
-#define __${prefix.upper()}NETWORK_H__
-
-<%
-   l3_supported = DORY_HW_graph[0].HW_description['memory']['levels'] > 2
-   single_input = n_inputs==1
-%>\
-% if not l3_supported:
-#include "${prefix}weights_definition.h"
-% endif
-#include <stddef.h>
-
-
-% if l3_supported:
-void ${prefix}network_terminate();
-void ${prefix}network_initialize();
-% endif
-void ${prefix}network_run_cluster(void * args);
-void ${prefix}network_run(void *l2_buffer, size_t l2_buffer_size, void *l2_final_output, int exec${", void *L2_input_h" if not l3_supported else ""});
-void ${prefix}execute_layer_fork(void *arg);
-
-% if l3_supported and not single_input:
-static char * ${prefix}Input_names[${n_inputs}] = { \
-  % for n in range(n_inputs-1):
-  "${f"{prefix}inputs_{n}.hex"}",
-  % endfor
-  "${f"{prefix}inputs_{n_inputs-1}.hex"}"
-};
-% endif
-
-#ifdef DEFINE_CONSTANTS
-% if l3_supported:
-// allocation of buffers with parameters needed by the network execution
-static const char * L3_weights_files[] = {
-  ${files_list}
-};
-static int L3_weights_size[${weights_number}];
-static int layers_pointers[${len(DORY_HW_graph)}];
-% endif
-static char * Layers_name[${len(DORY_HW_graph)}] = {\
-% for node in DORY_HW_graph:
-"${node.prefixed_name}"${'' if loop.last else ', '}\
-% endfor
-};
-% if l3_supported:
-static int L3_input_layers[${len(DORY_HW_graph)}] = {\
-1,
-% for node in DORY_HW_graph[1:]:
-% if node.L3_input != 0:
-1${'' if loop.last else ', '}\
-% else:
-0${'' if loop.last else ', '}\
-% endif
-% endfor
-};
-static int L3_output_layers[${len(DORY_HW_graph)}] = {\
-% for node in DORY_HW_graph:
-% if node.tiling_dimensions["L3"]["output_dimensions"] != node.tiling_dimensions["L2"]["output_dimensions"]:
-1${'' if loop.last else ', '}\
-% else:
-0${'' if loop.last else ', '}\
-% endif
-% endfor
-};
-static int allocate_layer[${len(DORY_HW_graph)}] = {\
-% for node in DORY_HW_graph:
-% if node.tiling_dimensions["L3"]["weights_dimensions"] == node.tiling_dimensions["L2"]["weights_dimensions"] and ('FullyConnected' in node.name or 'Conv' in node.name):
-1${'' if loop.last else ', '}\
-% else:
-0${'' if loop.last else ', '}\
-% endif
-% endfor
-};
-% else:
-static char *Weights_name[${len(DORY_HW_graph)}] = {\
-% for i in range(len(DORY_HW_graph)):
-% if 'Conv' in DORY_HW_graph[i].name or 'FullyConnected' in DORY_HW_graph[i].name:
-Weights_${DORY_HW_graph[i].prefixed_name}${'' if loop.last else ', '}\
-% else:
-"None"${'' if loop.last else ', '}\
-% endif
-% endfor
-};
-% endif
-static int branch_input[${len(DORY_HW_graph)}] = {\
-% for node in DORY_HW_graph:
-% if node.branch_in == 1:
-1${'' if loop.last else ', '}\
-% else:
-0${'' if loop.last else ', '}\
-% endif
-% endfor
-};
-static int branch_output[${len(DORY_HW_graph)}] = {\
-% for node in DORY_HW_graph:
-% if node.branch_out == 1:
-1${'' if loop.last else ', '}\
-% else:
-0${'' if loop.last else ', '}\
-% endif
-% endfor
-};
-static int branch_change[${len(DORY_HW_graph)}] = {\
-% for node in DORY_HW_graph:
-% if node.branch_change == 1:
-1${'' if loop.last else ', '}\
-% else:
-0${'' if loop.last else ', '}\
-% endif
-% endfor
-};
-static int weights_checksum[${len(DORY_HW_graph)}] = {\
-% for node in DORY_HW_graph:
-${node.check_sum_w}${'' if loop.last else ', '}\
-% endfor
-};
-static int weights_size[${len(DORY_HW_graph)}] = {\
-% for node in DORY_HW_graph:
-% if l3_supported:
-${int((node.tiling_dimensions["L2"]["weight_memory"] + node.tiling_dimensions["L2"]["constants_memory"] + node.tiling_dimensions["L2"]["bias_memory"]) * (1 + int(node.tiling_dimensions["L3"]["weights_dimensions"] != node.tiling_dimensions["L2"]["weights_dimensions"])))}${'' if loop.last else ', '}\
-% else:
-${int(node.tiling_dimensions["L2"]["weight_memory"] + node.tiling_dimensions["L2"]["constants_memory"] + node.tiling_dimensions["L2"]["bias_memory"])}${'' if loop.last else ', '}\
-% endif
-% endfor
-};
-static int activations_checksum[${len(DORY_HW_graph)}][${DORY_HW_graph[0].n_test_inputs}] = {\
-% for i in range(len(DORY_HW_graph)):
-{
-  % for j in range(DORY_HW_graph[0].n_test_inputs):
-  ${DORY_HW_graph[i].check_sum_in[j]}${", " if j != DORY_HW_graph[0].n_test_inputs-1 else ""}  \
-  % endfor
-}${"," if i != len(DORY_HW_graph)-1 else ""}
-% endfor
-};
-static int activations_size[${len(DORY_HW_graph)}] = {\
-% for node in DORY_HW_graph:
-% if l3_supported:
-${int(node.tiling_dimensions["L2"]["input_activation_memory"] * (1 + int(node.tiling_dimensions["L3"]["input_dimensions"] != node.tiling_dimensions["L2"]["input_dimensions"])))}${'' if loop.last else ', '}\
-% else:
-${int(node.tiling_dimensions["L2"]["input_activation_memory"])}${'' if loop.last else ', '}\
-% endif
-% endfor
-};
-static int activations_out_checksum[${len(DORY_HW_graph)}][${DORY_HW_graph[0].n_test_inputs}] = {\
-% for i in range(len(DORY_HW_graph)):
-{
-  % for j in range(DORY_HW_graph[0].n_test_inputs):
-  ${DORY_HW_graph[i].check_sum_out[j]}${", " if j != DORY_HW_graph[0].n_test_inputs-1 else ""} \
-    % endfor
-}${"," if i != len(DORY_HW_graph)-1 else ""}
-% endfor
-};
-static int activations_out_size[${len(DORY_HW_graph)}] = {\
-% for node in DORY_HW_graph:
-% if l3_supported:
-${int(node.tiling_dimensions["L2"]["output_activation_memory"] * (1 + int(node.tiling_dimensions["L3"]["output_dimensions"] != node.tiling_dimensions["L2"]["output_dimensions"])))}${'' if loop.last else ', '}\
-% else:
-${int(node.tiling_dimensions["L2"]["output_activation_memory"])}${'' if loop.last else ', '}\
-% endif
-% endfor
-};
-static int layer_with_weights[${len(DORY_HW_graph)}] = {\
-% for node in DORY_HW_graph:
-% if 'Conv' in node.name or 'FullyConnected' in node.name:
-1${'' if loop.last else ', '}\
-% else:
-0${'' if loop.last else ', '}\
-% endif
-% endfor
-};
-% if 'Yes' in performance:
-static int NODEs_MACS[${len(DORY_HW_graph)}] = {\
-% for node in DORY_HW_graph:
-${node.MACs}${'' if loop.last else ', '}\
-% endfor
-};
-% endif
-#endif
-
-#endif  // __NETWORK_H__
diff --git a/dory/Hardware_targets/PULP/GAP9/Templates/network_h_template.h b/dory/Hardware_targets/PULP/GAP9/Templates/network_h_template.h
new file mode 120000
index 00000000..246946c0
--- /dev/null
+++ b/dory/Hardware_targets/PULP/GAP9/Templates/network_h_template.h
@@ -0,0 +1 @@
+../../Common/Templates/network.h.t
\ No newline at end of file
diff --git a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_h_template.h b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_h_template.h
index 7ea5b4bd..246946c0 120000
--- a/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_h_template.h
+++ b/dory/Hardware_targets/PULP/GAP9_NE16/Templates/network_h_template.h
@@ -1 +1 @@
-../../GAP9/Templates/network_h_template.h
\ No newline at end of file
+../../Common/Templates/network.h.t
\ No newline at end of file
diff --git a/test_PULP.py b/test_PULP.py
index af08f7f8..bccf2fd5 100644
--- a/test_PULP.py
+++ b/test_PULP.py
@@ -163,7 +163,7 @@ def test_network(network, capsys, compat, appdir):
     try:
         proc = subprocess.run(cmd, check=True, capture_output=True, text=True, timeout=360)
     except subprocess.CalledProcessError as e:
-        assert False, f"Building application failed with exit status {e.returncode}\nBuild error:\n{e.stderr}"
+        assert False, f"Building application failed with exit status {e.returncode}\nBuild output:\n{e.stdout}\nBuild error:\n{e.stderr}"
     except subprocess.TimeoutExpired as e:
         print(f"Test timed out...\nSTDOUT:")
         if e.output is not None: