diff --git a/docs/api/configuration.rst b/docs/api/configuration.rst
index 1bc8f0676c..aa24e07329 100644
--- a/docs/api/configuration.rst
+++ b/docs/api/configuration.rst
@@ -101,6 +101,20 @@ Finally, one then uses the configuration to create an hls model:
         backend='Vitis'
     )
 
+To target a oneAPI Board Support Package (BSP) enabled FPGA for offload acceleration, set the ``part`` argument to the path to your BSP and the BSP variant, and set ``use_oneapi_bsp=True``.
+
+.. code-block:: python
+
+    hls_model = hls4ml.converters.convert_from_keras_model(
+        model,
+        hls_config=config,
+        output_dir="my_project_dir",
+        io_type="io_parallel",
+        backend="oneAPI",
+        part="/path/to/my/bsp:bsp_variant",
+        use_oneapi_bsp=True
+    )
+
 See :py:class:`~hls4ml.converters.convert_from_keras_model` for more information on the various options. Similar functions exist for ONNX and PyTorch.
 
 ----
 
@@ -132,6 +146,9 @@ It looks like this:
     ClockPeriod: 5
     IOType: io_parallel # options: io_parallel/io_stream
 
+    # oneAPI Offload Acceleration flag.
+    UseOneAPIBSP: True
+
     HLSConfig:
         Model:
             Precision: fixed<16,6>
@@ -168,6 +185,10 @@ For Vivado backend the options are:
 * **PipelineInterval**\ : Optionally override the desired initiation interval of the design. Only valid in combination with "pipeline" style. If unspecified, it is left to the compiler to decide, ideally matching the largest reuse factor of the network.
 * **Precision**\ : this defines the precision of your inputs, outputs, weights and biases. It is denoted by ``fixed<X,Y>``\ , where ``Y`` is the number of bits representing the signed number above the binary point (i.e. the integer part), and ``X`` is the total number of bits. Additionally, integers in the type (\ ``int<N>``\ , where ``N`` is a bit-size from 1 to 1024) can also be used. The format follows ``ap_fixed`` and ``ap_int`` conventions. You have a chance to further configure this more finely with per-layer configuration described below. In the per-layer configuration (but not globally) one can also use ``'auto'`` precision.
 
+For the oneAPI backend the options are similar but also include:
+
+* **UseOneAPIBSP**\ : set this to ``True`` to enable offload acceleration with an Altera FPGA. The path to the oneAPI Board Support Package and the BSP variant are given through the ``part`` option, as shown above. This is only needed if you are using oneAPI in the accelerator style.
+
 2.2 Per-Layer Configuration
 ---------------------------
 
diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py
index a4000529c3..17654810c8 100644
--- a/hls4ml/backends/oneapi/oneapi_backend.py
+++ b/hls4ml/backends/oneapi/oneapi_backend.py
@@ -130,11 +130,13 @@ def get_default_flow(self):
     def get_writer_flow(self):
         return self._writer_flow
 
-    def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_parallel', write_tar=False, **_):
+    def create_initial_config(
+        self, part='Agilex7', clock_period=5, io_type='io_parallel', write_tar=False, use_oneapi_bsp=False, **_
+    ):
         """Create initial configuration of the oneAPI backend.
 
         Args:
-            part (str, optional): The FPGA part to be used. Defaults to 'Arria10'.
+            part (str, optional): The FPGA part to be used. Defaults to 'Agilex7'.
             clock_period (int, optional): The clock period. Defaults to 5.
             io_type (str, optional): Type of implementation used. One of 'io_parallel' or 'io_stream'.
                 Defaults to 'io_parallel'.
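Tying the documentation and backend changes above together, a complete minimal YAML configuration for the accelerator (BSP) flow might look like the sketch below. The keys mirror the snippet shown in the documentation; the BSP path and the precision/reuse values are placeholders rather than values taken from this change.

.. code-block:: yaml

    Backend: oneAPI
    Part: /path/to/my/bsp:bsp_variant
    ClockPeriod: 5
    IOType: io_parallel # options: io_parallel/io_stream

    # oneAPI Offload Acceleration flag.
    UseOneAPIBSP: True

    HLSConfig:
        Model:
            Precision: fixed<16,6>
            ReuseFactor: 1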
@@ -146,7 +148,7 @@ def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_para config = {} - config['Part'] = part if part is not None else 'Arria10' + config['Part'] = part if part is not None else 'Agilex7' config['ClockPeriod'] = clock_period config['IOType'] = io_type config['HLSConfig'] = {} @@ -154,7 +156,7 @@ def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_para # TODO: add namespace 'WriteTar': write_tar, } - + config['UseOneAPIBSP'] = use_oneapi_bsp return config def compile(self, model): diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index 3106e1e10d..410057e062 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -170,11 +170,18 @@ def definition_cpp(self, name_suffix='', as_reference=False): else: return f'{self.type.name} {self.name}{name_suffix}' - def declare_cpp(self, pipe_min_size=0, indent=''): - lines = indent + f'class {self.pipe_id};\n' - lines += indent + ( - f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' - + f'{self.type.name}, {pipe_min_size}, PipeProps>;\n' + # Updated pipe min size to be 32 for simulation. + def declare_cpp(self, pipe_min_size=32, indent=''): + # Updated to use streaming beat for restartable streaming kernel. + # Streaming beat is a wrapper type of the actual type with sideband control signals. + # Syntax: using BeatT = sycl::ext::intel::experimental::StreamingBeat; + streaming_beat_t = f"{self.pipe_name}BeatT" + lines = ( + f"{indent}class {self.pipe_id};\n" + f"{indent}using {streaming_beat_t} = " + f"sycl::ext::intel::experimental::StreamingBeat<{self.type.name}, true, true>;\n" + f"{indent}using {self.pipe_name} = sycl::ext::intel::experimental::pipe<" + f"{self.pipe_id}, {streaming_beat_t}, {pipe_min_size}, HostPipePropertiesT>;\n" ) return lines @@ -193,10 +200,13 @@ def definition_cpp(self, name_suffix='', as_reference=True): return f'{self.name}{name_suffix}' def declare_cpp(self, indent=''): - lines = indent + f'class {self.pipe_id};\n' - lines += indent + ( - f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' - + f'{self.type.name}, {self.pragma[-1]}>;\n' + streaming_beat_t = f"{self.pipe_name}BeatT" + lines = ( + f"{indent}class {self.pipe_id};\n" + f"{indent}using {streaming_beat_t} = " + f"sycl::ext::intel::experimental::StreamingBeat<{self.type.name}, true, true>;\n" + f"{indent}using {self.pipe_name} = " + f"sycl::ext::intel::experimental::pipe<{self.pipe_id}, {streaming_beat_t}, {self.pragma[-1]}>;\n" ) return lines diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt index e2b386d70d..fe15197210 100644 --- a/hls4ml/templates/oneapi/CMakeLists.txt +++ b/hls4ml/templates/oneapi/CMakeLists.txt @@ -38,15 +38,20 @@ set(LIBRARY_NAME myproject-${LIB_STAMP}) # You can also specify a device family (E.g. "Arria10" or "Stratix10") or a # specific part number (E.g. "10AS066N3F40E2SG") to generate a standalone IP. if(NOT DEFINED FPGA_DEVICE) - set(FPGA_DEVICE "Arria10") + set(FPGA_DEVICE "Agilex7") + set(BSP_FLAG "") endif() +# Set the target to a BSP if we target an actual accelerator board. +# hls-fpga-machine-learning insert oneapi_bsp_cmake_flag + # Use cmake -DUSER_FPGA_FLAGS= to set extra flags for FPGA backend # compilation. 
-set(USER_FPGA_FLAGS -Wno-unused-label ${USER_FPGA_FLAGS}) +# -Xsoptimize=latency Turns off the hyper-optimized handshake +set(USER_FPGA_FLAGS -Wno-unused-label;${USER_FPGA_FLAGS};-Xsoptimize=latency) # Use cmake -DUSER_FLAGS= to set extra flags for general compilation. -set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS}) +set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS} ${BSP_FLAG}) # Use cmake -DUSER_INCLUDE_PATHS= to set extra paths for general # compilation. diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h index 082ae5dc8c..1dc86c0cb6 100644 --- a/hls4ml/templates/oneapi/firmware/myproject.h +++ b/hls4ml/templates/oneapi/firmware/myproject.h @@ -5,8 +5,12 @@ // This file defines the interface to the kernel -// currently this is fixed -using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext::intel::experimental::ready_latency<0>)); +// Pipe properties for host pipes. Host pipes connect to the data source DMA and sink DMA. +// They are connected to the first and the last layer to stream data into and out from the kernel. +using HostPipePropertiesT = decltype(sycl::ext::oneapi::experimental::properties( + sycl::ext::intel::experimental::ready_latency<0>, sycl::ext::intel::experimental::bits_per_symbol<16>, + sycl::ext::intel::experimental::uses_valid, sycl::ext::intel::experimental::first_symbol_in_high_order_bits, + sycl::ext::intel::experimental::protocol_avalon_streaming_uses_ready)); // Need to declare the input and output pipes @@ -16,12 +20,15 @@ using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext class MyProjectID; struct MyProject { - +#ifndef IS_BSP // kernel property method to config invocation interface auto get(sycl::ext::oneapi::experimental::properties_tag) { return sycl::ext::oneapi::experimental::properties{sycl::ext::intel::experimental::streaming_interface<>, sycl::ext::intel::experimental::pipelined<>}; } +#else + // kernel properties and pipelining is not supported in BSP (accelerator style). 
+#endif SYCL_EXTERNAL void operator()() const; }; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h index 13de5ab3bb..ebccee1f77 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h @@ -9,72 +9,93 @@ namespace nnet { // ************************************************* // Linear Activation // ************************************************* -template void linear_stream() { +template [[intel::use_stall_enable_clusters]] void linear_stream() { + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; LinearActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - LinearPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - out_data[j] = in_data[j]; - } + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); + + LinearPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + out_data.data[j] = in_data.data[j]; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); - res_pipe::write(out_data); + keep_going = !in_data.eop; + } } } // ************************************************* // ReLU Activation // ************************************************* -template void relu_stream() { +template [[intel::use_stall_enable_clusters]] void relu_stream() { + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; ReLUActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - ReLUPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - if (in_data[j] > 0) - out_data[j] = in_data[j]; - else - out_data[j] = 0; - } + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + [[intel::fpga_register]] auto in_data = data_pipe::read(); + ReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + if (in_data.data[j] > 0) + out_data.data[j] = in_data.data[j]; + else + out_data.data[j] = 0; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); - res_pipe::write(out_data); + keep_going = !in_data.eop; + } } } // ************************************************* // Leaky RELU Activation // ************************************************* -template void leaky_relu_stream(typename CONFIG_T::param_t alpha) { - constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; - +template +[[intel::use_stall_enable_clusters]] void leaky_relu_stream(typename CONFIG_T::param_t alpha) { + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, 
CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; LeakyReLUActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - LeakyReLUPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - if (in_data[j] > 0) - out_data[j] = in_data[j]; - else - out_data[j] = alpha * in_data[j]; - } + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); + + LeakyReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + if (in_data.data[j] > 0) + out_data.data[j] = in_data.data[j]; + else + out_data.data[j] = alpha * in_data.data[j]; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); - res_pipe::write(out_data); + keep_going = !in_data.eop; + } } } @@ -82,186 +103,231 @@ template void leaky_relu_st // Thresholded RELU Activation // ************************************************* template -void thresholded_relu_stream(typename CONFIG_T::param_t theta) { +[[intel::use_stall_enable_clusters]] void thresholded_relu_stream(typename CONFIG_T::param_t theta) { + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; + ThresholdedReLUActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - ThresholdedReLUPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - if (in_data[j] > theta) - out_data[j] = in_data[j]; - else - out_data[j] = 0; - } + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); - res_pipe::write(out_data); + ThresholdedReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + if (in_data.data[j] > theta) + out_data.data[j] = in_data.data[j]; + else + out_data.data[j] = 0; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } // ************************************************* // ELU Activation // ************************************************* -template void elu_stream(typename CONFIG_T::param_t alpha) { +template +[[intel::use_stall_enable_clusters]] void elu_stream(typename CONFIG_T::param_t alpha) { #include "activation_tables/elu_table.tb" + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; - + bool keep_going = true; 
EluActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - EluPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type datareg = in_data[j]; - if (datareg >= 0) { - out_data[j] = datareg; - } else { - int index = (datareg * CONFIG_T::table_size / -8).to_int(); - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - out_data[j] = alpha * elu_table[index]; + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); + + EluPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + [[intel::fpga_register]] auto datareg = in_data.data[j]; + if (datareg >= 0) { + out_data.data[j] = datareg; + } else { + int index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data.data[j] = alpha * elu_table[index]; + } } - } - res_pipe::write(out_data); + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } // ************************************************* // SeLU Activation // ************************************************* -template void selu_stream() { +template [[intel::use_stall_enable_clusters]] void selu_stream() { #include "activation_tables/selu_table.tb" + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; + + constexpr ac_fixed<16, 1, false, AC_RND> scale = 1.0507009873554804934193349852946; SeluActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - SeluPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type datareg = in_data[j]; - if (datareg >= 0) { - out_data[j] = - typename ExtractPipeType::value_type::value_type(1.0507009873554804934193349852946) * datareg; - } else { - int index = (datareg * CONFIG_T::table_size / -8).to_int(); - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - out_data[j] = selu_table[index]; + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); + + SeluPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + [[intel::fpga_register]] auto datareg = in_data.data[j]; + + if (datareg >= 0) { + out_data.data[j] = scale * datareg; + } else { + int index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data.data[j] = selu_table[index]; + } } - } - res_pipe::write(out_data); + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } // ************************************************* // PReLU Activation // ************************************************* -template void prelu_stream(typename CONFIG_T::param_t alpha) { - constexpr unsigned 
multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; - +template +[[intel::use_stall_enable_clusters]] void prelu_stream(typename CONFIG_T::param_t alpha) { + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; PReLUActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - PReLUPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - if (in_data[j] > 0) - out_data[j] = in_data[j]; - else - out_data[j] = alpha[i * std::tuple_size::value_type>{} + j] * in_data[j]; - } + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); + + PReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + if (in_data.data[j] > 0) + out_data.data[j] = in_data.data[j]; + else + out_data.data[j] = alpha[i * std::tuple_size{} + j] * in_data.data[j]; + } - res_pipe::write(out_data); + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } // ************************************************* // Softplus Activation // ************************************************* -template void softplus_stream() { +template [[intel::use_stall_enable_clusters]] void softplus_stream() { #include "activation_tables/softplus_table.tb" + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; + SoftplusActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - SoftplusPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - [[intel::fpga_register]] int data_round = (in_data[j] * CONFIG_T::table_size / 16).to_int(); - [[intel::fpga_register]] int index = data_round + 8 * CONFIG_T::table_size / 16; - if (index < 0) - index = 0; - else if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - out_data[j] = softplus_table[index]; - } + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); + + SoftplusPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + [[intel::fpga_register]] int data_round = (in_data.data[j] * CONFIG_T::table_size / 16).to_int(); + [[intel::fpga_register]] int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data.data[j] = softplus_table[index]; + } - res_pipe::write(out_data); + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } // 
************************************************* // Softsign Activation // ************************************************* -template void softsign_stream() { +template [[intel::use_stall_enable_clusters]] void softsign_stream() { #include "activation_tables/softsign_table.tb" + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr int MAX_VALUE = 8; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - static const int MAX_VALUE = 8; - + bool keep_going = true; SoftsignActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - SoftsignPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absValue; - ; - if (in_data[j] < 0) { - absValue = -in_data[j]; - } else { - absValue = in_data[j]; - } - ac_int<16> index = (absValue * CONFIG_T::table_size / MAX_VALUE).to_int(); - if (absValue > MAX_VALUE) - index = CONFIG_T::table_size - 1; - if (in_data[j] < 0) { - out_data[j] = - static_cast::value_type::value_type>(-softsign_table[index]); - } else { - out_data[j] = static_cast::value_type::value_type>(softsign_table[index]); + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); + + SoftsignPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + [[intel::fpga_register]] typename DataT::value_type absValue; + ; + if (in_data.data[j] < 0) { + absValue = -in_data.data[j]; + } else { + absValue = in_data.data[j]; + } + ac_int<16> index = (absValue * CONFIG_T::table_size / MAX_VALUE).to_int(); + if (absValue > MAX_VALUE) + index = CONFIG_T::table_size - 1; + if (in_data.data[j] < 0) { + out_data.data[j] = -softsign_table[index]; + } else { + out_data.data[j] = softsign_table[index]; + } } - } - res_pipe::write(out_data); + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } @@ -269,220 +335,246 @@ template void softsign_stre // Softmax Activation // ************************************************* -template void softmax_stable_stream() { +template +[[intel::use_stall_enable_clusters]] void softmax_stable_stream() { #include "activation_tables/exp_table.tb" #include "activation_tables/invert_table.tb" + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; - constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + [[intel::fpga_register]] typename DataT::value_type data_array[std::tuple_size{}]; - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type - data_array[std::tuple_size::value_type>{}]; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; SoftmaxArrayLoop: - [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - 
auto in_pack = data_pipe::read(); - - SoftmaxArrayPackLoop: - #pragma unroll - for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { - data_array[j] = in_pack[j]; - } + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (unsigned i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); - // Find the max and compute all delta(x_i, x_max) - Op_max::value_type::value_type> op_max; - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type x_max = - reduce::value_type::value_type, - std::tuple_size::value_type>{}, - Op_max::value_type::value_type>>(data_array, op_max); - - // For the diffs, use the same type as the input but force rounding and saturation - [[intel::fpga_register]] ac_fixed::value_type::value_type::width, - ExtractPipeType::value_type::value_type::i_width, true, AC_RND, AC_SAT> - d_xi_xmax[std::tuple_size::value_type>{}]; - #pragma unroll - for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { - d_xi_xmax[j] = data_array[j] - x_max; - } + SoftmaxArrayPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size{}; j++) { + data_array[j] = in_data.data[j]; + } - // Calculate all the e^x's - [[intel::fpga_register]] - typename CONFIG_T::exp_table_t exp_res[std::tuple_size::value_type>{}]; - #pragma unroll - for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { - exp_res[j] = - exp_table[softmax_stable_idx_from_real_val::value_type::value_type, - CONFIG_T>(d_xi_xmax[j])]; - } + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + [[intel::fpga_register]] auto x_max = + reduce{}, Op_max>(data_array, + op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + [[intel::fpga_register]] ac_fixed::value_type::value_type::width, + ExtractPipeType::value_type::value_type::i_width, true, AC_RND, + AC_SAT> + d_xi_xmax[std::tuple_size{}]; + #pragma unroll + for (unsigned j = 0; j < std::tuple_size{}; j++) { + d_xi_xmax[j] = data_array[j] - x_max; + } - // Explicitly sum the results with an adder tree. - // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing - Op_add op_add; - [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = - reduce::value_type>{}, - Op_add>(exp_res, op_add); + // Calculate all the e^x's + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[std::tuple_size{}]; + #pragma unroll + for (unsigned j = 0; j < std::tuple_size{}; j++) { + exp_res[j] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[j])]; + } - [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = - invert_table[softmax_stable_idx_from_real_val(exp_sum)]; - typename ExtractPipeType::value_type out_pack; + // Explicitly sum the results with an adder tree. 
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = + reduce{}, Op_add>( + exp_res, op_add); - SoftmaxInvPackLoop: - #pragma unroll - for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_stable_idx_from_real_val(exp_sum)]; - // TODO - Find Quartus-equivalent pragma - // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size{}; j++) { - out_pack[j] = exp_res[j] * inv_exp_sum; - } + // TODO - Find Quartus-equivalent pragma + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation - res_pipe::write(out_pack); + out_data.data[j] = exp_res[j] * inv_exp_sum; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } -template void softmax_latency_stream() { +template +[[intel::use_stall_enable_clusters]] void softmax_latency_stream() { #include "activation_tables/exp_table_latency.tb" #include "activation_tables/invert_table_latency.tb" + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + bool keep_going = true; // Calculate all the e^x's - [[intel::fpga_register]] - typename CONFIG_T::exp_table_t exp_res[std::tuple_size::value_type>{}]; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[std::tuple_size{}]; SoftmaxExpLoop: - [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - auto in_pack = data_pipe::read(); - - SoftmaxExpPackLoop: - #pragma unroll - for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { - exp_res[j] = exp_table_latency[softmax_latency_idx_from_real_val< - typename ExtractPipeType::value_type::value_type, CONFIG_T>(in_pack[j])]; - } + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (unsigned i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); - // Explicitly sum the results with an adder tree. 
- // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing - Op_add op_add; - [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = - reduce>(exp_res, op_add); - - // Multiply previously calculated exponetials with the reciprocal of the sum - [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = - invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; - - typename ExtractPipeType::value_type out_pack; - SoftmaxInvPackLoop: - #pragma unroll - for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { - // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation - out_pack[j] = exp_res[j] * inv_exp_sum; - } + SoftmaxExpPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size{}; j++) { + exp_res[j] = + exp_table_latency[softmax_latency_idx_from_real_val(in_data.data[j])]; + } + + // Explicitly sum the results with an adder tree. + // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, + op_add); + + // Multiply previously calculated exponetials with the reciprocal of the sum + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; - res_pipe::write(out_pack); + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size{}; j++) { + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + out_data.data[j] = exp_res[j] * inv_exp_sum; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } -template void softmax_legacy_stream() { +template +[[intel::use_stall_enable_clusters]] void softmax_legacy_stream() { #include "activation_tables/exp_table_legacy.tb" #include "activation_tables/invert_table_legacy.tb" + using DataT = typename ExtractDataType::value_type>::value_type; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; // Index into the lookup table based on data for exponentials - [[intel::fpga_register]] - typename CONFIG_T::table_t exp_res[std::tuple_size::value_type>{}]; + [[intel::fpga_register]] typename CONFIG_T::table_t exp_res[std::tuple_size{}]; [[intel::fpga_register]] typename CONFIG_T::table_t exp_diff_res; - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type - data_cache[std::tuple_size::value_type>{}]; + [[intel::fpga_register]] typename DataT::value_type data_cache[std::tuple_size{}]; SoftmaxInitLoop: - [[intel::initiation_interval(1)]] for (unsigned s = 0; - s < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - s++) { - auto in_pack = data_pipe::read(); - - SoftmaxInitPackLoop: - #pragma unroll - for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { - data_cache[j] = in_pack[j]; - exp_res[j] = 0; - } + [[intel::initiation_interval(1)]] while (keep_going) { + for (unsigned s = 0; s < CONFIG_T::n_in / std::tuple_size{}; s++) { + auto in_data = data_pipe::read(); - SoftmaxExpLoop: - #pragma unroll - for (int i = 0; i < std::tuple_size::value_type>{}; i++) { - SoftmaxExpInner: + SoftmaxInitPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - if (i == j) { - exp_diff_res = 1; - } else { - int data_round = ((data_cache[j] - data_cache[i]) * 
CONFIG_T::table_size / 16).to_int(); - int index = data_round + 8 * CONFIG_T::table_size / 16; - if (index < 0) - index = 0; - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - exp_diff_res = exp_table_legacy[index]; + for (unsigned j = 0; j < std::tuple_size{}; j++) { + data_cache[j] = in_data.data[j]; + exp_res[j] = 0; + } + + SoftmaxExpLoop: + #pragma unroll + for (int i = 0; i < std::tuple_size{}; i++) { + SoftmaxExpInner: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + if (i == j) { + exp_diff_res = 1; + } else { + int data_round = ((data_cache[j] - data_cache[i]) * CONFIG_T::table_size / 16).to_int(); + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table_legacy[index]; + } + exp_res[i] += exp_diff_res; } - exp_res[i] += exp_diff_res; } - } - typename ExtractPipeType::value_type out_pack; - SoftmaxInvPackLoop: - #pragma unroll - for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { - int exp_res_index = (exp_res[j] * CONFIG_T::table_size / 64).to_int(); - if (exp_res_index < 0) - exp_res_index = 0; - if (exp_res_index > CONFIG_T::table_size - 1) - exp_res_index = CONFIG_T::table_size - 1; - out_pack[j] = - static_cast::value_type::value_type>(invert_table_legacy[exp_res_index]); - } + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size{}; j++) { + int exp_res_index = (exp_res[j] * CONFIG_T::table_size / 64).to_int(); + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + out_data.data[j] = static_cast(invert_table_legacy[exp_res_index]); + } - res_pipe::write(out_pack); + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } -template void softmax_argmax_stream() { - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; +template +[[intel::use_stall_enable_clusters]] void softmax_argmax_stream() { + using DataT = typename ExtractDataType::value_type>::value_type; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - #pragma unroll - for (int i = 0; i < std::tuple_size::value_type>{}; i++) { - out_data[i] = static_cast::value_type::value_type>(0); - } + bool keep_going = true; - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type maximum = in_data[0]; - [[intel::fpga_register]] int idx = 0; + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); - [[intel::initiation_interval(1)]] for (int i = 1; - i < std::tuple_size::value_type>{}; i++) { - if (in_data[i] > maximum) { - maximum = in_data[i]; - idx = i; + #pragma unroll + for (int i = 0; i < std::tuple_size{}; i++) { + out_data.data[i] = 0; } - } - out_data[idx] = static_cast::value_type::value_type>(1); - res_pipe::write(out_data); + [[intel::fpga_register]] auto maximum = in_data.data[0]; + [[intel::fpga_register]] int idx = 0; + + [[intel::initiation_interval(1)]] for (int i = 1; i < std::tuple_size{}; i++) { + if (in_data.data[i] > maximum) { + maximum = in_data.data[i]; + idx = i; + } + } + + out_data.data[idx] 
= 1; + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } -template void softmax_stream() { +template [[intel::use_stall_enable_clusters]] void softmax_stream() { switch (CONFIG_T::implementation) { case softmax_implementation::latency: softmax_latency_stream(); @@ -505,91 +597,101 @@ template void softmax_strea // ************************************************* // TanH Activation // ************************************************* -template void dense_tanh_stream() { +template [[intel::use_stall_enable_clusters]] void dense_tanh_stream() { #include "activation_tables/tanh_table.tb" - static const int MAX_VALUE = 4; + constexpr int MAX_VALUE = 4; + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + bool keep_going = true; TanHActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - TanHPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absoluteValue; - - if (in_data[j] < 0) - absoluteValue = (-1) * in_data[j]; - else - absoluteValue = in_data[j]; - - [[intel::fpga_register]] int index; - if (absoluteValue <= MAX_VALUE) - index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); - else - index = CONFIG_T::table_size - 1; - - if (in_data[j] > 0) - out_data[j] = tanh_table[index]; - else - out_data[j] = -tanh_table[index]; - } + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + + auto in_data = data_pipe::read(); + + TanHPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + [[intel::fpga_register]] typename DataT::value_type absoluteValue; + + if (in_data.data[j] < 0) + absoluteValue = (-1) * in_data.data[j]; + else + absoluteValue = in_data.data[j]; + + [[intel::fpga_register]] int index; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; - res_pipe::write(out_data); + if (in_data.data[j] > 0) + out_data.data[j] = tanh_table[index]; + else + out_data.data[j] = -tanh_table[index]; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } // ************************************************* // Sigmoid Activation // ************************************************* -template void sigmoid_stream() { +template [[intel::use_stall_enable_clusters]] void sigmoid_stream() { #include "activation_tables/sigmoid_table.tb" - static const int MAX_VALUE = 8; + constexpr int MAX_VALUE = 8; + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr 
unsigned pipeline = std::tuple_size{} / multiplier_limit; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + bool keep_going = true; SigmoidActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - SigmoidPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absoluteValue; - - if (in_data[j] < 0) - absoluteValue = (-1) * in_data[j]; - else - absoluteValue = in_data[j]; - - [[intel::fpga_register]] int index; - if (absoluteValue <= MAX_VALUE) - index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); - else - index = CONFIG_T::table_size - 1; - - if (in_data[j] > 0) - out_data[j] = sigmoid_table[index]; - else - out_data[j] = 1 - sigmoid_table[index]; - } + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); + + SigmoidPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + [[intel::fpga_register]] typename DataT::value_type absoluteValue; + + if (in_data.data[j] < 0) + absoluteValue = (-1) * in_data.data[j]; + else + absoluteValue = in_data.data[j]; + + [[intel::fpga_register]] int index; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; + + if (in_data.data[j] > 0) + out_data.data[j] = sigmoid_table[index]; + else + out_data.data[j] = 1 - sigmoid_table[index]; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); - res_pipe::write(out_data); + keep_going = !in_data.eop; + } } } @@ -597,113 +699,144 @@ template void sigmoid_strea // Hard sigmoid Activation // ************************************************* // Note - Theano and Tensorflow might have different definitions for hard sigmoid; could provide two implementations -template void hard_sigmoid_stream() { +template +[[intel::use_stall_enable_clusters]] void hard_sigmoid_stream() { + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + bool keep_going = true; HardSigmoidActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - HardSigmoidPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - [[intel::fpga_register]] auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; - if (datareg > 1) - 
datareg = 1; - else if (datareg < 0) - datareg = 0; - out_data[j] = datareg; - } + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + + auto in_data = data_pipe::read(); + + HardSigmoidPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + [[intel::fpga_register]] auto datareg = CONFIG_T::slope * in_data.data[j] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + out_data.data[j] = datareg; + } - res_pipe::write(out_data); + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } -template void hard_tanh_stream() { +template [[intel::use_stall_enable_clusters]] void hard_tanh_stream() { + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + bool keep_going = true; HardSigmoidActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - HardSigmoidPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; - if (sigmoid > 1) - sigmoid = 1; - else if (sigmoid < 0) - sigmoid = 0; - out_data[j] = 2 * sigmoid - 1; - } + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { - res_pipe::write(out_data); + auto in_data = data_pipe::read(); + + HardSigmoidPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + auto sigmoid = CONFIG_T::slope * in_data.data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data.data[j] = 2 * sigmoid - 1; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } // ************************************************* // Binary TanH Activation // ************************************************* -template void binary_tanh_stream() { +template +[[intel::use_stall_enable_clusters]] void binary_tanh_stream() { + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; + BinaryTanHActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - - [[intel::fpga_register]] auto in_data = data_pipe::read(); - [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - - BinaryTanHPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - if (in_data[j] > 0) - out_data[j] = static_cast::value_type::value_type>(1); - else - out_data[j] = static_cast::value_type::value_type>(-1); - } + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / 
std::tuple_size{}; i++) { + + [[intel::fpga_register]] auto in_data = data_pipe::read(); + + BinaryTanHPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + if (in_data.data[j] > 0) + out_data.data[j] = 1; + else + out_data.data[j] = -1; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); - res_pipe::write(out_data); + keep_going = !in_data.eop; + } } } // ************************************************* // Ternary TanH Activation // ************************************************* -template void ternary_tanh_stream() { +template +[[intel::use_stall_enable_clusters]] void ternary_tanh_stream() { + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; + TernaryTanHActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - - [[intel::fpga_register]] auto in_data = data_pipe::read(); - [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - - TernaryTanHPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - if (in_data[j] > 1) - out_data[j] = static_cast::value_type::value_type>(1); - else if (in_data[j] <= -1) - out_data[j] = static_cast::value_type::value_type>(-1); - else - out_data[j] = static_cast::value_type::value_type>(0); - } + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + + [[intel::fpga_register]] auto in_data = data_pipe::read(); - res_pipe::write(out_data); + TernaryTanHPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + if (in_data.data[j] > 1) + out_data.data[j] = 1; + else if (in_data.data[j] <= -1) + out_data.data[j] = -1; + else + out_data.data[j] = 0; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h index 128b3ac1a4..77597986c8 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h @@ -13,33 +13,43 @@ namespace nnet { // **************************************************** template void normalize_stream(typename CONFIG_T::scale_t scale, typename CONFIG_T::bias_t bias) { + using DataT = typename ExtractDataType::value_type>::value_type; + using ResT = typename ExtractDataType::value_type>::value_type; constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); constexpr unsigned pipeline = CONFIG_T::n_in / multiplier_limit; - constexpr auto datasize = std::tuple_size::value_type>{}; - CONFIG_T::template product::value_type::value_type, - typename CONFIG_T::scale_t::value_type>::limit(multiplier_limit); + constexpr auto datasize = std::tuple_size{}; + CONFIG_T::template product::limit(multiplier_limit); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; BatchNormLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - BatchNormpack: - #pragma unroll - for (int j = 0; j < datasize; j++) { - int norm_index; - if (CONFIG_T::n_filt == -1) - 
norm_index = i * datasize + j; - else - norm_index = j % CONFIG_T::n_filt; - out_data[j] = - CONFIG_T::template product::value_type::value_type, - typename CONFIG_T::scale_t::value_type>::product(in_data[j], scale[norm_index]) + - bias[norm_index]; + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { + auto in_data = data_pipe::read(); + + BatchNormpack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * datasize + j; + else + norm_index = j % CONFIG_T::n_filt; + out_data.data[j] = + CONFIG_T::template product::product( + in_data.data[j], scale[norm_index]) + + bias[norm_index]; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; } - - res_pipe::write(out_data); } } @@ -48,57 +58,79 @@ void normalize_stream(typename CONFIG_T::scale_t scale, typename CONFIG_T::bias_ // **************************************************** template void normalize_binary_tanh_stream(typename CONFIG_T::threshold_t threshold) { - constexpr auto datasize = std::tuple_size::value_type>{}; + using DataT = typename ExtractDataType::value_type>::value_type; + using ResT = typename ExtractDataType::value_type>::value_type; + constexpr auto datasize = std::tuple_size{}; + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; BinaryNormLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { - auto in_data = data_pipe::read(); - nnet::array, CONFIG_T::n_scale_bias> out_data; - - BatchNormPack: - #pragma unroll - for (int j = 0; j < datasize; j++) { - int norm_index; - if (CONFIG_T::n_filt == -1) - norm_index = i * datasize + j; - else - norm_index = j % CONFIG_T::n_filt; - - out_data[j] = (in_data[j] >= threshold[norm_index]) ? 1 : 0; + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { + auto in_data = data_pipe::read(); + + BatchNormPack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * datasize + j; + else + norm_index = j % CONFIG_T::n_filt; + + out_data.data[j] = (in_data.data[j] >= threshold[norm_index]) ? 
1 : 0; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; } - - res_pipe::write(out_data); } } template void normalize_ternary_tanh_stream(typename CONFIG_T::threshold_hi_t threshold_hi, typename CONFIG_T::threshold_lo_t threshold_lo) { - constexpr auto datasize = std::tuple_size::value_type>{}; + using DataT = typename ExtractDataType::value_type>::value_type; + using ResT = typename ExtractDataType::value_type>::value_type; + constexpr auto datasize = std::tuple_size{}; + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; TernaryNormLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { - auto in_data = data_pipe::read(); - nnet::array, CONFIG_T::n_scale_bias> out_data; - - BatchNormPack: - #pragma unroll - for (int j = 0; j < datasize; j++) { - int norm_index; - if (CONFIG_T::n_filt == -1) - norm_index = i * datasize + j; - else - norm_index = j % CONFIG_T::n_filt; - - if (in_data[j] > threshold_hi[norm_index]) - out_data[j] = 1; - else if (in_data[j] <= threshold_lo[norm_index]) - out_data[j] = -1; - else - out_data[j] = 0; + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { + auto in_data = data_pipe::read(); + + BatchNormPack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * datasize + j; + else + norm_index = j % CONFIG_T::n_filt; + + if (in_data.data[j] > threshold_hi[norm_index]) + out_data.data[j] = 1; + else if (in_data.data[j] <= threshold_lo[norm_index]) + out_data.data[j] = -1; + else + out_data.data[j] = 0; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; } - - res_pipe::write(out_data); } } diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h new file mode 100644 index 0000000000..4a705ca199 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h @@ -0,0 +1,148 @@ +#ifndef NNET_DATA_MOVEMENT_H +#define NNET_DATA_MOVEMENT_H + +#include +#include + +// This file defines the methods to transfer the data to the kernel. In the HLS flow, +// these are really part of the testbench. However, in the accelerator (BSP) flow, they are +// actual kernels that are deployed in hardware. 
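// ---------------------------------------------------------------------------------------------
// Illustrative host-side sketch (not part of the generated project): one way the DMA kernels
// defined in this file could be launched around the top-level kernel in the accelerator (BSP)
// flow. The pipe names (InputPipe/OutputPipe), the float element type, the template-argument
// order and the beat counts are assumptions made only for this example; the generated test
// bench is the reference. Assumes firmware/myproject.h and this header are included.
template <class InputPipe, class OutputPipe>
void run_inference_example(sycl::queue &q, float *in, float *out, size_t n_in_beats, size_t n_out_beats) {
    // `in`/`out` are assumed to be USM host allocations (e.g. sycl::malloc_host<float>) so the
    // DMA kernels can reach them through host pointers when IS_BSP is defined.
    q.single_task(nnet::DMA_convert_data<float, InputPipe>{in, n_in_beats});          // feed the input pipe
    q.single_task(MyProject{});                                                       // run the network
    q.single_task(nnet::DMA_convert_data_back<float, OutputPipe>{out, n_out_beats});  // drain the output pipe
    q.wait();
}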
+ +namespace nnet { + +////////////////////////////////////////////////////////////////////////////// +// These are the simple, testbench-only versions +////////////////////////////////////////////////////////////////////////////// +template void convert_data(sycl::queue &q, srcType *src) { + using PipeDataType = typename nnet::ExtractPipeType::value_type; + using SrcDataType = typename nnet::ExtractDataType::value_type; + constexpr auto dstTypeSize = std::tuple_size{}; + for (size_t i = 0; i < SIZE / dstTypeSize; i++) { + PipeDataType packet; + for (size_t j = 0; j < dstTypeSize; j++) { + packet.data[j] = src[i * dstTypeSize + j]; + } + packet.sop = (i == 0); + packet.eop = (i == (SIZE / dstTypeSize - 1)); + dest_pipe::write(q, packet); + } +} + +template void convert_data_back(sycl::queue &q, dstType *dst) { + using PipeDataType = typename nnet::ExtractPipeType::value_type; + using SrcDataType = typename nnet::ExtractDataType::value_type; + constexpr auto srcTypeSize = std::tuple_size{}; + for (size_t i = 0; i < SIZE / srcTypeSize; i++) { + auto packet = src_pipe::read(q); + for (size_t j = 0; j < srcTypeSize; j++) { + dst[i * srcTypeSize + j] = packet.data[j].to_double(); + } + } +} + +////////////////////////////////////////////////////////////////////////////// +// The ones below can be used both in testbenches and in the accelerator flow +////////////////////////////////////////////////////////////////////////////// +#if !defined(IS_BSP) +// Definition for buffer locations for Avalon MM host. +inline constexpr unsigned kInputBufferLocation = 0; +inline constexpr unsigned kOutputBufferLocation = 1; +#endif + +// Implementation of a direct memory access kernel. Move data from source, convert, +// and send to the sink. Adaptive to SYCL HLS and hardware acceleration flow. +template struct DMA_convert_data { +#if !defined(IS_BSP) + // When targeting a device family, we instantiate an Avalon Memory Mapped Host for + // data transaction between host and the DMA kernel during emulation and simulation. + sycl::ext::oneapi::experimental::annotated_arg< + src_T *, + decltype(sycl::ext::oneapi::experimental::properties{ + sycl::ext::intel::experimental::latency<0>, sycl::ext::intel::experimental::dwidth<16>, + sycl::ext::intel::experimental::buffer_location, + sycl::ext::intel::experimental::read_write_mode_read, sycl::ext::intel::experimental::wait_request_requested})> +#else + // When targeting oneAPI BSP, we can use USM pointer to access host memory. + src_T *const +#endif + src; + size_t num_iteration; + + [[intel::kernel_args_restrict]] void operator()() const { + +#if defined(IS_BSP) + // Access data using host pointer. + sycl::ext::intel::host_ptr src_ptr(src); +#else + // Host allocation is not supported when targeting an FPGA family or part number. + src_T *src_ptr(src); +#endif + // First, extract the PipeDataT from the pipe + using PipeDataType = typename nnet::ExtractPipeType::value_type; + // Then, extract the DataT from StreamingBeat + using DstDataType = typename nnet::ExtractDataType::value_type; + constexpr auto dstTypeSize = std::tuple_size{}; + + [[intel::fpga_register]] typename nnet::ExtractPipeType::value_type packet; + + // Keep sending data to the input layer and keep the kernels running. + for (size_t i = 0; i < num_iteration; i++) { + #pragma unroll + for (size_t j = 0; j < dstTypeSize; j++) { + packet.data[j] = src_ptr[i * dstTypeSize + j]; + } + packet.sop = (i == 0); + // Assert end-of-packet signal after the last iteration. 
+ // All down-stream kernels will stop seeing eop. + packet.eop = (i == (num_iteration - 1)); + dest_pipe::write(packet); + } + } +}; + +// Symmetrical to the DMA_convert_data above, this DMA drains the output pipe and +// writes result to memory. +template struct DMA_convert_data_back { +#if !defined(IS_BSP) + // Without BSP, instantiate an Avalon Memory Mapped Host to write to host. + sycl::ext::oneapi::experimental::annotated_arg< + dst_T *, + decltype(sycl::ext::oneapi::experimental::properties{ + sycl::ext::intel::experimental::latency<0>, sycl::ext::intel::experimental::dwidth<16>, + sycl::ext::intel::experimental::buffer_location, + sycl::ext::intel::experimental::read_write_mode_write, sycl::ext::intel::experimental::wait_request_requested})> +#else + // USM pointer, otherwise. + dst_T *const +#endif + dst; + size_t num_iteration; + + [[intel::kernel_args_restrict]] void operator()() const { +#if defined(IS_BSP) + sycl::ext::intel::host_ptr dst_ptr(dst); +#else + dst_T *dst_ptr(dst); +#endif + // First, extract the PipeDataT from the pipe + using PipeDataType = typename nnet::ExtractPipeType::value_type; + // Then, extract the DataT from StreamingBeat + using SrcDataType = typename nnet::ExtractDataType::value_type; + constexpr auto srcTypeSize = std::tuple_size{}; + + [[intel::fpga_register]] typename nnet::ExtractPipeType::value_type packet; + + // Drain the output pipe and write result to memory. + for (size_t i = 0; i < num_iteration; i++) { + packet = src_pipe::read(); + #pragma unroll + for (size_t j = 0; j < srcTypeSize; j++) { + dst_ptr[i * srcTypeSize + j] = static_cast(packet.data[j].to_double()); + } + } + } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h index 92c9adc3bb..c1c6d726e8 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h @@ -7,15 +7,34 @@ namespace nnet { -// Note: DataPack logic removed, at least in the initial version +// Restartable streaming kernel implementation. +// Computation is carried out in a while-1 loop as long as there is valid input. +// The loop breaks when the end-of-packet signal is asserted by upstream task. 
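For orientation, the sketch below shows the restartable pattern that ``dense_resource_stream`` and the other streaming layers in this patch follow; it uses placeholder pipes and simply copies the payload where a real layer would compute.

.. code-block:: cpp

   // Generic restartable streaming layer (illustrative only).
   template <class data_pipe, class res_pipe> void passthrough_stream() {
       using BeatT = typename nnet::ExtractPipeType<res_pipe>::value_type;
       [[intel::fpga_register]] BeatT out_beat;

       bool keep_going = true;
       [[intel::initiation_interval(1)]] while (keep_going) {
           auto in_beat = data_pipe::read(); // one StreamingBeat per read
           out_beat.data = in_beat.data;     // per-layer computation goes here
           out_beat.sop = in_beat.sop;       // forward start-of-packet
           out_beat.eop = in_beat.eop;       // forward end-of-packet
           res_pipe::write(out_beat);
           keep_going = !in_beat.eop;        // stop after passing on the last beat
       }
   }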
template -void dense_resource_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) { +[[intel::use_stall_enable_clusters]] void dense_resource_stream(const typename CONFIG_T::weight_t weights, + const typename CONFIG_T::bias_t biases) { + using namespace nnet; + using DataT = typename ExtractDataType::value_type>::value_type; + using ResT = typename ExtractDataType::value_type>::value_type; - [[intel::fpga_register]] typename ExtractPipeType::value_type res; - [[intel::fpga_register]] auto data = data_pipe::read(); - dense_resource::value_type, typename ExtractPipeType::value_type, - CONFIG_T>(data, res, weights, biases); - res_pipe::write(res); + [[intel::fpga_register]] typename ExtractPipeType::value_type resbeat; + + bool keep_going = true; + bool did_read_input; + [[intel::initiation_interval(1)]] while (keep_going) { + did_read_input = false; + [[intel::fpga_register]] auto databeat = data_pipe::read(did_read_input); + + if (did_read_input) { + dense_resource(databeat.data, resbeat.data, weights, biases); + + resbeat.sop = databeat.sop; + resbeat.eop = databeat.eop; + + res_pipe::write(resbeat); + keep_going = !databeat.eop; + } + } } } // namespace nnet diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h index c7af2e7a68..e5b451655a 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h @@ -12,27 +12,6 @@ namespace nnet { -template void convert_data(sycl::queue &q, srcType *src) { - constexpr auto dstTypeSize = std::tuple_size::value_type>{}; - for (size_t i = 0; i < SIZE / dstTypeSize; i++) { - typename ExtractPipeType::value_type ctype; - for (size_t j = 0; j < dstTypeSize; j++) { - ctype[j] = src[i * dstTypeSize + j]; - } - dest_pipe::write(q, ctype); - } -} - -template void convert_data_back(sycl::queue &q, dstType *dst) { - constexpr auto srcTypeSize = std::tuple_size::value_type>{}; - for (size_t i = 0; i < SIZE / srcTypeSize; i++) { - auto ctype = src_pipe::read(q); - for (size_t j = 0; j < srcTypeSize; j++) { - dst[i * srcTypeSize + j] = ctype[j].to_double(); - } - } -} - extern bool trace_enabled; extern std::map *trace_outputs; extern size_t trace_type_size; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h index 8cf883c1d5..a35bba17bf 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h @@ -8,6 +8,8 @@ #include #include +#include // Streaming Beat and pipe properties. + namespace nnet { // Define the pipe type that we use @@ -34,6 +36,15 @@ struct ExtractPipeType struct ExtractDataType { typedef T value_type; }; + +// Specialization on oneAPI StreamingBeat type. 
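The specialization that follows strips the ``StreamingBeat`` wrapper so kernels can recover the underlying payload type. Combined with ``ExtractPipeType``, the typical idiom is (the pipe name is a placeholder):

.. code-block:: cpp

   // Peel the beat type off a pipe, then the payload type off the beat (illustrative).
   using BeatT = typename nnet::ExtractPipeType<input_pipe>::value_type; // StreamingBeat wrapper
   using DataT = typename nnet::ExtractDataType<BeatT>::value_type;      // nnet::array payload
   constexpr auto datasize = std::tuple_size<DataT>{};                   // elements per beat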
+template +struct ExtractDataType> { + typedef DataT value_type; +}; + /* * HLS Shift Register Implementation * To verify a shift register is used in hardware, go to report.html > Area Analysis of System diff --git a/hls4ml/templates/oneapi/myproject_bridge.cpp b/hls4ml/templates/oneapi/myproject_bridge.cpp index ddad1d054b..fa73db7c2a 100644 --- a/hls4ml/templates/oneapi/myproject_bridge.cpp +++ b/hls4ml/templates/oneapi/myproject_bridge.cpp @@ -2,7 +2,7 @@ #define MYPROJECT_BRIDGE_H_ #include "firmware/myproject.h" -#include "firmware/nnet_utils/nnet_helpers.h" +#include "firmware/nnet_utils/nnet_data_movement.h" #include #include diff --git a/hls4ml/templates/oneapi/myproject_test.cpp b/hls4ml/templates/oneapi/myproject_test.cpp index 82fb60d2f8..f0de1875fa 100644 --- a/hls4ml/templates/oneapi/myproject_test.cpp +++ b/hls4ml/templates/oneapi/myproject_test.cpp @@ -4,9 +4,11 @@ #include #include #include +#include #include #include "firmware/myproject.h" +#include "firmware/nnet_utils/nnet_data_movement.h" #include "firmware/parameters.h" #include @@ -20,13 +22,70 @@ #define CHECKPOINT 5000 +#if not defined(IS_BSP) +using sycl::ext::intel::experimental::property::usm::buffer_location; +#endif + +// Functions that reads input and prediction data from files. +// Returns `true` if files are read successfully and not empty. +// Returns `false` otherwise. +bool prepare_data_from_file(std::string &fin_path, std::string &fpr_path, std::vector> &inputs, + std::vector> &predictions) { + // load input data from text file + std::ifstream fin(fin_path.c_str()); + // load predictions from text file + std::ifstream fpr(fpr_path.c_str()); + + std::string iline; + std::string pline; + + if (fin.is_open() && fpr.is_open()) { + size_t num_iterations = 0; + + // Prepare input data from file. Load predictions from file. 
+ for (; std::getline(fin, iline) && std::getline(fpr, pline); num_iterations++) { + if (num_iterations % CHECKPOINT == 0) { + std::cout << "Processing input " << num_iterations << std::endl; + } + + std::vector in; + std::vector pr; + float current; + + std::stringstream ssin(iline); + while (ssin >> current) { + in.push_back(current); + } + + std::stringstream sspred(pline); + while (sspred >> current) { + pr.push_back(current); + } + + std::copy(pr.cbegin(), pr.cend(), predictions.back().begin()); + std::copy(in.cbegin(), in.cend(), inputs.back().begin()); + } + fin.close(); + fpr.close(); + if (inputs.empty()) + return false; + else + return true; + } else { + return false; + } +} + int main(int argc, char **argv) { #if FPGA_SIMULATOR +#define NUM_ITERATIONS 5 auto selector = sycl::ext::intel::fpga_simulator_selector_v; #elif FPGA_HARDWARE +#define NUM_ITERATIONS 100 auto selector = sycl::ext::intel::fpga_selector_v; #else // #if FPGA_EMULATOR +#define NUM_ITERATIONS 10 auto selector = sycl::ext::intel::fpga_emulator_selector_v; #endif @@ -44,93 +103,108 @@ int main(int argc, char **argv) { std::cout << "Running on device: " << device.get_info().c_str() << std::endl; - // load input data from text file - std::ifstream fin("tb_data/tb_input_features.dat"); - // load predictions from text file - std::ifstream fpr("tb_data/tb_output_predictions.dat"); - + std::string INPUT_FILE = "tb_data/tb_input_features.dat"; + std::string PRED_FILE = "tb_data/tb_output_predictions.dat"; std::string RESULTS_LOG = "tb_data/results.log"; std::ofstream fout(RESULTS_LOG); - std::string iline; - std::string pline; - - if (fin.is_open() && fpr.is_open()) { - std::vector> predictions; - unsigned int iteration = 0; - for (; std::getline(fin, iline) && std::getline(fpr, pline); iteration++) { - if (iteration % CHECKPOINT == 0) { - std::cout << "Processing input " << iteration << std::endl; - } + // Allocate vectors on stack to hold data from files temporarily. + std::vector> inputs; + std::vector> predictions; + bool file_valid = prepare_data_from_file(INPUT_FILE, PRED_FILE, inputs, predictions); + unsigned int num_iterations; + if (file_valid) { + num_iterations = inputs.size(); + } else { + num_iterations = NUM_ITERATIONS; + } - std::vector in; - std::vector pr; - float current; + // hls-fpga-machine-learning insert runtime contant - std::stringstream ssin(iline); - while (ssin >> current) { - in.push_back(current); - } + try { +#if defined(IS_BSP) + // Allocate host memory if BSP is in use. + float *vals = sycl::malloc_host(kInputSz, q); + if (vals == nullptr) { + std::cerr << "ERROR: host allocation failed for input\n"; + fout.close(); + return 1; + } + float *outputs = sycl::malloc_host(kOutputSz, q); + if (outputs == nullptr) { + std::cerr << "ERROR: host allocation failed for output\n"; + fout.close(); + return 1; + } +#else + float *vals = + sycl::malloc_shared(kInputSz, q, sycl::property_list{buffer_location(nnet::kInputBufferLocation)}); + float *outputs = + sycl::malloc_shared(kOutputSz, q, sycl::property_list{buffer_location(nnet::kOutputBufferLocation)}); +#endif - std::stringstream sspred(pline); - while (sspred >> current) { - pr.push_back(current); - } + if (file_valid) { + // Start always-run streaming kernel here, instead of inside a loop. 
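            // Design note (illustrative): the removed code enqueued MyProject once per input
            // sample; with the restartable kernels the network and the two DMA movers are each
            // launched a single time, and the eop beat injected by DMA_convert_data is what
            // eventually stops MyProject and every downstream layer.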
+ q.single_task(MyProject{}); // hls-fpga-machine-learning insert data - q.single_task(MyProject{}); - // hls-fpga-machine-learning convert output - std::copy(pr.cbegin(), pr.cend(), predictions.back().begin()); - - for (auto outval : outputs) { - fout << outval << " "; - } - fout << std::endl; - if (iteration % CHECKPOINT == 0) { - std::cout << "Predictions" << std::endl; - // hls-fpga-machine-learning insert predictions - for (auto predval : pr) { - std::cout << predval << " "; + // Print output from kernel and from prediction file. + for (int i = 0; i < num_iterations; i++) { + for (int j = 0; j < kOutLayerSize; j++) { + fout << outputs[i * kOutLayerSize + j] << " "; } - std::cout << std::endl; - std::cout << "Quantized predictions" << std::endl; - // hls-fpga-machine-learning insert quantized - for (auto outval : outputs) { - std::cout << outval << " "; + fout << std::endl; + if (i % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + for (auto predval : predictions[i]) { + std::cout << predval << " "; + } + std::cout << std::endl; + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + for (int j = 0; j < kOutLayerSize; j++) { + std::cout << outputs[i * kOutLayerSize + j] << " "; + } + std::cout << std::endl; } - std::cout << std::endl; } - } - fin.close(); - fpr.close(); - } else { - const unsigned int num_iterations = 10; - std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations - << " invocations." << std::endl; - - // hls-fpga-machine-learning insert top-level-function - for (int i = 0; i < num_iterations; i++) { - // hls-fpga-machine-learning insert zero + } else { + std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations + << " invocations." << std::endl; q.single_task(MyProject{}); + // hls-fpga-machine-learning insert top-level-function + // hls-fpga-machine-learning insert zero // hls-fpga-machine-learning convert output - for (auto outval : outputs) { - std::cout << outval << " "; - } - std::cout << std::endl; - - for (auto outval : outputs) { - fout << outval << " "; + for (int i = 0; i < num_iterations; i++) { + for (int j = 0; j < kOutLayerSize; j++) { + std::cout << outputs[i * kOutLayerSize + j] << " "; + fout << outputs[i * kOutLayerSize + j] << " "; + } + std::cout << std::endl; + fout << std::endl; } - fout << std::endl; } + sycl::free(vals, q); + sycl::free(outputs, q); + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + } catch (sycl::exception const &e) { + // Catches exceptions in the host code. + std::cerr << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! 
+ if (e.code().value() == CL_DEVICE_NOT_FOUND) { + std::cerr << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cerr << "Run sys_check in the oneAPI root directory to verify.\n"; + std::cerr << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); } - q.wait(); - - fout.close(); - std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; - return 0; } diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index e93f8b5ca3..98a62a8c4a 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -137,8 +137,11 @@ def write_project_cpp(self, model): elif '// hls-fpga-machine-learning read in' in line: newline = line if io_type == 'io_parallel': + restartable_kernel_loop = f"bool keep_going = true;\n\n" f"{indent}while (keep_going) {{\n" + newline += indent + restartable_kernel_loop for inp in model_inputs: - newline += indent + f'auto {inp.name} = {inp.pipe_name}::read();\n' + newline += indent * 2 + f'auto {inp.name}_beat = {inp.pipe_name}::read();\n' + newline += indent * 2 + f'auto {inp.name} = {inp.name}_beat.data;\n' # for streaming we don't need to read it in # Insert weights @@ -151,16 +154,21 @@ def write_project_cpp(self, model): # Insert task sequences elif '// hls-fpga-machine-learning declare task sequences' in line: - newline = line if io_type == 'io_stream': # only need this for io_stream + newline = line for layer in model.get_layers(): ts = layer.get_attr('tast_sequence_cpp') if ts: newline += ' ' + ts + '\n' + else: + newline = indent + line # Neural net instantiation elif '// hls-fpga-machine-learning insert layers' in line: - newline = line + '\n' + if io_type == 'io_parallel': + newline = indent + line + '\n' + else: + newline = line + '\n' for layer in model.get_layers(): if io_type != 'io_stream': vars = layer.get_variables() @@ -168,14 +176,14 @@ def write_project_cpp(self, model): if var not in model_inputs: def_cpp = var.definition_cpp() if def_cpp is not None: - newline += ' ' + def_cpp + ';\n' + newline += indent * 2 + def_cpp + ';\n' func = ( layer.get_attr('function_cpp') if io_type == 'io_parallel' else layer.get_attr('stream_function_cpp') ) if func: - newline += ' ' + func + '\n' + newline += (indent * 2 if io_type == 'io_parallel' else indent) + func + '\n' if model.config.trace_output and layer.get_attr('trace', False): newline += '#ifndef HLS_SYNTHESIS\n' for var in vars: @@ -188,8 +196,17 @@ def write_project_cpp(self, model): elif '// hls-fpga-machine-learning return' in line: newline = line if io_type == 'io_parallel': + newline = indent + newline for out in model_outputs: - newline += indent + f'{out.pipe_name}::write({out.name});\n' + out_beat = f"{out.name}_beat" + newline += ( + indent * 2 + f'typename nnet::ExtractPipeType<{out.pipe_name}>::value_type {out_beat};\n' + ) + newline += indent * 2 + f'{out_beat}.data = {out.name};\n' + newline += indent * 2 + f'{out.pipe_name}::write({out_beat});\n' + newline += indent * 2 + '// stops the kernel when the last input seen.\n' + newline += indent * 2 + f'keep_going = !{model_inputs[0].name}_beat.eop;\n' + newline += f"{indent}}}\n" # don't need to add anything in io_stream # Just copy line @@ -396,27 +413,39 @@ def write_test_bench(self, model): newline = line for bram in model_brams: newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + elif '// hls-fpga-machine-learning insert runtime contant' in line: + 
newline = line + insert_constant_lines = ( + f'{indent}const size_t kInputSz = {model_inputs[0].size_cpp()} * num_iterations;\n' + f'{indent}const size_t kOutputSz = {model_outputs[0].size_cpp()} * num_iterations;\n' + f'{indent}const size_t kInputLayerSize = {model_inputs[0].size_cpp()};\n' + f'{indent}const size_t kOutLayerSize = {model_outputs[0].size_cpp()};\n' + ) + newline += insert_constant_lines elif '// hls-fpga-machine-learning insert zero' in line: newline = line inp = model_inputs[0] - newline += indent + f'float vals[{inp.size_cpp()}]; \n' - newline += indent + f'for (int j = 0 ; j < {inp.size_cpp()} ; j++) {{\n' - newline += indent + ' vals[j] = 0.0; \n' - newline += indent + '}\n' - newline += indent + f'nnet::convert_data(q, vals);\n' + insert_zero_lines = ( + f'{indent}for (int j = 0 ; j < kInputSz; j++)\n' + f'{indent} vals[j] = 0.0;\n' + f'{indent}q.single_task(nnet::DMA_convert_data{{vals, num_iterations}});\n' + ) + newline += insert_zero_lines elif '// hls-fpga-machine-learning insert data' in line: newline = line inp = model_inputs[0] - newline += indent + f'float vals[{inp.size_cpp()}]; \n' - newline += indent + f'for (int j = 0 ; j < {inp.size_cpp()} ; j++) {{\n' - newline += indent + ' vals[j] = in[j]; \n' - newline += indent + '}\n' - newline += indent + f'nnet::convert_data(q, vals);\n' + insert_data_lines = ( + f'{indent}for (int i = 0; i < num_iterations; i++)\n' + f'{indent} for (int j = 0 ; j < kInputLayerSize; j++)\n' + f'{indent} vals[i * kInputLayerSize + j] = inputs[i][j]; \n' + f'{indent}q.single_task(nnet::DMA_convert_data{{vals, num_iterations}});\n' + ) + newline += insert_data_lines elif '// hls-fpga-machine-learning convert output' in line: newline = line out = model_outputs[0] - newline += indent + f'float outputs[{out.size_cpp()}];\n' - newline += indent + f'nnet::convert_data_back<{out.pipe_name}, float, {out.size_cpp()}>(q, outputs);\n' + newline += f'{indent}q.single_task(nnet::DMA_convert_data_back<{out.pipe_name}, float>' + newline += '{outputs, num_iterations}).wait();\n' else: newline = line @@ -528,6 +557,10 @@ def write_build_script(self, model): if 'set(FPGA_DEVICE' in line: line = f' set(FPGA_DEVICE "{device}")\n' + if model.config.get_config_value('UseOneAPIBSP'): + if 'hls-fpga-machine-learning insert oneapi_bsp_cmake_flag' in line: + line = 'set(BSP_FLAG "-DIS_BSP")' + fout.write(line) def write_nnet_utils(self, model):
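For a concrete picture of what the test-bench hooks above emit, the generated section for a hypothetical model with 16 inputs and 5 outputs would look roughly as follows; the pipe names and the template arguments on ``DMA_convert_data`` are assumptions, everything else follows the f-strings above.

.. code-block:: cpp

   // hls-fpga-machine-learning insert runtime contant
   const size_t kInputSz = 16 * num_iterations;
   const size_t kOutputSz = 5 * num_iterations;
   const size_t kInputLayerSize = 16;
   const size_t kOutLayerSize = 5;

   // hls-fpga-machine-learning insert data
   for (int i = 0; i < num_iterations; i++)
       for (int j = 0 ; j < kInputLayerSize; j++)
           vals[i * kInputLayerSize + j] = inputs[i][j];
   q.single_task(nnet::DMA_convert_data<layer0_input_pipe, float>{vals, num_iterations});

   // hls-fpga-machine-learning convert output
   q.single_task(nnet::DMA_convert_data_back<layer_out_pipe, float>{outputs, num_iterations}).wait();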