diff --git a/docs/api/configuration.rst b/docs/api/configuration.rst
index 1bc8f0676c..aa24e07329 100644
--- a/docs/api/configuration.rst
+++ b/docs/api/configuration.rst
@@ -101,6 +101,20 @@ Finally, one then uses the configuration to create an hls model:
         backend='Vitis'
     )
 
+To target a oneAPI Board Support Package (BSP) enabled FPGA for offload acceleration, set the ``part`` argument to the path to your BSP and the BSP variant, and set ``use_oneapi_bsp=True``.
+
+.. code-block:: python
+
+    hls_model = hls4ml.converters.convert_from_keras_model(
+        model,
+        hls_config=config,
+        output_dir="my_project_dir",
+        io_type="io_parallel",
+        backend="oneAPI",
+        part="/path/to/my/bsp:bsp_variant",
+        use_oneapi_bsp=True
+    )
+
 See :py:class:`~hls4ml.converters.convert_from_keras_model` for more information on the various options. Similar functions exist for ONNX and PyTorch.
 
 ----
 
@@ -132,6 +146,9 @@ It looks like this:
     ClockPeriod: 5
     IOType: io_parallel # options: io_parallel/io_stream
 
+    # oneAPI Offload Acceleration flag.
+    UseOneAPIBSP: True
+
     HLSConfig:
         Model:
             Precision: fixed<16,6>
@@ -168,6 +185,10 @@ For Vivado backend the options are:
 * **PipelineInterval**\ : Optionally override the desired initiation interval of the design. Only valid in combination with "pipeline" style. If unspecified, it is left to the compiler to decide, ideally matching the largest reuse factor of the network.
 * **Precision**\ : this defines the precision of your inputs, outputs, weights and biases. It is denoted by ``fixed<X,Y>``\ , where ``Y`` is the number of bits representing the signed number above the binary point (i.e. the integer part), and ``X`` is the total number of bits. Additionally, integers in the type (\ ``int<N>``\ , where ``N`` is a bit-size from 1 to 1024) can also be used. The format follows ``ap_fixed`` and ``ap_int`` conventions. You have a chance to further configure this more finely with per-layer configuration described below. In the per-layer configuration (but not globally) one can also use ``'auto'`` precision.
 
+For the oneAPI backend the options are similar but also include:
+
+* **UseOneAPIBSP**\ : set this to ``True`` to enable offload acceleration with an Altera FPGA. The path to the oneAPI Board Support Package and the BSP variant are given through the ``part`` option, as shown above. This is only needed if you are using oneAPI in the accelerator style.
+
 2.2 Per-Layer Configuration
 ---------------------------
 
diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py
index a4000529c3..17654810c8 100644
--- a/hls4ml/backends/oneapi/oneapi_backend.py
+++ b/hls4ml/backends/oneapi/oneapi_backend.py
@@ -130,11 +130,13 @@ def get_default_flow(self):
     def get_writer_flow(self):
         return self._writer_flow
 
-    def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_parallel', write_tar=False, **_):
+    def create_initial_config(
+        self, part='Agilex7', clock_period=5, io_type='io_parallel', write_tar=False, use_oneapi_bsp=False, **_
+    ):
         """Create initial configuration of the oneAPI backend.
 
         Args:
-            part (str, optional): The FPGA part to be used. Defaults to 'Arria10'.
+            part (str, optional): The FPGA part to be used. Defaults to 'Agilex7'.
             clock_period (int, optional): The clock period. Defaults to 5.
             io_type (str, optional): Type of implementation used. One of 'io_parallel' or 'io_stream'.
                 Defaults to 'io_parallel'.
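Tying the documentation and backend changes above together, a complete minimal YAML configuration for the accelerator (BSP) flow might look like the sketch below. The keys mirror the snippet shown in the documentation; the BSP path and the precision/reuse values are placeholders rather than values taken from this change.

.. code-block:: yaml

    Backend: oneAPI
    Part: /path/to/my/bsp:bsp_variant
    ClockPeriod: 5
    IOType: io_parallel # options: io_parallel/io_stream

    # oneAPI Offload Acceleration flag.
    UseOneAPIBSP: True

    HLSConfig:
        Model:
            Precision: fixed<16,6>
            ReuseFactor: 1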
@@ -146,7 +148,7 @@ def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_para config = {} - config['Part'] = part if part is not None else 'Arria10' + config['Part'] = part if part is not None else 'Agilex7' config['ClockPeriod'] = clock_period config['IOType'] = io_type config['HLSConfig'] = {} @@ -154,7 +156,7 @@ def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_para # TODO: add namespace 'WriteTar': write_tar, } - + config['UseOneAPIBSP'] = use_oneapi_bsp return config def compile(self, model): diff --git a/hls4ml/backends/oneapi/oneapi_types.py b/hls4ml/backends/oneapi/oneapi_types.py index 3106e1e10d..410057e062 100644 --- a/hls4ml/backends/oneapi/oneapi_types.py +++ b/hls4ml/backends/oneapi/oneapi_types.py @@ -170,11 +170,18 @@ def definition_cpp(self, name_suffix='', as_reference=False): else: return f'{self.type.name} {self.name}{name_suffix}' - def declare_cpp(self, pipe_min_size=0, indent=''): - lines = indent + f'class {self.pipe_id};\n' - lines += indent + ( - f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' - + f'{self.type.name}, {pipe_min_size}, PipeProps>;\n' + # Updated pipe min size to be 32 for simulation. + def declare_cpp(self, pipe_min_size=32, indent=''): + # Updated to use streaming beat for restartable streaming kernel. + # Streaming beat is a wrapper type of the actual type with sideband control signals. + # Syntax: using BeatT = sycl::ext::intel::experimental::StreamingBeat; + streaming_beat_t = f"{self.pipe_name}BeatT" + lines = ( + f"{indent}class {self.pipe_id};\n" + f"{indent}using {streaming_beat_t} = " + f"sycl::ext::intel::experimental::StreamingBeat<{self.type.name}, true, true>;\n" + f"{indent}using {self.pipe_name} = sycl::ext::intel::experimental::pipe<" + f"{self.pipe_id}, {streaming_beat_t}, {pipe_min_size}, HostPipePropertiesT>;\n" ) return lines @@ -193,10 +200,13 @@ def definition_cpp(self, name_suffix='', as_reference=True): return f'{self.name}{name_suffix}' def declare_cpp(self, indent=''): - lines = indent + f'class {self.pipe_id};\n' - lines += indent + ( - f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, ' - + f'{self.type.name}, {self.pragma[-1]}>;\n' + streaming_beat_t = f"{self.pipe_name}BeatT" + lines = ( + f"{indent}class {self.pipe_id};\n" + f"{indent}using {streaming_beat_t} = " + f"sycl::ext::intel::experimental::StreamingBeat<{self.type.name}, true, true>;\n" + f"{indent}using {self.pipe_name} = " + f"sycl::ext::intel::experimental::pipe<{self.pipe_id}, {streaming_beat_t}, {self.pragma[-1]}>;\n" ) return lines diff --git a/hls4ml/templates/oneapi/CMakeLists.txt b/hls4ml/templates/oneapi/CMakeLists.txt index e2b386d70d..fe15197210 100644 --- a/hls4ml/templates/oneapi/CMakeLists.txt +++ b/hls4ml/templates/oneapi/CMakeLists.txt @@ -38,15 +38,20 @@ set(LIBRARY_NAME myproject-${LIB_STAMP}) # You can also specify a device family (E.g. "Arria10" or "Stratix10") or a # specific part number (E.g. "10AS066N3F40E2SG") to generate a standalone IP. if(NOT DEFINED FPGA_DEVICE) - set(FPGA_DEVICE "Arria10") + set(FPGA_DEVICE "Agilex7") + set(BSP_FLAG "") endif() +# Set the target to a BSP if we target an actual accelerator board. +# hls-fpga-machine-learning insert oneapi_bsp_cmake_flag + # Use cmake -DUSER_FPGA_FLAGS= to set extra flags for FPGA backend # compilation. 
-set(USER_FPGA_FLAGS -Wno-unused-label ${USER_FPGA_FLAGS}) +# -Xsoptimize=latency Turns off the hyper-optimized handshake +set(USER_FPGA_FLAGS -Wno-unused-label;${USER_FPGA_FLAGS};-Xsoptimize=latency) # Use cmake -DUSER_FLAGS= to set extra flags for general compilation. -set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS}) +set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS} ${BSP_FLAG}) # Use cmake -DUSER_INCLUDE_PATHS= to set extra paths for general # compilation. diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h index 082ae5dc8c..1dc86c0cb6 100644 --- a/hls4ml/templates/oneapi/firmware/myproject.h +++ b/hls4ml/templates/oneapi/firmware/myproject.h @@ -5,8 +5,12 @@ // This file defines the interface to the kernel -// currently this is fixed -using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext::intel::experimental::ready_latency<0>)); +// Pipe properties for host pipes. Host pipes connect to the data source DMA and sink DMA. +// They are connected to the first and the last layer to stream data into and out from the kernel. +using HostPipePropertiesT = decltype(sycl::ext::oneapi::experimental::properties( + sycl::ext::intel::experimental::ready_latency<0>, sycl::ext::intel::experimental::bits_per_symbol<16>, + sycl::ext::intel::experimental::uses_valid, sycl::ext::intel::experimental::first_symbol_in_high_order_bits, + sycl::ext::intel::experimental::protocol_avalon_streaming_uses_ready)); // Need to declare the input and output pipes @@ -16,12 +20,15 @@ using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext class MyProjectID; struct MyProject { - +#ifndef IS_BSP // kernel property method to config invocation interface auto get(sycl::ext::oneapi::experimental::properties_tag) { return sycl::ext::oneapi::experimental::properties{sycl::ext::intel::experimental::streaming_interface<>, sycl::ext::intel::experimental::pipelined<>}; } +#else + // kernel properties and pipelining is not supported in BSP (accelerator style). 
+#endif SYCL_EXTERNAL void operator()() const; }; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h index 13de5ab3bb..ebccee1f77 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h @@ -9,72 +9,93 @@ namespace nnet { // ************************************************* // Linear Activation // ************************************************* -template void linear_stream() { +template [[intel::use_stall_enable_clusters]] void linear_stream() { + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; LinearActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - LinearPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - out_data[j] = in_data[j]; - } + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); + + LinearPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + out_data.data[j] = in_data.data[j]; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); - res_pipe::write(out_data); + keep_going = !in_data.eop; + } } } // ************************************************* // ReLU Activation // ************************************************* -template void relu_stream() { +template [[intel::use_stall_enable_clusters]] void relu_stream() { + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; ReLUActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - ReLUPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - if (in_data[j] > 0) - out_data[j] = in_data[j]; - else - out_data[j] = 0; - } + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + [[intel::fpga_register]] auto in_data = data_pipe::read(); + ReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + if (in_data.data[j] > 0) + out_data.data[j] = in_data.data[j]; + else + out_data.data[j] = 0; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); - res_pipe::write(out_data); + keep_going = !in_data.eop; + } } } // ************************************************* // Leaky RELU Activation // ************************************************* -template void leaky_relu_stream(typename CONFIG_T::param_t alpha) { - constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; - +template +[[intel::use_stall_enable_clusters]] void leaky_relu_stream(typename CONFIG_T::param_t alpha) { + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, 
CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; LeakyReLUActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - LeakyReLUPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - if (in_data[j] > 0) - out_data[j] = in_data[j]; - else - out_data[j] = alpha * in_data[j]; - } + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); + + LeakyReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + if (in_data.data[j] > 0) + out_data.data[j] = in_data.data[j]; + else + out_data.data[j] = alpha * in_data.data[j]; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); - res_pipe::write(out_data); + keep_going = !in_data.eop; + } } } @@ -82,186 +103,231 @@ template void leaky_relu_st // Thresholded RELU Activation // ************************************************* template -void thresholded_relu_stream(typename CONFIG_T::param_t theta) { +[[intel::use_stall_enable_clusters]] void thresholded_relu_stream(typename CONFIG_T::param_t theta) { + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; + ThresholdedReLUActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - ThresholdedReLUPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - if (in_data[j] > theta) - out_data[j] = in_data[j]; - else - out_data[j] = 0; - } + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); - res_pipe::write(out_data); + ThresholdedReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + if (in_data.data[j] > theta) + out_data.data[j] = in_data.data[j]; + else + out_data.data[j] = 0; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } // ************************************************* // ELU Activation // ************************************************* -template void elu_stream(typename CONFIG_T::param_t alpha) { +template +[[intel::use_stall_enable_clusters]] void elu_stream(typename CONFIG_T::param_t alpha) { #include "activation_tables/elu_table.tb" + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; - + bool keep_going = true; 
EluActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - EluPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type datareg = in_data[j]; - if (datareg >= 0) { - out_data[j] = datareg; - } else { - int index = (datareg * CONFIG_T::table_size / -8).to_int(); - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - out_data[j] = alpha * elu_table[index]; + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); + + EluPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + [[intel::fpga_register]] auto datareg = in_data.data[j]; + if (datareg >= 0) { + out_data.data[j] = datareg; + } else { + int index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data.data[j] = alpha * elu_table[index]; + } } - } - res_pipe::write(out_data); + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } // ************************************************* // SeLU Activation // ************************************************* -template void selu_stream() { +template [[intel::use_stall_enable_clusters]] void selu_stream() { #include "activation_tables/selu_table.tb" + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; + + constexpr ac_fixed<16, 1, false, AC_RND> scale = 1.0507009873554804934193349852946; SeluActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - SeluPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type datareg = in_data[j]; - if (datareg >= 0) { - out_data[j] = - typename ExtractPipeType::value_type::value_type(1.0507009873554804934193349852946) * datareg; - } else { - int index = (datareg * CONFIG_T::table_size / -8).to_int(); - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - out_data[j] = selu_table[index]; + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); + + SeluPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + [[intel::fpga_register]] auto datareg = in_data.data[j]; + + if (datareg >= 0) { + out_data.data[j] = scale * datareg; + } else { + int index = (datareg * CONFIG_T::table_size / -8).to_int(); + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data.data[j] = selu_table[index]; + } } - } - res_pipe::write(out_data); + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } // ************************************************* // PReLU Activation // ************************************************* -template void prelu_stream(typename CONFIG_T::param_t alpha) { - constexpr unsigned 
multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; - +template +[[intel::use_stall_enable_clusters]] void prelu_stream(typename CONFIG_T::param_t alpha) { + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; PReLUActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - PReLUPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - if (in_data[j] > 0) - out_data[j] = in_data[j]; - else - out_data[j] = alpha[i * std::tuple_size::value_type>{} + j] * in_data[j]; - } + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); + + PReLUPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + if (in_data.data[j] > 0) + out_data.data[j] = in_data.data[j]; + else + out_data.data[j] = alpha[i * std::tuple_size{} + j] * in_data.data[j]; + } - res_pipe::write(out_data); + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } // ************************************************* // Softplus Activation // ************************************************* -template void softplus_stream() { +template [[intel::use_stall_enable_clusters]] void softplus_stream() { #include "activation_tables/softplus_table.tb" + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; + SoftplusActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - SoftplusPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - [[intel::fpga_register]] int data_round = (in_data[j] * CONFIG_T::table_size / 16).to_int(); - [[intel::fpga_register]] int index = data_round + 8 * CONFIG_T::table_size / 16; - if (index < 0) - index = 0; - else if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - out_data[j] = softplus_table[index]; - } + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); + + SoftplusPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + [[intel::fpga_register]] int data_round = (in_data.data[j] * CONFIG_T::table_size / 16).to_int(); + [[intel::fpga_register]] int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data.data[j] = softplus_table[index]; + } - res_pipe::write(out_data); + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } // 
************************************************* // Softsign Activation // ************************************************* -template void softsign_stream() { +template [[intel::use_stall_enable_clusters]] void softsign_stream() { #include "activation_tables/softsign_table.tb" + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr int MAX_VALUE = 8; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - static const int MAX_VALUE = 8; - + bool keep_going = true; SoftsignActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - SoftsignPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absValue; - ; - if (in_data[j] < 0) { - absValue = -in_data[j]; - } else { - absValue = in_data[j]; - } - ac_int<16> index = (absValue * CONFIG_T::table_size / MAX_VALUE).to_int(); - if (absValue > MAX_VALUE) - index = CONFIG_T::table_size - 1; - if (in_data[j] < 0) { - out_data[j] = - static_cast::value_type::value_type>(-softsign_table[index]); - } else { - out_data[j] = static_cast::value_type::value_type>(softsign_table[index]); + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); + + SoftsignPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + [[intel::fpga_register]] typename DataT::value_type absValue; + ; + if (in_data.data[j] < 0) { + absValue = -in_data.data[j]; + } else { + absValue = in_data.data[j]; + } + ac_int<16> index = (absValue * CONFIG_T::table_size / MAX_VALUE).to_int(); + if (absValue > MAX_VALUE) + index = CONFIG_T::table_size - 1; + if (in_data.data[j] < 0) { + out_data.data[j] = -softsign_table[index]; + } else { + out_data.data[j] = softsign_table[index]; + } } - } - res_pipe::write(out_data); + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } @@ -269,220 +335,246 @@ template void softsign_stre // Softmax Activation // ************************************************* -template void softmax_stable_stream() { +template +[[intel::use_stall_enable_clusters]] void softmax_stable_stream() { #include "activation_tables/exp_table.tb" #include "activation_tables/invert_table.tb" + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; - constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + [[intel::fpga_register]] typename DataT::value_type data_array[std::tuple_size{}]; - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type - data_array[std::tuple_size::value_type>{}]; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; SoftmaxArrayLoop: - [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - 
auto in_pack = data_pipe::read(); - - SoftmaxArrayPackLoop: - #pragma unroll - for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { - data_array[j] = in_pack[j]; - } + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (unsigned i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); - // Find the max and compute all delta(x_i, x_max) - Op_max::value_type::value_type> op_max; - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type x_max = - reduce::value_type::value_type, - std::tuple_size::value_type>{}, - Op_max::value_type::value_type>>(data_array, op_max); - - // For the diffs, use the same type as the input but force rounding and saturation - [[intel::fpga_register]] ac_fixed::value_type::value_type::width, - ExtractPipeType::value_type::value_type::i_width, true, AC_RND, AC_SAT> - d_xi_xmax[std::tuple_size::value_type>{}]; - #pragma unroll - for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { - d_xi_xmax[j] = data_array[j] - x_max; - } + SoftmaxArrayPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size{}; j++) { + data_array[j] = in_data.data[j]; + } - // Calculate all the e^x's - [[intel::fpga_register]] - typename CONFIG_T::exp_table_t exp_res[std::tuple_size::value_type>{}]; - #pragma unroll - for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { - exp_res[j] = - exp_table[softmax_stable_idx_from_real_val::value_type::value_type, - CONFIG_T>(d_xi_xmax[j])]; - } + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + [[intel::fpga_register]] auto x_max = + reduce{}, Op_max>(data_array, + op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + [[intel::fpga_register]] ac_fixed::value_type::value_type::width, + ExtractPipeType::value_type::value_type::i_width, true, AC_RND, + AC_SAT> + d_xi_xmax[std::tuple_size{}]; + #pragma unroll + for (unsigned j = 0; j < std::tuple_size{}; j++) { + d_xi_xmax[j] = data_array[j] - x_max; + } - // Explicitly sum the results with an adder tree. - // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing - Op_add op_add; - [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = - reduce::value_type>{}, - Op_add>(exp_res, op_add); + // Calculate all the e^x's + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[std::tuple_size{}]; + #pragma unroll + for (unsigned j = 0; j < std::tuple_size{}; j++) { + exp_res[j] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[j])]; + } - [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = - invert_table[softmax_stable_idx_from_real_val(exp_sum)]; - typename ExtractPipeType::value_type out_pack; + // Explicitly sum the results with an adder tree. 
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = + reduce{}, Op_add>( + exp_res, op_add); - SoftmaxInvPackLoop: - #pragma unroll - for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_stable_idx_from_real_val(exp_sum)]; - // TODO - Find Quartus-equivalent pragma - // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size{}; j++) { - out_pack[j] = exp_res[j] * inv_exp_sum; - } + // TODO - Find Quartus-equivalent pragma + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation - res_pipe::write(out_pack); + out_data.data[j] = exp_res[j] * inv_exp_sum; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } -template void softmax_latency_stream() { +template +[[intel::use_stall_enable_clusters]] void softmax_latency_stream() { #include "activation_tables/exp_table_latency.tb" #include "activation_tables/invert_table_latency.tb" + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + bool keep_going = true; // Calculate all the e^x's - [[intel::fpga_register]] - typename CONFIG_T::exp_table_t exp_res[std::tuple_size::value_type>{}]; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[std::tuple_size{}]; SoftmaxExpLoop: - [[intel::initiation_interval(pipeline)]] for (unsigned i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - auto in_pack = data_pipe::read(); - - SoftmaxExpPackLoop: - #pragma unroll - for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { - exp_res[j] = exp_table_latency[softmax_latency_idx_from_real_val< - typename ExtractPipeType::value_type::value_type, CONFIG_T>(in_pack[j])]; - } + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (unsigned i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); - // Explicitly sum the results with an adder tree. 
- // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing - Op_add op_add; - [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = - reduce>(exp_res, op_add); - - // Multiply previously calculated exponetials with the reciprocal of the sum - [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = - invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; - - typename ExtractPipeType::value_type out_pack; - SoftmaxInvPackLoop: - #pragma unroll - for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { - // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation - out_pack[j] = exp_res[j] * inv_exp_sum; - } + SoftmaxExpPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size{}; j++) { + exp_res[j] = + exp_table_latency[softmax_latency_idx_from_real_val(in_data.data[j])]; + } + + // Explicitly sum the results with an adder tree. + // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum = + reduce>(exp_res, + op_add); + + // Multiply previously calculated exponetials with the reciprocal of the sum + [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; - res_pipe::write(out_pack); + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size{}; j++) { + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation + out_data.data[j] = exp_res[j] * inv_exp_sum; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } -template void softmax_legacy_stream() { +template +[[intel::use_stall_enable_clusters]] void softmax_legacy_stream() { #include "activation_tables/exp_table_legacy.tb" #include "activation_tables/invert_table_legacy.tb" + using DataT = typename ExtractDataType::value_type>::value_type; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; // Index into the lookup table based on data for exponentials - [[intel::fpga_register]] - typename CONFIG_T::table_t exp_res[std::tuple_size::value_type>{}]; + [[intel::fpga_register]] typename CONFIG_T::table_t exp_res[std::tuple_size{}]; [[intel::fpga_register]] typename CONFIG_T::table_t exp_diff_res; - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type - data_cache[std::tuple_size::value_type>{}]; + [[intel::fpga_register]] typename DataT::value_type data_cache[std::tuple_size{}]; SoftmaxInitLoop: - [[intel::initiation_interval(1)]] for (unsigned s = 0; - s < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - s++) { - auto in_pack = data_pipe::read(); - - SoftmaxInitPackLoop: - #pragma unroll - for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { - data_cache[j] = in_pack[j]; - exp_res[j] = 0; - } + [[intel::initiation_interval(1)]] while (keep_going) { + for (unsigned s = 0; s < CONFIG_T::n_in / std::tuple_size{}; s++) { + auto in_data = data_pipe::read(); - SoftmaxExpLoop: - #pragma unroll - for (int i = 0; i < std::tuple_size::value_type>{}; i++) { - SoftmaxExpInner: + SoftmaxInitPackLoop: #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - if (i == j) { - exp_diff_res = 1; - } else { - int data_round = ((data_cache[j] - data_cache[i]) * 
CONFIG_T::table_size / 16).to_int(); - int index = data_round + 8 * CONFIG_T::table_size / 16; - if (index < 0) - index = 0; - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - exp_diff_res = exp_table_legacy[index]; + for (unsigned j = 0; j < std::tuple_size{}; j++) { + data_cache[j] = in_data.data[j]; + exp_res[j] = 0; + } + + SoftmaxExpLoop: + #pragma unroll + for (int i = 0; i < std::tuple_size{}; i++) { + SoftmaxExpInner: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + if (i == j) { + exp_diff_res = 1; + } else { + int data_round = ((data_cache[j] - data_cache[i]) * CONFIG_T::table_size / 16).to_int(); + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table_legacy[index]; + } + exp_res[i] += exp_diff_res; } - exp_res[i] += exp_diff_res; } - } - typename ExtractPipeType::value_type out_pack; - SoftmaxInvPackLoop: - #pragma unroll - for (unsigned j = 0; j < std::tuple_size::value_type>{}; j++) { - int exp_res_index = (exp_res[j] * CONFIG_T::table_size / 64).to_int(); - if (exp_res_index < 0) - exp_res_index = 0; - if (exp_res_index > CONFIG_T::table_size - 1) - exp_res_index = CONFIG_T::table_size - 1; - out_pack[j] = - static_cast::value_type::value_type>(invert_table_legacy[exp_res_index]); - } + SoftmaxInvPackLoop: + #pragma unroll + for (unsigned j = 0; j < std::tuple_size{}; j++) { + int exp_res_index = (exp_res[j] * CONFIG_T::table_size / 64).to_int(); + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + out_data.data[j] = static_cast(invert_table_legacy[exp_res_index]); + } - res_pipe::write(out_pack); + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } -template void softmax_argmax_stream() { - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; +template +[[intel::use_stall_enable_clusters]] void softmax_argmax_stream() { + using DataT = typename ExtractDataType::value_type>::value_type; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - #pragma unroll - for (int i = 0; i < std::tuple_size::value_type>{}; i++) { - out_data[i] = static_cast::value_type::value_type>(0); - } + bool keep_going = true; - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type maximum = in_data[0]; - [[intel::fpga_register]] int idx = 0; + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); - [[intel::initiation_interval(1)]] for (int i = 1; - i < std::tuple_size::value_type>{}; i++) { - if (in_data[i] > maximum) { - maximum = in_data[i]; - idx = i; + #pragma unroll + for (int i = 0; i < std::tuple_size{}; i++) { + out_data.data[i] = 0; } - } - out_data[idx] = static_cast::value_type::value_type>(1); - res_pipe::write(out_data); + [[intel::fpga_register]] auto maximum = in_data.data[0]; + [[intel::fpga_register]] int idx = 0; + + [[intel::initiation_interval(1)]] for (int i = 1; i < std::tuple_size{}; i++) { + if (in_data.data[i] > maximum) { + maximum = in_data.data[i]; + idx = i; + } + } + + out_data.data[idx] 
= 1; + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } -template void softmax_stream() { +template [[intel::use_stall_enable_clusters]] void softmax_stream() { switch (CONFIG_T::implementation) { case softmax_implementation::latency: softmax_latency_stream(); @@ -505,91 +597,101 @@ template void softmax_strea // ************************************************* // TanH Activation // ************************************************* -template void dense_tanh_stream() { +template [[intel::use_stall_enable_clusters]] void dense_tanh_stream() { #include "activation_tables/tanh_table.tb" - static const int MAX_VALUE = 4; + constexpr int MAX_VALUE = 4; + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + bool keep_going = true; TanHActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - TanHPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absoluteValue; - - if (in_data[j] < 0) - absoluteValue = (-1) * in_data[j]; - else - absoluteValue = in_data[j]; - - [[intel::fpga_register]] int index; - if (absoluteValue <= MAX_VALUE) - index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); - else - index = CONFIG_T::table_size - 1; - - if (in_data[j] > 0) - out_data[j] = tanh_table[index]; - else - out_data[j] = -tanh_table[index]; - } + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + + auto in_data = data_pipe::read(); + + TanHPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + [[intel::fpga_register]] typename DataT::value_type absoluteValue; + + if (in_data.data[j] < 0) + absoluteValue = (-1) * in_data.data[j]; + else + absoluteValue = in_data.data[j]; + + [[intel::fpga_register]] int index; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; - res_pipe::write(out_data); + if (in_data.data[j] > 0) + out_data.data[j] = tanh_table[index]; + else + out_data.data[j] = -tanh_table[index]; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } // ************************************************* // Sigmoid Activation // ************************************************* -template void sigmoid_stream() { +template [[intel::use_stall_enable_clusters]] void sigmoid_stream() { #include "activation_tables/sigmoid_table.tb" - static const int MAX_VALUE = 8; + constexpr int MAX_VALUE = 8; + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr 
unsigned pipeline = std::tuple_size{} / multiplier_limit; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + bool keep_going = true; SigmoidActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - SigmoidPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - [[intel::fpga_register]] typename ExtractPipeType::value_type::value_type absoluteValue; - - if (in_data[j] < 0) - absoluteValue = (-1) * in_data[j]; - else - absoluteValue = in_data[j]; - - [[intel::fpga_register]] int index; - if (absoluteValue <= MAX_VALUE) - index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); - else - index = CONFIG_T::table_size - 1; - - if (in_data[j] > 0) - out_data[j] = sigmoid_table[index]; - else - out_data[j] = 1 - sigmoid_table[index]; - } + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + auto in_data = data_pipe::read(); + + SigmoidPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + [[intel::fpga_register]] typename DataT::value_type absoluteValue; + + if (in_data.data[j] < 0) + absoluteValue = (-1) * in_data.data[j]; + else + absoluteValue = in_data.data[j]; + + [[intel::fpga_register]] int index; + if (absoluteValue <= MAX_VALUE) + index = (absoluteValue * (CONFIG_T::table_size / MAX_VALUE)).to_int(); + else + index = CONFIG_T::table_size - 1; + + if (in_data.data[j] > 0) + out_data.data[j] = sigmoid_table[index]; + else + out_data.data[j] = 1 - sigmoid_table[index]; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); - res_pipe::write(out_data); + keep_going = !in_data.eop; + } } } @@ -597,113 +699,144 @@ template void sigmoid_strea // Hard sigmoid Activation // ************************************************* // Note - Theano and Tensorflow might have different definitions for hard sigmoid; could provide two implementations -template void hard_sigmoid_stream() { +template +[[intel::use_stall_enable_clusters]] void hard_sigmoid_stream() { + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + bool keep_going = true; HardSigmoidActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - HardSigmoidPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - [[intel::fpga_register]] auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; - if (datareg > 1) - 
datareg = 1; - else if (datareg < 0) - datareg = 0; - out_data[j] = datareg; - } + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + + auto in_data = data_pipe::read(); + + HardSigmoidPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + [[intel::fpga_register]] auto datareg = CONFIG_T::slope * in_data.data[j] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + out_data.data[j] = datareg; + } - res_pipe::write(out_data); + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } -template void hard_tanh_stream() { +template [[intel::use_stall_enable_clusters]] void hard_tanh_stream() { + using DataT = typename ExtractDataType::value_type>::value_type; + constexpr unsigned multiplier_limit = DIV_ROUNDUP(std::tuple_size{}, CONFIG_T::reuse_factor); + constexpr unsigned pipeline = std::tuple_size{} / multiplier_limit; + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - constexpr unsigned multiplier_limit = - DIV_ROUNDUP(std::tuple_size::value_type>{}, CONFIG_T::reuse_factor); - constexpr unsigned pipeline = std::tuple_size::value_type>{} / multiplier_limit; + bool keep_going = true; HardSigmoidActLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; - i < CONFIG_T::n_in / - std::tuple_size::value_type>{}; - i++) { - - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - HardSigmoidPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; - if (sigmoid > 1) - sigmoid = 1; - else if (sigmoid < 0) - sigmoid = 0; - out_data[j] = 2 * sigmoid - 1; - } + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { - res_pipe::write(out_data); + auto in_data = data_pipe::read(); + + HardSigmoidPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + auto sigmoid = CONFIG_T::slope * in_data.data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data.data[j] = 2 * sigmoid - 1; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } // ************************************************* // Binary TanH Activation // ************************************************* -template void binary_tanh_stream() { +template +[[intel::use_stall_enable_clusters]] void binary_tanh_stream() { + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; + BinaryTanHActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - - [[intel::fpga_register]] auto in_data = data_pipe::read(); - [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - - BinaryTanHPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - if (in_data[j] > 0) - out_data[j] = static_cast::value_type::value_type>(1); - else - out_data[j] = static_cast::value_type::value_type>(-1); - } + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / 
std::tuple_size{}; i++) { + + [[intel::fpga_register]] auto in_data = data_pipe::read(); + + BinaryTanHPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + if (in_data.data[j] > 0) + out_data.data[j] = 1; + else + out_data.data[j] = -1; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); - res_pipe::write(out_data); + keep_going = !in_data.eop; + } } } // ************************************************* // Ternary TanH Activation // ************************************************* -template void ternary_tanh_stream() { +template +[[intel::use_stall_enable_clusters]] void ternary_tanh_stream() { + using ResT = typename ExtractDataType::value_type>::value_type; + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; + TernaryTanHActLoop: - [[intel::initiation_interval( - 1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size::value_type>{}; i++) { - - [[intel::fpga_register]] auto in_data = data_pipe::read(); - [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; - - TernaryTanHPackLoop: - #pragma unroll - for (int j = 0; j < std::tuple_size::value_type>{}; j++) { - if (in_data[j] > 1) - out_data[j] = static_cast::value_type::value_type>(1); - else if (in_data[j] <= -1) - out_data[j] = static_cast::value_type::value_type>(-1); - else - out_data[j] = static_cast::value_type::value_type>(0); - } + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / std::tuple_size{}; i++) { + + [[intel::fpga_register]] auto in_data = data_pipe::read(); - res_pipe::write(out_data); + TernaryTanHPackLoop: + #pragma unroll + for (int j = 0; j < std::tuple_size{}; j++) { + if (in_data.data[j] > 1) + out_data.data[j] = 1; + else if (in_data.data[j] <= -1) + out_data.data[j] = -1; + else + out_data.data[j] = 0; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; + } } } diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h index 128b3ac1a4..77597986c8 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_batchnorm_stream.h @@ -13,33 +13,43 @@ namespace nnet { // **************************************************** template void normalize_stream(typename CONFIG_T::scale_t scale, typename CONFIG_T::bias_t bias) { + using DataT = typename ExtractDataType::value_type>::value_type; + using ResT = typename ExtractDataType::value_type>::value_type; constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); constexpr unsigned pipeline = CONFIG_T::n_in / multiplier_limit; - constexpr auto datasize = std::tuple_size::value_type>{}; - CONFIG_T::template product::value_type::value_type, - typename CONFIG_T::scale_t::value_type>::limit(multiplier_limit); + constexpr auto datasize = std::tuple_size{}; + CONFIG_T::template product::limit(multiplier_limit); + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; BatchNormLoop: - [[intel::initiation_interval(pipeline)]] for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { - auto in_data = data_pipe::read(); - typename ExtractPipeType::value_type out_data; - - BatchNormpack: - #pragma unroll - for (int j = 0; j < datasize; j++) { - int norm_index; - if (CONFIG_T::n_filt == -1) - 
norm_index = i * datasize + j; - else - norm_index = j % CONFIG_T::n_filt; - out_data[j] = - CONFIG_T::template product::value_type::value_type, - typename CONFIG_T::scale_t::value_type>::product(in_data[j], scale[norm_index]) + - bias[norm_index]; + [[intel::initiation_interval(pipeline)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { + auto in_data = data_pipe::read(); + + BatchNormpack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * datasize + j; + else + norm_index = j % CONFIG_T::n_filt; + out_data.data[j] = + CONFIG_T::template product::product( + in_data.data[j], scale[norm_index]) + + bias[norm_index]; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; } - - res_pipe::write(out_data); } } @@ -48,57 +58,79 @@ void normalize_stream(typename CONFIG_T::scale_t scale, typename CONFIG_T::bias_ // **************************************************** template void normalize_binary_tanh_stream(typename CONFIG_T::threshold_t threshold) { - constexpr auto datasize = std::tuple_size::value_type>{}; + using DataT = typename ExtractDataType::value_type>::value_type; + using ResT = typename ExtractDataType::value_type>::value_type; + constexpr auto datasize = std::tuple_size{}; + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; BinaryNormLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { - auto in_data = data_pipe::read(); - nnet::array, CONFIG_T::n_scale_bias> out_data; - - BatchNormPack: - #pragma unroll - for (int j = 0; j < datasize; j++) { - int norm_index; - if (CONFIG_T::n_filt == -1) - norm_index = i * datasize + j; - else - norm_index = j % CONFIG_T::n_filt; - - out_data[j] = (in_data[j] >= threshold[norm_index]) ? 1 : 0; + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { + auto in_data = data_pipe::read(); + + BatchNormPack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * datasize + j; + else + norm_index = j % CONFIG_T::n_filt; + + out_data.data[j] = (in_data.data[j] >= threshold[norm_index]) ? 
1 : 0; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; } - - res_pipe::write(out_data); } } template void normalize_ternary_tanh_stream(typename CONFIG_T::threshold_hi_t threshold_hi, typename CONFIG_T::threshold_lo_t threshold_lo) { - constexpr auto datasize = std::tuple_size::value_type>{}; + using DataT = typename ExtractDataType::value_type>::value_type; + using ResT = typename ExtractDataType::value_type>::value_type; + constexpr auto datasize = std::tuple_size{}; + + [[intel::fpga_register]] typename ExtractPipeType::value_type out_data; + + bool keep_going = true; TernaryNormLoop: - [[intel::initiation_interval(1)]] for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { - auto in_data = data_pipe::read(); - nnet::array, CONFIG_T::n_scale_bias> out_data; - - BatchNormPack: - #pragma unroll - for (int j = 0; j < datasize; j++) { - int norm_index; - if (CONFIG_T::n_filt == -1) - norm_index = i * datasize + j; - else - norm_index = j % CONFIG_T::n_filt; - - if (in_data[j] > threshold_hi[norm_index]) - out_data[j] = 1; - else if (in_data[j] <= threshold_lo[norm_index]) - out_data[j] = -1; - else - out_data[j] = 0; + [[intel::initiation_interval(1)]] while (keep_going) { + for (int i = 0; i < CONFIG_T::n_in / datasize; i++) { + auto in_data = data_pipe::read(); + + BatchNormPack: + #pragma unroll + for (int j = 0; j < datasize; j++) { + int norm_index; + if (CONFIG_T::n_filt == -1) + norm_index = i * datasize + j; + else + norm_index = j % CONFIG_T::n_filt; + + if (in_data.data[j] > threshold_hi[norm_index]) + out_data.data[j] = 1; + else if (in_data.data[j] <= threshold_lo[norm_index]) + out_data.data[j] = -1; + else + out_data.data[j] = 0; + } + + out_data.sop = in_data.sop; + out_data.eop = in_data.eop; + res_pipe::write(out_data); + + keep_going = !in_data.eop; } - - res_pipe::write(out_data); } } diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h new file mode 100644 index 0000000000..4a705ca199 --- /dev/null +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_data_movement.h @@ -0,0 +1,148 @@ +#ifndef NNET_DATA_MOVEMENT_H +#define NNET_DATA_MOVEMENT_H + +#include +#include + +// This file defines the methods to transfer the data to the kernel. In the HLS flow, +// these are really part of the testbench. However, in the accelerator (BSP) flow, they are +// actual kernels that are deployed in hardware. 
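// ---------------------------------------------------------------------------------------------
// Illustrative host-side sketch (not part of the generated project): one way the DMA kernels
// defined in this file could be launched around the top-level kernel in the accelerator (BSP)
// flow. The pipe names (InputPipe/OutputPipe), the float element type, the template-argument
// order and the beat counts are assumptions made only for this example; the generated test
// bench is the reference. Assumes firmware/myproject.h and this header are included.
template <class InputPipe, class OutputPipe>
void run_inference_example(sycl::queue &q, float *in, float *out, size_t n_in_beats, size_t n_out_beats) {
    // `in`/`out` are assumed to be USM host allocations (e.g. sycl::malloc_host<float>) so the
    // DMA kernels can reach them through host pointers when IS_BSP is defined.
    q.single_task(nnet::DMA_convert_data<float, InputPipe>{in, n_in_beats});          // feed the input pipe
    q.single_task(MyProject{});                                                       // run the network
    q.single_task(nnet::DMA_convert_data_back<float, OutputPipe>{out, n_out_beats});  // drain the output pipe
    q.wait();
}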
+ +namespace nnet { + +////////////////////////////////////////////////////////////////////////////// +// These are the simple, testbench-only versions +////////////////////////////////////////////////////////////////////////////// +template void convert_data(sycl::queue &q, srcType *src) { + using PipeDataType = typename nnet::ExtractPipeType::value_type; + using SrcDataType = typename nnet::ExtractDataType::value_type; + constexpr auto dstTypeSize = std::tuple_size{}; + for (size_t i = 0; i < SIZE / dstTypeSize; i++) { + PipeDataType packet; + for (size_t j = 0; j < dstTypeSize; j++) { + packet.data[j] = src[i * dstTypeSize + j]; + } + packet.sop = (i == 0); + packet.eop = (i == (SIZE / dstTypeSize - 1)); + dest_pipe::write(q, packet); + } +} + +template void convert_data_back(sycl::queue &q, dstType *dst) { + using PipeDataType = typename nnet::ExtractPipeType::value_type; + using SrcDataType = typename nnet::ExtractDataType::value_type; + constexpr auto srcTypeSize = std::tuple_size{}; + for (size_t i = 0; i < SIZE / srcTypeSize; i++) { + auto packet = src_pipe::read(q); + for (size_t j = 0; j < srcTypeSize; j++) { + dst[i * srcTypeSize + j] = packet.data[j].to_double(); + } + } +} + +////////////////////////////////////////////////////////////////////////////// +// The ones below can be used both in testbenches and in the accelerator flow +////////////////////////////////////////////////////////////////////////////// +#if !defined(IS_BSP) +// Definition for buffer locations for Avalon MM host. +inline constexpr unsigned kInputBufferLocation = 0; +inline constexpr unsigned kOutputBufferLocation = 1; +#endif + +// Implementation of a direct memory access kernel. Move data from source, convert, +// and send to the sink. Adaptive to SYCL HLS and hardware acceleration flow. +template struct DMA_convert_data { +#if !defined(IS_BSP) + // When targeting a device family, we instantiate an Avalon Memory Mapped Host for + // data transaction between host and the DMA kernel during emulation and simulation. + sycl::ext::oneapi::experimental::annotated_arg< + src_T *, + decltype(sycl::ext::oneapi::experimental::properties{ + sycl::ext::intel::experimental::latency<0>, sycl::ext::intel::experimental::dwidth<16>, + sycl::ext::intel::experimental::buffer_location, + sycl::ext::intel::experimental::read_write_mode_read, sycl::ext::intel::experimental::wait_request_requested})> +#else + // When targeting oneAPI BSP, we can use USM pointer to access host memory. + src_T *const +#endif + src; + size_t num_iteration; + + [[intel::kernel_args_restrict]] void operator()() const { + +#if defined(IS_BSP) + // Access data using host pointer. + sycl::ext::intel::host_ptr src_ptr(src); +#else + // Host allocation is not supported when targeting an FPGA family or part number. + src_T *src_ptr(src); +#endif + // First, extract the PipeDataT from the pipe + using PipeDataType = typename nnet::ExtractPipeType::value_type; + // Then, extract the DataT from StreamingBeat + using DstDataType = typename nnet::ExtractDataType::value_type; + constexpr auto dstTypeSize = std::tuple_size{}; + + [[intel::fpga_register]] typename nnet::ExtractPipeType::value_type packet; + + // Keep sending data to the input layer and keep the kernels running. + for (size_t i = 0; i < num_iteration; i++) { + #pragma unroll + for (size_t j = 0; j < dstTypeSize; j++) { + packet.data[j] = src_ptr[i * dstTypeSize + j]; + } + packet.sop = (i == 0); + // Assert end-of-packet signal after the last iteration. 
+ // All down-stream kernels will stop seeing eop. + packet.eop = (i == (num_iteration - 1)); + dest_pipe::write(packet); + } + } +}; + +// Symmetrical to the DMA_convert_data above, this DMA drains the output pipe and +// writes result to memory. +template struct DMA_convert_data_back { +#if !defined(IS_BSP) + // Without BSP, instantiate an Avalon Memory Mapped Host to write to host. + sycl::ext::oneapi::experimental::annotated_arg< + dst_T *, + decltype(sycl::ext::oneapi::experimental::properties{ + sycl::ext::intel::experimental::latency<0>, sycl::ext::intel::experimental::dwidth<16>, + sycl::ext::intel::experimental::buffer_location, + sycl::ext::intel::experimental::read_write_mode_write, sycl::ext::intel::experimental::wait_request_requested})> +#else + // USM pointer, otherwise. + dst_T *const +#endif + dst; + size_t num_iteration; + + [[intel::kernel_args_restrict]] void operator()() const { +#if defined(IS_BSP) + sycl::ext::intel::host_ptr dst_ptr(dst); +#else + dst_T *dst_ptr(dst); +#endif + // First, extract the PipeDataT from the pipe + using PipeDataType = typename nnet::ExtractPipeType::value_type; + // Then, extract the DataT from StreamingBeat + using SrcDataType = typename nnet::ExtractDataType::value_type; + constexpr auto srcTypeSize = std::tuple_size{}; + + [[intel::fpga_register]] typename nnet::ExtractPipeType::value_type packet; + + // Drain the output pipe and write result to memory. + for (size_t i = 0; i < num_iteration; i++) { + packet = src_pipe::read(); + #pragma unroll + for (size_t j = 0; j < srcTypeSize; j++) { + dst_ptr[i * srcTypeSize + j] = static_cast(packet.data[j].to_double()); + } + } + } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h index 92c9adc3bb..c1c6d726e8 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h @@ -7,15 +7,34 @@ namespace nnet { -// Note: DataPack logic removed, at least in the initial version +// Restartable streaming kernel implementation. +// Computation is carried out in a while-1 loop as long as there is valid input. +// The loop breaks when the end-of-packet signal is asserted by upstream task. 
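For orientation, the sketch below shows the restartable pattern that ``dense_resource_stream`` and the other streaming layers in this patch follow; it uses placeholder pipes and simply copies the payload where a real layer would compute.

.. code-block:: cpp

   // Generic restartable streaming layer (illustrative only).
   template <class data_pipe, class res_pipe> void passthrough_stream() {
       using BeatT = typename nnet::ExtractPipeType<res_pipe>::value_type;
       [[intel::fpga_register]] BeatT out_beat;

       bool keep_going = true;
       [[intel::initiation_interval(1)]] while (keep_going) {
           auto in_beat = data_pipe::read(); // one StreamingBeat per read
           out_beat.data = in_beat.data;     // per-layer computation goes here
           out_beat.sop = in_beat.sop;       // forward start-of-packet
           out_beat.eop = in_beat.eop;       // forward end-of-packet
           res_pipe::write(out_beat);
           keep_going = !in_beat.eop;        // stop after passing on the last beat
       }
   }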
template -void dense_resource_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) { +[[intel::use_stall_enable_clusters]] void dense_resource_stream(const typename CONFIG_T::weight_t weights, + const typename CONFIG_T::bias_t biases) { + using namespace nnet; + using DataT = typename ExtractDataType::value_type>::value_type; + using ResT = typename ExtractDataType::value_type>::value_type; - [[intel::fpga_register]] typename ExtractPipeType::value_type res; - [[intel::fpga_register]] auto data = data_pipe::read(); - dense_resource::value_type, typename ExtractPipeType::value_type, - CONFIG_T>(data, res, weights, biases); - res_pipe::write(res); + [[intel::fpga_register]] typename ExtractPipeType::value_type resbeat; + + bool keep_going = true; + bool did_read_input; + [[intel::initiation_interval(1)]] while (keep_going) { + did_read_input = false; + [[intel::fpga_register]] auto databeat = data_pipe::read(did_read_input); + + if (did_read_input) { + dense_resource(databeat.data, resbeat.data, weights, biases); + + resbeat.sop = databeat.sop; + resbeat.eop = databeat.eop; + + res_pipe::write(resbeat); + keep_going = !databeat.eop; + } + } } } // namespace nnet diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h index c7af2e7a68..e5b451655a 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_helpers.h @@ -12,27 +12,6 @@ namespace nnet { -template void convert_data(sycl::queue &q, srcType *src) { - constexpr auto dstTypeSize = std::tuple_size::value_type>{}; - for (size_t i = 0; i < SIZE / dstTypeSize; i++) { - typename ExtractPipeType::value_type ctype; - for (size_t j = 0; j < dstTypeSize; j++) { - ctype[j] = src[i * dstTypeSize + j]; - } - dest_pipe::write(q, ctype); - } -} - -template void convert_data_back(sycl::queue &q, dstType *dst) { - constexpr auto srcTypeSize = std::tuple_size::value_type>{}; - for (size_t i = 0; i < SIZE / srcTypeSize; i++) { - auto ctype = src_pipe::read(q); - for (size_t j = 0; j < srcTypeSize; j++) { - dst[i * srcTypeSize + j] = ctype[j].to_double(); - } - } -} - extern bool trace_enabled; extern std::map *trace_outputs; extern size_t trace_type_size; diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h index 8cf883c1d5..a35bba17bf 100644 --- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h +++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h @@ -8,6 +8,8 @@ #include #include +#include // Streaming Beat and pipe properties. + namespace nnet { // Define the pipe type that we use @@ -34,6 +36,15 @@ struct ExtractPipeType struct ExtractDataType { typedef T value_type; }; + +// Specialization on oneAPI StreamingBeat type. 
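The specialization that follows strips the ``StreamingBeat`` wrapper so kernels can recover the underlying payload type. Combined with ``ExtractPipeType``, the typical idiom is (the pipe name is a placeholder):

.. code-block:: cpp

   // Peel the beat type off a pipe, then the payload type off the beat (illustrative).
   using BeatT = typename nnet::ExtractPipeType<input_pipe>::value_type; // StreamingBeat wrapper
   using DataT = typename nnet::ExtractDataType<BeatT>::value_type;      // nnet::array payload
   constexpr auto datasize = std::tuple_size<DataT>{};                   // elements per beat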
+template +struct ExtractDataType> { + typedef DataT value_type; +}; + /* * HLS Shift Register Implementation * To verify a shift register is used in hardware, go to report.html > Area Analysis of System diff --git a/hls4ml/templates/oneapi/myproject_bridge.cpp b/hls4ml/templates/oneapi/myproject_bridge.cpp index ddad1d054b..fa73db7c2a 100644 --- a/hls4ml/templates/oneapi/myproject_bridge.cpp +++ b/hls4ml/templates/oneapi/myproject_bridge.cpp @@ -2,7 +2,7 @@ #define MYPROJECT_BRIDGE_H_ #include "firmware/myproject.h" -#include "firmware/nnet_utils/nnet_helpers.h" +#include "firmware/nnet_utils/nnet_data_movement.h" #include #include diff --git a/hls4ml/templates/oneapi/myproject_test.cpp b/hls4ml/templates/oneapi/myproject_test.cpp index 82fb60d2f8..f0de1875fa 100644 --- a/hls4ml/templates/oneapi/myproject_test.cpp +++ b/hls4ml/templates/oneapi/myproject_test.cpp @@ -4,9 +4,11 @@ #include #include #include +#include #include #include "firmware/myproject.h" +#include "firmware/nnet_utils/nnet_data_movement.h" #include "firmware/parameters.h" #include @@ -20,13 +22,70 @@ #define CHECKPOINT 5000 +#if not defined(IS_BSP) +using sycl::ext::intel::experimental::property::usm::buffer_location; +#endif + +// Functions that reads input and prediction data from files. +// Returns `true` if files are read successfully and not empty. +// Returns `false` otherwise. +bool prepare_data_from_file(std::string &fin_path, std::string &fpr_path, std::vector> &inputs, + std::vector> &predictions) { + // load input data from text file + std::ifstream fin(fin_path.c_str()); + // load predictions from text file + std::ifstream fpr(fpr_path.c_str()); + + std::string iline; + std::string pline; + + if (fin.is_open() && fpr.is_open()) { + size_t num_iterations = 0; + + // Prepare input data from file. Load predictions from file. 
+ for (; std::getline(fin, iline) && std::getline(fpr, pline); num_iterations++) { + if (num_iterations % CHECKPOINT == 0) { + std::cout << "Processing input " << num_iterations << std::endl; + } + + std::vector in; + std::vector pr; + float current; + + std::stringstream ssin(iline); + while (ssin >> current) { + in.push_back(current); + } + + std::stringstream sspred(pline); + while (sspred >> current) { + pr.push_back(current); + } + + std::copy(pr.cbegin(), pr.cend(), predictions.back().begin()); + std::copy(in.cbegin(), in.cend(), inputs.back().begin()); + } + fin.close(); + fpr.close(); + if (inputs.empty()) + return false; + else + return true; + } else { + return false; + } +} + int main(int argc, char **argv) { #if FPGA_SIMULATOR +#define NUM_ITERATIONS 5 auto selector = sycl::ext::intel::fpga_simulator_selector_v; #elif FPGA_HARDWARE +#define NUM_ITERATIONS 100 auto selector = sycl::ext::intel::fpga_selector_v; #else // #if FPGA_EMULATOR +#define NUM_ITERATIONS 10 auto selector = sycl::ext::intel::fpga_emulator_selector_v; #endif @@ -44,93 +103,108 @@ int main(int argc, char **argv) { std::cout << "Running on device: " << device.get_info().c_str() << std::endl; - // load input data from text file - std::ifstream fin("tb_data/tb_input_features.dat"); - // load predictions from text file - std::ifstream fpr("tb_data/tb_output_predictions.dat"); - + std::string INPUT_FILE = "tb_data/tb_input_features.dat"; + std::string PRED_FILE = "tb_data/tb_output_predictions.dat"; std::string RESULTS_LOG = "tb_data/results.log"; std::ofstream fout(RESULTS_LOG); - std::string iline; - std::string pline; - - if (fin.is_open() && fpr.is_open()) { - std::vector> predictions; - unsigned int iteration = 0; - for (; std::getline(fin, iline) && std::getline(fpr, pline); iteration++) { - if (iteration % CHECKPOINT == 0) { - std::cout << "Processing input " << iteration << std::endl; - } + // Allocate vectors on stack to hold data from files temporarily. + std::vector> inputs; + std::vector> predictions; + bool file_valid = prepare_data_from_file(INPUT_FILE, PRED_FILE, inputs, predictions); + unsigned int num_iterations; + if (file_valid) { + num_iterations = inputs.size(); + } else { + num_iterations = NUM_ITERATIONS; + } - std::vector in; - std::vector pr; - float current; + // hls-fpga-machine-learning insert runtime contant - std::stringstream ssin(iline); - while (ssin >> current) { - in.push_back(current); - } + try { +#if defined(IS_BSP) + // Allocate host memory if BSP is in use. + float *vals = sycl::malloc_host(kInputSz, q); + if (vals == nullptr) { + std::cerr << "ERROR: host allocation failed for input\n"; + fout.close(); + return 1; + } + float *outputs = sycl::malloc_host(kOutputSz, q); + if (outputs == nullptr) { + std::cerr << "ERROR: host allocation failed for output\n"; + fout.close(); + return 1; + } +#else + float *vals = + sycl::malloc_shared(kInputSz, q, sycl::property_list{buffer_location(nnet::kInputBufferLocation)}); + float *outputs = + sycl::malloc_shared(kOutputSz, q, sycl::property_list{buffer_location(nnet::kOutputBufferLocation)}); +#endif - std::stringstream sspred(pline); - while (sspred >> current) { - pr.push_back(current); - } + if (file_valid) { + // Start always-run streaming kernel here, instead of inside a loop. 
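            // Design note (illustrative): the removed code enqueued MyProject once per input
            // sample; with the restartable kernels the network and the two DMA movers are each
            // launched a single time, and the eop beat injected by DMA_convert_data is what
            // eventually stops MyProject and every downstream layer.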
+ q.single_task(MyProject{}); // hls-fpga-machine-learning insert data - q.single_task(MyProject{}); - // hls-fpga-machine-learning convert output - std::copy(pr.cbegin(), pr.cend(), predictions.back().begin()); - - for (auto outval : outputs) { - fout << outval << " "; - } - fout << std::endl; - if (iteration % CHECKPOINT == 0) { - std::cout << "Predictions" << std::endl; - // hls-fpga-machine-learning insert predictions - for (auto predval : pr) { - std::cout << predval << " "; + // Print output from kernel and from prediction file. + for (int i = 0; i < num_iterations; i++) { + for (int j = 0; j < kOutLayerSize; j++) { + fout << outputs[i * kOutLayerSize + j] << " "; } - std::cout << std::endl; - std::cout << "Quantized predictions" << std::endl; - // hls-fpga-machine-learning insert quantized - for (auto outval : outputs) { - std::cout << outval << " "; + fout << std::endl; + if (i % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + for (auto predval : predictions[i]) { + std::cout << predval << " "; + } + std::cout << std::endl; + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + for (int j = 0; j < kOutLayerSize; j++) { + std::cout << outputs[i * kOutLayerSize + j] << " "; + } + std::cout << std::endl; } - std::cout << std::endl; } - } - fin.close(); - fpr.close(); - } else { - const unsigned int num_iterations = 10; - std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations - << " invocations." << std::endl; - - // hls-fpga-machine-learning insert top-level-function - for (int i = 0; i < num_iterations; i++) { - // hls-fpga-machine-learning insert zero + } else { + std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations + << " invocations." << std::endl; q.single_task(MyProject{}); + // hls-fpga-machine-learning insert top-level-function + // hls-fpga-machine-learning insert zero // hls-fpga-machine-learning convert output - for (auto outval : outputs) { - std::cout << outval << " "; - } - std::cout << std::endl; - - for (auto outval : outputs) { - fout << outval << " "; + for (int i = 0; i < num_iterations; i++) { + for (int j = 0; j < kOutLayerSize; j++) { + std::cout << outputs[i * kOutLayerSize + j] << " "; + fout << outputs[i * kOutLayerSize + j] << " "; + } + std::cout << std::endl; + fout << std::endl; } - fout << std::endl; } + sycl::free(vals, q); + sycl::free(outputs, q); + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + } catch (sycl::exception const &e) { + // Catches exceptions in the host code. + std::cerr << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! 
+ if (e.code().value() == CL_DEVICE_NOT_FOUND) { + std::cerr << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cerr << "Run sys_check in the oneAPI root directory to verify.\n"; + std::cerr << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); } - q.wait(); - - fout.close(); - std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; - return 0; } diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py index e93f8b5ca3..98a62a8c4a 100644 --- a/hls4ml/writer/oneapi_writer.py +++ b/hls4ml/writer/oneapi_writer.py @@ -137,8 +137,11 @@ def write_project_cpp(self, model): elif '// hls-fpga-machine-learning read in' in line: newline = line if io_type == 'io_parallel': + restartable_kernel_loop = f"bool keep_going = true;\n\n" f"{indent}while (keep_going) {{\n" + newline += indent + restartable_kernel_loop for inp in model_inputs: - newline += indent + f'auto {inp.name} = {inp.pipe_name}::read();\n' + newline += indent * 2 + f'auto {inp.name}_beat = {inp.pipe_name}::read();\n' + newline += indent * 2 + f'auto {inp.name} = {inp.name}_beat.data;\n' # for streaming we don't need to read it in # Insert weights @@ -151,16 +154,21 @@ def write_project_cpp(self, model): # Insert task sequences elif '// hls-fpga-machine-learning declare task sequences' in line: - newline = line if io_type == 'io_stream': # only need this for io_stream + newline = line for layer in model.get_layers(): ts = layer.get_attr('tast_sequence_cpp') if ts: newline += ' ' + ts + '\n' + else: + newline = indent + line # Neural net instantiation elif '// hls-fpga-machine-learning insert layers' in line: - newline = line + '\n' + if io_type == 'io_parallel': + newline = indent + line + '\n' + else: + newline = line + '\n' for layer in model.get_layers(): if io_type != 'io_stream': vars = layer.get_variables() @@ -168,14 +176,14 @@ def write_project_cpp(self, model): if var not in model_inputs: def_cpp = var.definition_cpp() if def_cpp is not None: - newline += ' ' + def_cpp + ';\n' + newline += indent * 2 + def_cpp + ';\n' func = ( layer.get_attr('function_cpp') if io_type == 'io_parallel' else layer.get_attr('stream_function_cpp') ) if func: - newline += ' ' + func + '\n' + newline += (indent * 2 if io_type == 'io_parallel' else indent) + func + '\n' if model.config.trace_output and layer.get_attr('trace', False): newline += '#ifndef HLS_SYNTHESIS\n' for var in vars: @@ -188,8 +196,17 @@ def write_project_cpp(self, model): elif '// hls-fpga-machine-learning return' in line: newline = line if io_type == 'io_parallel': + newline = indent + newline for out in model_outputs: - newline += indent + f'{out.pipe_name}::write({out.name});\n' + out_beat = f"{out.name}_beat" + newline += ( + indent * 2 + f'typename nnet::ExtractPipeType<{out.pipe_name}>::value_type {out_beat};\n' + ) + newline += indent * 2 + f'{out_beat}.data = {out.name};\n' + newline += indent * 2 + f'{out.pipe_name}::write({out_beat});\n' + newline += indent * 2 + '// stops the kernel when the last input seen.\n' + newline += indent * 2 + f'keep_going = !{model_inputs[0].name}_beat.eop;\n' + newline += f"{indent}}}\n" # don't need to add anything in io_stream # Just copy line @@ -396,27 +413,39 @@ def write_test_bench(self, model): newline = line for bram in model_brams: newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + elif '// hls-fpga-machine-learning insert runtime contant' in line: + 
newline = line + insert_constant_lines = ( + f'{indent}const size_t kInputSz = {model_inputs[0].size_cpp()} * num_iterations;\n' + f'{indent}const size_t kOutputSz = {model_outputs[0].size_cpp()} * num_iterations;\n' + f'{indent}const size_t kInputLayerSize = {model_inputs[0].size_cpp()};\n' + f'{indent}const size_t kOutLayerSize = {model_outputs[0].size_cpp()};\n' + ) + newline += insert_constant_lines elif '// hls-fpga-machine-learning insert zero' in line: newline = line inp = model_inputs[0] - newline += indent + f'float vals[{inp.size_cpp()}]; \n' - newline += indent + f'for (int j = 0 ; j < {inp.size_cpp()} ; j++) {{\n' - newline += indent + ' vals[j] = 0.0; \n' - newline += indent + '}\n' - newline += indent + f'nnet::convert_data(q, vals);\n' + insert_zero_lines = ( + f'{indent}for (int j = 0 ; j < kInputSz; j++)\n' + f'{indent} vals[j] = 0.0;\n' + f'{indent}q.single_task(nnet::DMA_convert_data{{vals, num_iterations}});\n' + ) + newline += insert_zero_lines elif '// hls-fpga-machine-learning insert data' in line: newline = line inp = model_inputs[0] - newline += indent + f'float vals[{inp.size_cpp()}]; \n' - newline += indent + f'for (int j = 0 ; j < {inp.size_cpp()} ; j++) {{\n' - newline += indent + ' vals[j] = in[j]; \n' - newline += indent + '}\n' - newline += indent + f'nnet::convert_data(q, vals);\n' + insert_data_lines = ( + f'{indent}for (int i = 0; i < num_iterations; i++)\n' + f'{indent} for (int j = 0 ; j < kInputLayerSize; j++)\n' + f'{indent} vals[i * kInputLayerSize + j] = inputs[i][j]; \n' + f'{indent}q.single_task(nnet::DMA_convert_data{{vals, num_iterations}});\n' + ) + newline += insert_data_lines elif '// hls-fpga-machine-learning convert output' in line: newline = line out = model_outputs[0] - newline += indent + f'float outputs[{out.size_cpp()}];\n' - newline += indent + f'nnet::convert_data_back<{out.pipe_name}, float, {out.size_cpp()}>(q, outputs);\n' + newline += f'{indent}q.single_task(nnet::DMA_convert_data_back<{out.pipe_name}, float>' + newline += '{outputs, num_iterations}).wait();\n' else: newline = line @@ -528,6 +557,10 @@ def write_build_script(self, model): if 'set(FPGA_DEVICE' in line: line = f' set(FPGA_DEVICE "{device}")\n' + if model.config.get_config_value('UseOneAPIBSP'): + if 'hls-fpga-machine-learning insert oneapi_bsp_cmake_flag' in line: + line = 'set(BSP_FLAG "-DIS_BSP")' + fout.write(line) def write_nnet_utils(self, model):
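For a concrete picture of what the test-bench hooks above emit, the generated section for a hypothetical model with 16 inputs and 5 outputs would look roughly as follows; the pipe names and the template arguments on ``DMA_convert_data`` are assumptions, everything else follows the f-strings above.

.. code-block:: cpp

   // hls-fpga-machine-learning insert runtime contant
   const size_t kInputSz = 16 * num_iterations;
   const size_t kOutputSz = 5 * num_iterations;
   const size_t kInputLayerSize = 16;
   const size_t kOutLayerSize = 5;

   // hls-fpga-machine-learning insert data
   for (int i = 0; i < num_iterations; i++)
       for (int j = 0 ; j < kInputLayerSize; j++)
           vals[i * kInputLayerSize + j] = inputs[i][j];
   q.single_task(nnet::DMA_convert_data<layer0_input_pipe, float>{vals, num_iterations});

   // hls-fpga-machine-learning convert output
   q.single_task(nnet::DMA_convert_data_back<layer_out_pipe, float>{outputs, num_iterations}).wait();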