
Commit 84ad787

Merge pull request #1254 from haoyanwa/oneapi_backend/experiment
oneAPI BSP Support
2 parents (97c187d + 496846d), commit 84ad787

5 files changed (+35, -8 lines)

docs/api/configuration.rst (18 additions, 0 deletions)
@@ -101,6 +101,20 @@ Finally, one then uses the configuration to create an hls model:
         backend='Vitis'
     )
 
+To target an oneAPI Board Support Package (BSP) enabled FPGA for offload acceleration, you can specify the ``part`` argument to be the path to your BSP and the BSP variant. Then, set ``use_oneapi_bsp=True``.
+
+.. code-block:: python
+
+    hls_model = hls4ml.converters.convert_from_keras_model(
+        model,
+        hls_config=config,
+        output_dir="my_project_dir",
+        io_type="io_parallel",
+        backend="oneAPI",
+        part="/path/to/my/bsp:bsp_variant",
+        use_oneapi_bsp=True
+    )
+
 See :py:class:`~hls4ml.converters.convert_from_keras_model` for more information on the various options. Similar functions exist for ONNX and PyTorch.
 
 ----

@@ -132,6 +146,9 @@ It looks like this:
     ClockPeriod: 5
     IOType: io_parallel # options: io_parallel/io_stream
 
+    # oneAPI Offload Acceleration flag.
+    UseOneAPIBSP: True
+
     HLSConfig:
       Model:
         Precision: fixed<16,6>

@@ -156,6 +173,7 @@ The backend-specific section of the configuration depends on the backend. You ca
 For Vivado backend the options are:
 
 * **Part**\ : the particular FPGA part number that you are considering, here it's a Xilinx Virtex UltraScale+ VU13P FPGA
+* **UseOneAPIBSP**\ : path to the oneAPI Board Support Package (and the BSP variant) to enable offload acceleration with an Altera FPGA. This is only needed if you are using the oneAPI backend.
 * **ClockPeriod**\ : the clock period, in ns, at which your algorithm runs
 Then you have some optimization parameters for how your algorithm runs:
 * **IOType**\ : your options are ``io_parallel`` or ``io_stream`` which defines the type of data structure used for inputs, intermediate activations between layers, and outputs. For ``io_parallel``, arrays are used that, in principle, can be fully unrolled and are typically implemented in RAMs. For ``io_stream``, HLS streams are used, which are a more efficient/scalable mechanism to represent data that are produced and consumed in a sequential manner. Typically, HLS streams are implemented with FIFOs instead of RAMs. For more information see `here <https://docs.xilinx.com/r/en-US/ug1399-vitis-hls/pragma-HLS-stream>`__.

hls4ml/backends/oneapi/oneapi_backend.py (3 additions, 5 deletions)
@@ -129,7 +129,8 @@ def get_default_flow(self):
     def get_writer_flow(self):
         return self._writer_flow
 
-    def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_parallel', write_tar=False, **_):
+    def create_initial_config(
+        self, part='Arria10', clock_period=5, io_type='io_parallel', write_tar=False, use_oneapi_bsp=False, **_):
         """Create initial configuration of the oneAPI backend.
 
         Args:

@@ -153,10 +154,7 @@ def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_para
             # TODO: add namespace
             'WriteTar': write_tar,
         }
-
-        if 'use_bsp' in _:
-            config['IS_BSP'] = True
-
+        config['UseOneAPIBSP'] = use_oneapi_bsp
         return config
 
     def compile(self, model):
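
With this change, use_oneapi_bsp is an explicit keyword argument rather than a value fished out of **_, and the flag is always recorded in the returned config under the UseOneAPIBSP key. A minimal sketch of how a caller could exercise the new signature, assuming the oneAPI backend is obtained from hls4ml's backend registry via get_backend (the BSP path and variant below are placeholders):

    from hls4ml.backends import get_backend

    # Sketch only: build an initial oneAPI config with BSP offload enabled.
    backend = get_backend('oneAPI')
    config = backend.create_initial_config(
        part='/path/to/my/bsp:bsp_variant',  # placeholder BSP path and variant
        clock_period=5,
        io_type='io_parallel',
        use_oneapi_bsp=True,
    )
    assert config['UseOneAPIBSP'] is True  # the flag is now always present, True or False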

hls4ml/templates/oneapi/CMakeLists.txt (5 additions, 1 deletion)
@@ -39,15 +39,19 @@ set(LIBRARY_NAME myproject-${LIB_STAMP})
 # specific part number (E.g. "10AS066N3F40E2SG") to generate a standalone IP.
 if(NOT DEFINED FPGA_DEVICE)
     set(FPGA_DEVICE "Agilex7")
+    set(BSP_FLAG "")
 endif()
 
+# Set the target to a BSP if we target an actual accelerator board.
+# hls-fpga-machine-learning insert oneapi_bsp_cmake_flag
+
 # Use cmake -DUSER_FPGA_FLAGS=<flags> to set extra flags for FPGA backend
 # compilation.
 # -Xsoptimize=latency Turns off the hyper-optimized handshake
 set(USER_FPGA_FLAGS -Wno-unused-label;${USER_FPGA_FLAGS};-Xsoptimize=latency)
 
 # Use cmake -DUSER_FLAGS=<flags> to set extra flags for general compilation.
-set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS})
+set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS} ${BSP_FLAG})
 
 # Use cmake -DUSER_INCLUDE_PATHS=<paths> to set extra paths for general
 # compilation.

hls4ml/templates/oneapi/firmware/myproject.h (5 additions, 2 deletions)
@@ -11,7 +11,7 @@ using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext
 // Pipe properties for host pipes. Host pipes connect to the data source DMA and sink DMA.
 // They are connected to the first and the last layer to stream data into and out from the kernel.
 using HostPipePropertiesT = decltype(sycl::ext::oneapi::experimental::properties(
-    sycl::ext::intel::experimental::ready_latency<0>, sycl::ext::intel::experimental::bits_per_symbol<8>,
+    sycl::ext::intel::experimental::ready_latency<0>, sycl::ext::intel::experimental::bits_per_symbol<16>,
     sycl::ext::intel::experimental::uses_valid<true>, sycl::ext::intel::experimental::first_symbol_in_high_order_bits<true>,
     sycl::ext::intel::experimental::protocol_avalon_streaming_uses_ready));
 

@@ -127,12 +127,15 @@ template <class src_pipe, class dst_T> struct DMA_convert_data_back {
 class MyProjectID;
 
 struct MyProject {
-
+#ifndef IS_BSP
     // kernel property method to config invocation interface
     auto get(sycl::ext::oneapi::experimental::properties_tag) {
         return sycl::ext::oneapi::experimental::properties{sycl::ext::intel::experimental::streaming_interface<>,
                                                            sycl::ext::intel::experimental::pipelined<>};
     }
+#else
+    // kernel properties and pipelining is not supported in BSP.
+#endif
 
     SYCL_EXTERNAL void operator()() const;
 };

hls4ml/writer/oneapi_writer.py (4 additions, 0 deletions)
@@ -557,6 +557,10 @@ def write_build_script(self, model):
             if 'set(FPGA_DEVICE' in line:
                 line = f'    set(FPGA_DEVICE "{device}")\n'
 
+            if model.config.get_config_value('UseOneAPIBSP'):
+                if 'hls-fpga-machine-learning insert oneapi_bsp_cmake_flag' in line:
+                    line = f'set(BSP_FLAG "-DIS_BSP")'
+
             fout.write(line)
 
     def write_nnet_utils(self, model):
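
Taken together with the CMakeLists.txt and myproject.h changes above, this wires the Python-side flag through to the C++ preprocessor: when UseOneAPIBSP is set, the writer replaces the hls-fpga-machine-learning insert oneapi_bsp_cmake_flag marker with set(BSP_FLAG "-DIS_BSP"), the flag reaches the compiler through ${BSP_FLAG} in USER_FLAGS, and the resulting IS_BSP define drops the streaming/pipelined kernel properties from MyProject. A minimal sketch of just that substitution step, with illustrative variable names in place of the writer's internals:

    # Stand-in for model.config.get_config_value('UseOneAPIBSP')
    use_oneapi_bsp = True

    # A few lines from the CMakeLists.txt template shown above
    template_lines = [
        'set(FPGA_DEVICE "Agilex7")\n',
        '# hls-fpga-machine-learning insert oneapi_bsp_cmake_flag\n',
        'set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS} ${BSP_FLAG})\n',
    ]

    output_lines = []
    for line in template_lines:
        if use_oneapi_bsp and 'hls-fpga-machine-learning insert oneapi_bsp_cmake_flag' in line:
            # Defining IS_BSP makes myproject.h skip the #ifndef IS_BSP block.
            line = 'set(BSP_FLAG "-DIS_BSP")\n'
        output_lines.append(line)

    print(''.join(output_lines), end='')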
