Skip to content

Commit

Permalink
POC/use_mmap_at_import_model v1
Browse files Browse the repository at this point in the history
  • Loading branch information
MirceaDan99 committed Aug 19, 2024
1 parent 3692cf8 commit b2b8dc9
Show file tree
Hide file tree
Showing 10 changed files with 145 additions and 35 deletions.
42 changes: 34 additions & 8 deletions src/plugins/intel_npu/src/al/include/intel_npu/al/icompiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
#include "openvino/core/type/element_type.hpp"
#include "openvino/runtime/common.hpp"
#include "openvino/runtime/profiling_info.hpp"
#include "openvino/util/mmap_object.hpp"

#include <optional>

namespace intel_npu {

Expand Down Expand Up @@ -131,22 +134,41 @@ struct NetworkMetadata final {
* to provide such information about a network as description of inputs and outputs,
* name and compiled network in a format executable by device
*/
struct NetworkDescription final {
NetworkDescription(std::vector<uint8_t>&& compiledNetwork, NetworkMetadata&& metadata)
: compiledNetwork(std::move(compiledNetwork)),
metadata(std::move(metadata)) {}

// NOTE(review): kept as macros (not inline functions) because NetworkDescriptionT is
// declared below their first textual position; consider converting to inline helpers
// once declaration order is rearranged. Expansions are parenthesized so they compose
// safely inside larger expressions.
// True when `network` is backed by an in-memory std::vector<uint8_t> blob.
#define NetworkDescriptionCastCheck(network) (dynamic_cast<const NetworkDescriptionT<std::vector<uint8_t>>*>(network.get()) != nullptr)
// Downcast to the vector-backed description; yields nullptr on type mismatch.
#define NetworkDescriptionPtrCast1(network) (std::dynamic_pointer_cast<const NetworkDescriptionT<std::vector<uint8_t>>>(network))
// Downcast to the mmap-backed description; yields nullptr on type mismatch.
#define NetworkDescriptionPtrCast2(network) (std::dynamic_pointer_cast<const NetworkDescriptionT<std::shared_ptr<ov::MappedMemory>>>(network))

/**
 * @brief Polymorphic base for a compiled-network description.
 * @details Holds only the network metadata; the compiled blob storage lives in the
 * NetworkDescriptionT<T> subclass so it can be either an owned byte vector or a
 * memory-mapped file. Copy is deleted to prevent accidental blob duplication;
 * the destructor is virtual because instances are held through base pointers.
 */
struct NetworkDescription {
    // Force move semantics to prevent blob copies
    explicit NetworkDescription(NetworkMetadata&& metadata) : metadata(std::move(metadata)) {}
    NetworkDescription(const NetworkDescription&) = delete;
    NetworkDescription(NetworkDescription&&) = default;
    NetworkDescription& operator=(const NetworkDescription&) = delete;
    NetworkDescription& operator=(NetworkDescription&&) = default;

    // Virtual: concrete objects are NetworkDescriptionT<T> accessed via base pointers.
    virtual ~NetworkDescription() = default;

    NetworkMetadata metadata;
};

/**
 * @brief Storage-specific compiled-network description.
 * @tparam T Blob storage type: std::vector<uint8_t> (owned copy) or
 *           std::shared_ptr<ov::MappedMemory> (memory-mapped file).
 */
template <typename T>
struct NetworkDescriptionT : public NetworkDescription {
    // Move both arguments into place; the previous assignment-in-body form copied
    // the blob despite the rvalue-reference parameter.
    NetworkDescriptionT(T&& compiledNetwork, NetworkMetadata&& metadata)
        : NetworkDescription(std::move(metadata)),
          compiledNetwork(std::move(compiledNetwork)) {}

    NetworkDescriptionT(const NetworkDescriptionT<T>&) = delete;
    NetworkDescriptionT(NetworkDescriptionT<T>&&) = default;
    NetworkDescriptionT& operator=(const NetworkDescriptionT<T>&) = delete;
    NetworkDescriptionT& operator=(NetworkDescriptionT<T>&&) = default;
    ~NetworkDescriptionT() override = default;

    T compiledNetwork;
};

/**
* @interface ICompiler
* @brief An interface to be implemented by a concrete compiler to provide
Expand Down Expand Up @@ -190,12 +212,16 @@ class ICompiler : public std::enable_shared_from_this<ICompiler> {
* to be used for creating network description
* @return a shared pointer on an object implementing NetworkDescription interface
*/
virtual NetworkMetadata parse(const std::vector<uint8_t>& network, const Config& config) const = 0;
virtual NetworkMetadata parse(const std::shared_ptr<ov::MappedMemory>& mmapNetwork, const Config& config) const = 0;

virtual std::vector<ov::ProfilingInfo> process_profiling_output(const std::vector<uint8_t>& profData,
const std::vector<uint8_t>& network,
const Config& config) const = 0;

virtual std::vector<ov::ProfilingInfo> process_profiling_output(const std::vector<uint8_t>& profData,
const std::shared_ptr<ov::MappedMemory>& network,
const Config& config) const = 0;

protected:
virtual ~ICompiler() = default;
};
Expand Down
15 changes: 13 additions & 2 deletions src/plugins/intel_npu/src/backend/src/zero_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,20 @@ ZeroExecutor::ZeroExecutor(const std::shared_ptr<const ZeroInitStructsHolder>& i
ze_graph_desc_t desc{ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES,
nullptr,
ZE_GRAPH_FORMAT_NATIVE,
_networkDesc->compiledNetwork.size(),
_networkDesc->compiledNetwork.data(),
0,
nullptr,
nullptr};

if (NetworkDescriptionCastCheck(_networkDesc)) {
auto _networkDescCast = NetworkDescriptionPtrCast1(_networkDesc);
desc.inputSize = _networkDescCast->compiledNetwork.size();
desc.pInput = _networkDescCast->compiledNetwork.data();
} else {
auto _networkDescCast = NetworkDescriptionPtrCast2(_networkDesc);
desc.inputSize = _networkDescCast->compiledNetwork->size();
desc.pInput = reinterpret_cast<uint8_t*>(_networkDescCast->compiledNetwork->data());
}

zeroUtils::throwOnFail(
"pfnCreate",
_graph_ddi_table_ext->pfnCreate(_initStructs->getContext(), _initStructs->getDevice(), &desc, &_graph));
Expand Down
7 changes: 5 additions & 2 deletions src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -595,10 +595,13 @@ std::vector<ov::ProfilingInfo> ZeroInferRequest::get_profiling_info() const {
// processing to the compiler
const auto& networkDesc = compiledModel.get_network_description();
const auto& compiler = compiledModel.get_compiler();
const auto& blob = networkDesc->compiledNetwork;
auto profData = get_raw_profiling_data();
_logger.debug("InferRequest::get_profiling_info complete with compiler->process_profiling_output().");
return compiler->process_profiling_output(profData, blob, compilerConfig);
if (NetworkDescriptionCastCheck(networkDesc) == false) {
auto networkDescCast = NetworkDescriptionPtrCast2(networkDesc);
return compiler->process_profiling_output(profData, networkDescCast->compiledNetwork, compilerConfig);
}
return compiler->process_profiling_output(profData, NetworkDescriptionPtrCast1(networkDesc)->compiledNetwork, compilerConfig);
} else {
auto proftype = _config.get<PROFILING_TYPE>();
if (proftype == ov::intel_npu::ProfilingType::INFER) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,16 @@ class LevelZeroCompilerAdapter final : public ICompiler {

ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const override final;

NetworkMetadata parse(const std::vector<uint8_t>& network, const Config& config) const override final;
NetworkMetadata parse(const std::shared_ptr<ov::MappedMemory>& mmapNetwork, const Config& config) const override final;

std::vector<ov::ProfilingInfo> process_profiling_output(const std::vector<uint8_t>& profData,
const std::vector<uint8_t>& network,
const Config& config) const override final;

std::vector<ov::ProfilingInfo> process_profiling_output(const std::vector<uint8_t>& profData,
const std::shared_ptr<ov::MappedMemory>& mmapNetwork,
const Config& config) const override final;

private:
/**
* @brief Separate externals calls to separate class
Expand Down
44 changes: 44 additions & 0 deletions src/plugins/intel_npu/src/compiler/include/iexternal_compiler.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#include "intel_npu/al/icompiler.hpp"

namespace intel_npu {
namespace driverCompilerAdapter {

// Serialized OpenVINO IR handed to the external compiler: the model graph as XML
// plus its binary weights, each carried in a stream.
struct IR {
std::stringstream xml;
std::stringstream weights;
};

/**
* @brief Interface for external compiler
* @details Isolate external API calls from general logic
*/
class IExternalCompiler {
public:
virtual ~IExternalCompiler() = default;

/**
* @brief Get opset supported by compiler
*/
virtual uint32_t getSupportedOpset() const = 0;

/**
* @brief Get query result for current network
* @return Names of the operations supported by the compiler for this IR.
*/
virtual std::unordered_set<std::string> getQueryResult(IR& irModel, const Config& config) const = 0;

/**
* @brief Sends the serialized model and its I/O metadata to the driver for compilation.
* @return The compiled model descriptor corresponding to the previously given network.
* NOTE(review): NetworkDescription is a polymorphic, move-only base whose blob
* storage lives in NetworkDescriptionT<T>; returning it by value slices off the
* compiled blob — confirm whether this should return a (shared) pointer instead.
*/
virtual NetworkDescription compileIR(const std::shared_ptr<const ov::Model>& model,
IR& irModel,
const Config& config) const = 0;
/**
* @brief Extracts I/O metadata from a previously compiled, memory-mapped blob.
* @param mmapBlob Memory-mapped compiled network (not parsed as IR).
*/
virtual NetworkMetadata parseBlob(const std::shared_ptr<ov::MappedMemory>& mmapBlob, const Config& config) const = 0;
};
} // namespace driverCompilerAdapter
} // namespace intel_npu
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,20 @@ class LevelZeroCompilerInDriver final : public ICompiler {
ze_device_graph_properties_t deviceGraphProperties,
ze_graph_handle_t& graphHandle) const;

NetworkMetadata parse(const std::vector<uint8_t>& network, const Config& config) const override final;
NetworkMetadata parse(const std::shared_ptr<ov::MappedMemory>& mmapNetwork, const Config& config) const override final;

// Profiling post-processing for vector-backed blobs is not handled on the
// in-driver compiler path — callers are expected to use a different decoding
// route (TODO confirm which component decodes profiling here).
std::vector<ov::ProfilingInfo> process_profiling_output(const std::vector<uint8_t>& profData,
const std::vector<uint8_t>& network,
const Config& config) const override final {
OPENVINO_THROW("Profiling post-processing is not implemented.");
}

// Mmap-backed counterpart of the overload above; equally unimplemented on the
// in-driver compiler path.
std::vector<ov::ProfilingInfo> process_profiling_output(const std::vector<uint8_t>& profData,
const std::shared_ptr<ov::MappedMemory>& mmapNetwork,
const Config& config) const override final {
OPENVINO_THROW("Profiling post-processing is not implemented.");
}

template <typename T = TableExtension, std::enable_if_t<!NotSupportQuery(T), bool> = true>
std::unordered_set<std::string> getQueryResultFromSupportedLayers(
ze_result_t result,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -195,9 +195,9 @@ ov::SupportedOpsMap LevelZeroCompilerAdapter::query(const std::shared_ptr<const
return apiAdapter->query(model, config);
}

/**
 * @brief Extracts network metadata from a memory-mapped compiled blob.
 * @details Thin forwarder: delegates the actual parsing to the underlying
 * driver-side adapter. (The span previously contained both the old
 * vector-based and the new mmap-based signature lines back-to-back — a diff
 * artifact that would not compile; only the mmap signature is kept.)
 */
NetworkMetadata LevelZeroCompilerAdapter::parse(const std::shared_ptr<ov::MappedMemory>& mmapNetwork,
                                                const Config& config) const {
    _logger.debug("parse start");
    return apiAdapter->parse(mmapNetwork, config);
}

std::vector<ov::ProfilingInfo> LevelZeroCompilerAdapter::process_profiling_output(const std::vector<uint8_t>&,
Expand All @@ -206,5 +206,11 @@ std::vector<ov::ProfilingInfo> LevelZeroCompilerAdapter::process_profiling_outpu
OPENVINO_THROW("Profiling post-processing is not implemented.");
}

// Mmap-blob overload: profiling post-processing is not supported by this
// adapter; any caller reaching this path is an error.
std::vector<ov::ProfilingInfo> LevelZeroCompilerAdapter::process_profiling_output(const std::vector<uint8_t>&,
const std::shared_ptr<ov::MappedMemory>&,
const Config&) const {
OPENVINO_THROW("Profiling post-processing is not implemented.");
}

} // namespace driverCompilerAdapter
} // namespace intel_npu
Original file line number Diff line number Diff line change
Expand Up @@ -870,23 +870,23 @@ NetworkDescription LevelZeroCompilerInDriver<TableExtension>::compile(const std:
}

_logger.debug("compile end");
return NetworkDescription(std::move(blob), std::move(networkMeta));
return NetworkDescriptionT<std::vector<uint8_t>>(std::move(blob), std::move(networkMeta));
}

template <typename TableExtension>
NetworkMetadata LevelZeroCompilerInDriver<TableExtension>::parse(const std::vector<uint8_t>& network,
NetworkMetadata LevelZeroCompilerInDriver<TableExtension>::parse(const std::shared_ptr<ov::MappedMemory>& mmapNetwork,
const Config& config) const {
OV_ITT_TASK_CHAIN(PARSE_BLOB, itt::domains::NPUPlugin, "LevelZeroCompilerInDriver::parse", "desc");
ze_graph_handle_t graphHandle;

if (!network.empty()) {
if (mmapNetwork->size() > 0) {
_logger.debug("Import network case");
ze_graph_format_t format = ZE_GRAPH_FORMAT_NATIVE;
ze_graph_desc_t desc{ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES,
nullptr,
format,
network.size(),
network.data(),
mmapNetwork->size(),
reinterpret_cast<uint8_t*>(mmapNetwork->data()),
nullptr};

auto result = _graphDdiTableExt->pfnCreate(_context, _deviceHandle, &desc, &graphHandle);
Expand Down
22 changes: 16 additions & 6 deletions src/plugins/intel_npu/src/plugin/src/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,17 +128,27 @@ std::shared_ptr<ov::ISyncInferRequest> CompiledModel::create_sync_infer_request(
}

/**
 * @brief Serializes the compiled blob into the given stream.
 * @details Resolves the blob's storage first: vector-backed (compile path) or
 * mmap-backed (import path). The size/hash log is only produced for the
 * vector-backed case, mirroring the original behavior.
 * (Stale diff lines after the write block referenced an undefined `blob` and
 * would not compile; they are removed. `data`/`size` are now initialized so a
 * failed cast cannot lead to a write from indeterminate values.)
 */
void CompiledModel::export_model(std::ostream& stream) const {
    const uint8_t* data = nullptr;
    size_t size = 0;
    if (NetworkDescriptionCastCheck(_networkPtr)) {
        auto networkPtrCast = NetworkDescriptionPtrCast1(_networkPtr);
        const auto& blob = networkPtrCast->compiledNetwork;
        data = blob.data();
        size = blob.size();
        std::stringstream str;
        str << "Blob size: " << blob.size() << ", hash: " << std::hex << hash(blob);
        _logger.info(str.str().c_str());
    } else {
        auto networkPtrCast = NetworkDescriptionPtrCast2(_networkPtr);
        data = reinterpret_cast<const uint8_t*>(networkPtrCast->compiledNetwork->data());
        size = networkPtrCast->compiledNetwork->size();
    }
    stream.write(reinterpret_cast<const char*>(data), size);
    if (!stream) {
        _logger.error("Write blob to stream failed. Blob is broken!");
    } else {
        _logger.info("Write blob to stream successfully.");
    }
}

std::shared_ptr<const ov::Model> CompiledModel::get_runtime_model() const {
Expand Down
16 changes: 8 additions & 8 deletions src/plugins/intel_npu/src/plugin/src/plugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "openvino/op/parameter.hpp"
#include "openvino/runtime/intel_npu/properties.hpp"
#include "openvino/runtime/properties.hpp"
#include "openvino/util/mmap_object.hpp"
#include "remote_context.hpp"

using namespace intel_npu;
Expand Down Expand Up @@ -742,21 +743,20 @@ std::shared_ptr<ov::ICompiledModel> Plugin::import_model(std::istream& stream, c

auto graphSize = getFileSize(stream);

std::vector<uint8_t> blob(graphSize);
stream.read(reinterpret_cast<char*>(blob.data()), graphSize);
if (!stream) {
OPENVINO_THROW("Failed to read data from stream!");
}
_logger.debug("Successfully read %zu bytes into blob.", graphSize);
std::ofstream tmpBlob("tmpBlob.blob", std::ios::binary);
tmpBlob << stream.rdbuf();
tmpBlob.close();

auto mMapBlob = ov::load_mmap_object("tmpBlob.blob");

auto meta = compiler->parse(blob, localConfig);
auto meta = compiler->parse(mMapBlob, localConfig);
meta.name = "net" + std::to_string(_compiledModelLoadCounter++);

const std::shared_ptr<ov::Model> modelDummy = create_dummy_model(meta.inputs, meta.outputs);

bool profiling = localConfig.get<PERF_COUNT>();

auto networkDescription = std::make_shared<const NetworkDescription>(std::move(blob), std::move(meta));
auto networkDescription = std::make_shared<const NetworkDescriptionT<std::shared_ptr<ov::MappedMemory>>>(std::move(mMapBlob), std::move(meta));

compiledModel = std::make_shared<CompiledModel>(modelDummy,
shared_from_this(),
Expand Down

0 comments on commit b2b8dc9

Please sign in to comment.