diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp index 1fc3a3e20965c6..d6a01da8855a73 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp @@ -288,4 +288,21 @@ struct RUN_INFERENCES_SEQUENTIALLY final : OptionBase> { + static std::string_view key() { + return ov::internal::cached_model_buffer.name(); + } + + static bool defaultValue() { + return nullptr; + } + + static OptionMode mode() { + return OptionMode::RunTime; + } +}; + } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp index 7e718d9172f4f7..6d35e250166503 100644 --- a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp +++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp @@ -13,15 +13,77 @@ #include "intel_npu/utils/zero/zero_utils.hpp" #include "intel_npu/utils/zero/zero_wrappers.hpp" #include "openvino/runtime/profiling_info.hpp" +#include "openvino/runtime/shared_buffer.hpp" namespace intel_npu { +class BlobContainer { +public: + virtual void* get_ptr() { + OPENVINO_THROW("const BlobContainer::get_ptr() method is not implemented!"); + } + + virtual size_t size() const { + OPENVINO_THROW("BlobContainer::size() method is not implemented!"); + } + + virtual bool release_from_memory() { + OPENVINO_THROW("BlobContainer::release_from_memory() method is not implemented!"); + } + + virtual ~BlobContainer() = default; +}; + +class BlobContainerVector : public BlobContainer { +public: + BlobContainerVector(std::vector blob) : _ownershipBlob(std::move(blob)) {} + + void* get_ptr() override { + return reinterpret_cast(_ownershipBlob.data()); + } + + size_t size() const override { + return _ownershipBlob.size(); + } + + bool release_from_memory() override { + _ownershipBlob.clear(); + _ownershipBlob.shrink_to_fit(); + return true; + } + +private: + std::vector _ownershipBlob; +}; + +class BlobContainerAlignedBuffer : public BlobContainer { +public: + BlobContainerAlignedBuffer(const std::shared_ptr& blobSO, + size_t offset) : _ownershipBlob(blobSO), _offset(offset) {} + + void* get_ptr() override { + return _ownershipBlob->get_ptr(_offset); + } + + size_t size() const override { + return _ownershipBlob->size(); + } + + bool release_from_memory() override { + return false; + } + +private: + std::shared_ptr _ownershipBlob; + size_t _offset; +}; + class IGraph : public std::enable_shared_from_this { public: IGraph(ze_graph_handle_t handle, NetworkMetadata metadata, const Config& config, - std::optional> blob); + std::optional> blobPtr); virtual void export_blob(std::ostream& stream) const = 0; @@ -89,7 +151,7 @@ class IGraph : public std::enable_shared_from_this { // first inference starts running std::mutex _mutex; - std::vector _blob; + std::unique_ptr _blob; uint32_t _unique_id = 0; uint32_t _last_submitted_id; diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp index b34f2deee6c61e..b3846906644e58 100644 --- a/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp +++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/npu.hpp @@ -58,7 +58,7 @@ class ICompilerAdapter { public: virtual std::shared_ptr compile(const std::shared_ptr& model, const Config& config) const = 0; - virtual std::shared_ptr parse(std::vector network, const Config& config) const = 0; + virtual std::shared_ptr parse(std::unique_ptr blobPtr, const Config& config) const = 0; virtual ov::SupportedOpsMap query(const std::shared_ptr& model, const Config& config) const = 0; virtual ~ICompilerAdapter() = default; diff --git a/src/plugins/intel_npu/src/common/src/igraph.cpp b/src/plugins/intel_npu/src/common/src/igraph.cpp index fd5463af5eea3e..8547e47b69116d 100644 --- a/src/plugins/intel_npu/src/common/src/igraph.cpp +++ b/src/plugins/intel_npu/src/common/src/igraph.cpp @@ -17,7 +17,7 @@ namespace intel_npu { IGraph::IGraph(ze_graph_handle_t handle, NetworkMetadata metadata, const Config& config, - std::optional> blob) + std::optional> blobPtr) : _handle(handle), _metadata(std::move(metadata)), _logger("IGraph", config.get()) { diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp index 82ababf21c147a..38e6aab1a21765 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp @@ -26,7 +26,7 @@ class DriverCompilerAdapter final : public ICompilerAdapter { std::shared_ptr compile(const std::shared_ptr& model, const Config& config) const override; - std::shared_ptr parse(std::vector network, const Config& config) const override; + std::shared_ptr parse(std::unique_ptr blobPtr, const Config& config) const override; ov::SupportedOpsMap query(const std::shared_ptr& model, const Config& config) const override; diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp index 0f426581687f65..ef70ae01cc62af 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp @@ -21,7 +21,7 @@ class DriverGraph final : public IGraph { ze_graph_handle_t graphHandle, NetworkMetadata metadata, const Config& config, - std::optional> blob); + std::optional> blob); void export_blob(std::ostream& stream) const override; diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp index 8d2616884e7d5f..89e0b81426ef40 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp @@ -21,7 +21,7 @@ class PluginCompilerAdapter final : public ICompilerAdapter { std::shared_ptr compile(const std::shared_ptr& model, const Config& config) const override; - std::shared_ptr parse(std::vector network, const Config& config) const override; + std::shared_ptr parse(std::unique_ptr blobPtr, const Config& config) const override; ov::SupportedOpsMap query(const std::shared_ptr& model, const Config& config) const override; diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp index 2d7d9bfd429e47..5d0ab241bcd9c8 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp @@ -23,7 +23,7 @@ class PluginGraph final : public IGraph { const std::shared_ptr& zeroInitStruct, ze_graph_handle_t graphHandle, NetworkMetadata metadata, - std::vector blob, + std::unique_ptr blobPtr, const Config& config); void export_blob(std::ostream& stream) const override; diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp index 3e8c17ad13db7e..d6ba89472663a2 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp @@ -35,7 +35,7 @@ class ZeGraphExtWrappers { const std::string& buildFlags, const uint32_t& flags) const; - ze_graph_handle_t getGraphHandle(const std::vector& network) const; + ze_graph_handle_t getGraphHandle(const uint8_t* data, size_t size) const; NetworkMetadata getNetworkMeta(ze_graph_handle_t graphHandle) const; diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp index 9d634656db109a..37b1774bf17591 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp @@ -206,11 +206,13 @@ std::shared_ptr DriverCompilerAdapter::compile(const std::shared_ptr DriverCompilerAdapter::parse(std::vector network, const Config& config) const { +std::shared_ptr DriverCompilerAdapter::parse(std::unique_ptr blobPtr, + const Config& config) const { OV_ITT_TASK_CHAIN(PARSE_BLOB, itt::domains::NPUPlugin, "DriverCompilerAdapter", "parse"); _logger.debug("parse start"); - ze_graph_handle_t graphHandle = _zeGraphExt->getGraphHandle(network); + ze_graph_handle_t graphHandle = + _zeGraphExt->getGraphHandle(reinterpret_cast(blobPtr->get_ptr()), blobPtr->size()); _logger.debug("parse end"); OV_ITT_TASK_NEXT(PARSE_BLOB, "getNetworkMeta"); @@ -221,7 +223,7 @@ std::shared_ptr DriverCompilerAdapter::parse(std::vector networ graphHandle, std::move(networkMeta), config, - std::optional>(std::move(network))); + std::optional>(std::move(blobPtr))); } ov::SupportedOpsMap DriverCompilerAdapter::query(const std::shared_ptr& model, diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp index 0d180f983ad3a9..0019eb1bdf17d4 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp @@ -15,8 +15,8 @@ DriverGraph::DriverGraph(const std::shared_ptr& zeGraphExt, ze_graph_handle_t graphHandle, NetworkMetadata metadata, const Config& config, - std::optional> blob) - : IGraph(graphHandle, std::move(metadata), config, std::move(blob)), + std::optional> blobPtr) + : IGraph(graphHandle, std::move(metadata), config, std::move(blobPtr)), _zeGraphExt(zeGraphExt), _zeroInitStruct(zeroInitStruct), _logger("DriverGraph", config.get()) { @@ -139,7 +139,7 @@ void DriverGraph::initialize(const Config& config) { } bool DriverGraph::release_blob(const Config& config) { - if (_blob.empty() || _zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8 || + if (_blob == nullptr || _zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8 || config.get()) { return false; } @@ -152,8 +152,9 @@ bool DriverGraph::release_blob(const Config& config) { return false; } - _blob.clear(); - _blob.shrink_to_fit(); + if (!_blob->release_from_memory()) { + return false; + } _logger.debug("Blob is released"); diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp index 06d71fd1126c17..220af24b83c6c1 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp @@ -83,6 +83,7 @@ std::shared_ptr PluginCompilerAdapter::compile(const std::shared_ptrcompile(model, config); + auto blobPtr = std::make_unique(std::move(networkDesc.compiledNetwork)); _logger.debug("compile end"); ze_graph_handle_t graphHandle = nullptr; @@ -90,33 +91,40 @@ std::shared_ptr PluginCompilerAdapter::compile(const std::shared_ptrgetGraphHandle(networkDesc.compiledNetwork); + graphHandle = + _zeGraphExt->getGraphHandle(reinterpret_cast(blobPtr->get_ptr()), blobPtr->size()); } catch (...) { _logger.info("Failed to obtain the level zero graph handle. Inference requests for this model are not " "allowed. Only exports are available"); } } - return std::make_shared(_zeGraphExt, _compiler, _zeroInitStruct, graphHandle, std::move(networkDesc.metadata), - std::move(networkDesc.compiledNetwork), + std::move(blobPtr), config); } -std::shared_ptr PluginCompilerAdapter::parse(std::vector network, const Config& config) const { +std::shared_ptr PluginCompilerAdapter::parse(std::unique_ptr blobPtr, + const Config& config) const { OV_ITT_TASK_CHAIN(PARSE_BLOB, itt::domains::NPUPlugin, "PluginCompilerAdapter", "parse"); _logger.debug("parse start"); + std::vector network(blobPtr->size()); + network.assign(reinterpret_cast(blobPtr->get_ptr()), + reinterpret_cast(blobPtr->get_ptr()) + blobPtr->size()); auto networkMeta = _compiler->parse(network, config); + network.clear(); + network.shrink_to_fit(); _logger.debug("parse end"); ze_graph_handle_t graphHandle = nullptr; if (_zeGraphExt) { - graphHandle = _zeGraphExt->getGraphHandle(network); + graphHandle = + _zeGraphExt->getGraphHandle(reinterpret_cast(blobPtr->get_ptr()), blobPtr->size()); } return std::make_shared(_zeGraphExt, @@ -124,7 +132,7 @@ std::shared_ptr PluginCompilerAdapter::parse(std::vector networ _zeroInitStruct, graphHandle, std::move(networkMeta), - std::move(network), + std::move(blobPtr), config); } diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp index b1658e7e0582e0..ce02e0caad8edd 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp @@ -15,9 +15,12 @@ PluginGraph::PluginGraph(const std::shared_ptr& zeGraphExt, const std::shared_ptr& zeroInitStruct, ze_graph_handle_t graphHandle, NetworkMetadata metadata, - std::vector blob, + std::unique_ptr blobPtr, const Config& config) - : IGraph(graphHandle, std::move(metadata), config, std::optional>(std::move(blob))), + : IGraph(graphHandle, + std::move(metadata), + config, + std::optional>(std::move(blobPtr))), _zeGraphExt(zeGraphExt), _zeroInitStruct(zeroInitStruct), _compiler(compiler), @@ -31,7 +34,7 @@ PluginGraph::PluginGraph(const std::shared_ptr& zeGraphExt, } void PluginGraph::export_blob(std::ostream& stream) const { - stream.write(reinterpret_cast(_blob.data()), _blob.size()); + stream.write(reinterpret_cast(_blob->get_ptr()), _blob->size()); if (!stream) { _logger.error("Write blob to stream failed. Blob is broken!"); @@ -40,12 +43,14 @@ void PluginGraph::export_blob(std::ostream& stream) const { if (_logger.level() >= ov::log::Level::INFO) { std::uint32_t result = 1171117u; - for (const uint8_t* it = _blob.data(); it != _blob.data() + _blob.size(); ++it) { + for (const uint8_t* it = reinterpret_cast(_blob->get_ptr()); + it != reinterpret_cast(_blob->get_ptr()) + _blob->size(); + ++it) { result = ((result << 7) + result) + static_cast(*it); } std::stringstream str; - str << "Blob size: " << _blob.size() << ", hash: " << std::hex << result; + str << "Blob size: " << _blob->size() << ", hash: " << std::hex << result; _logger.info(str.str().c_str()); } _logger.info("Write blob to stream successfully."); @@ -53,7 +58,10 @@ void PluginGraph::export_blob(std::ostream& stream) const { std::vector PluginGraph::process_profiling_output(const std::vector& profData, const Config& config) const { - return _compiler->process_profiling_output(profData, _blob, config); + std::vector blob(_blob->size()); + blob.assign(reinterpret_cast(_blob->get_ptr()), + reinterpret_cast(_blob->get_ptr()) + _blob->size()); + return _compiler->process_profiling_output(profData, blob, config); } void PluginGraph::set_argument_value(uint32_t argi, const void* argv) const { diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp index f6366a2509747b..5efb973351c55f 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp @@ -363,19 +363,15 @@ ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle(std::pair& network) const { +ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle(const uint8_t* blobData, size_t blobSize) const { ze_graph_handle_t graphHandle; - if (network.empty()) { + if (blobData == nullptr || blobSize == 0) { OPENVINO_THROW("Empty blob"); } - ze_graph_desc_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, - nullptr, - ZE_GRAPH_FORMAT_NATIVE, - network.size(), - network.data(), - nullptr}; + ze_graph_desc_t desc = + {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, nullptr, ZE_GRAPH_FORMAT_NATIVE, blobSize, blobData, nullptr}; _logger.debug("getGraphHandle - perform pfnCreate"); auto result = _zeroInitStruct->getGraphDdiTable().pfnCreate(_zeroInitStruct->getContext(), diff --git a/src/plugins/intel_npu/src/plugin/include/metrics.hpp b/src/plugins/intel_npu/src/plugin/include/metrics.hpp index 7bce9eb0881a51..e940439d2b3611 100644 --- a/src/plugins/intel_npu/src/plugin/include/metrics.hpp +++ b/src/plugins/intel_npu/src/plugin/include/metrics.hpp @@ -67,7 +67,8 @@ class Metrics final { ov::intel_npu::batch_mode.name(), ov::hint::execution_mode.name()}; - const std::vector _internalSupportedProperties = {ov::internal::caching_properties.name()}; + const std::vector _internalSupportedProperties = {ov::internal::caching_properties.name(), + ov::internal::caching_with_mmap.name()}; // Metric to provide a hint for a range for number of async infer requests. (bottom bound, upper bound, step) const std::tuple _rangeForAsyncInferRequests{1u, 10u, 1u}; diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp index da425d5d01a5c3..f1291841663609 100644 --- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp +++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp @@ -767,21 +767,31 @@ std::shared_ptr Plugin::import_model(std::istream& stream, c "The usage of a compiled model can lead to undefined behavior. Please use OpenVINO IR instead!"); } + auto model_buffer = localConfig.get(); + OV_ITT_TASK_NEXT(PLUGIN_IMPORT_MODEL, "parse"); std::shared_ptr compiledModel; try { - auto compiler = getCompiler(localConfig); + std::unique_ptr blobPtr; + + if (model_buffer == nullptr) { + auto compiler = getCompiler(localConfig); - auto graphSize = getFileSize(stream); + auto graphSize = getFileSize(stream); - std::vector blob(graphSize); - stream.read(reinterpret_cast(blob.data()), graphSize); - if (!stream) { - OPENVINO_THROW("Failed to read data from stream!"); + std::vector blob(graphSize); + stream.read(reinterpret_cast(blob.data()), graphSize); + if (!stream) { + OPENVINO_THROW("Failed to read data from stream!"); + } + _logger.debug("Successfully read %zu bytes into blob.", graphSize); + + blobPtr = std::move(std::make_unique(std::move(blob))); + } else { + blobPtr = std::move(std::make_unique(model_buffer, stream.tellg())); } - _logger.debug("Successfully read %zu bytes into blob.", graphSize); auto graph = compiler->parse(std::move(blob), localConfig); graph->update_network_name("net" + std::to_string(_compiledModelLoadCounter++));