Commit b3b819b

DEMO changes

yuslepukhin committed Nov 21, 2024
1 parent 42e7bf6 commit b3b819b
Showing 4 changed files with 41 additions and 28 deletions.
8 changes: 8 additions & 0 deletions include/onnxruntime/core/framework/resource_accountant.h
@@ -37,11 +37,19 @@ class IResourceAccountant {
   virtual void AddConsumedAmount(const ResourceCount& amount) = 0;
   virtual void RemoveConsumedAmount(const ResourceCount& amount) = 0;
   virtual ResourceCount ComputeResourceCount(const Graph&, size_t node_index) const = 0;
 
   std::optional<ResourceCount> GetThreshold() const {
     return threshold_;
   }
 
+  void SetStopAssignment() {
+    stop_assignment_ = true;
+  }
+
+  bool IsStopIssued() const noexcept { return stop_assignment_; }
+
  private:
+  bool stop_assignment_ = false;
   std::optional<ResourceCount> threshold_;
 };
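
For orientation, here is a minimal caller-side sketch of how the new flag is meant to be used. It is not part of this commit; only the IResourceAccountant methods shown above are real, and the helper name and threshold decision are illustrative.

// Illustrative helper: account for a node, or stop assignment for good.
void AccountNodeOrStop(IResourceAccountant& accountant,
                       const ResourceCount& node_cost,
                       bool would_exceed_threshold) {
  if (accountant.IsStopIssued()) {
    return;  // an earlier pass already hit the limit; take no more nodes
  }
  if (would_exceed_threshold) {
    accountant.SetStopAssignment();  // sticky: halts this and later passes
    return;
  }
  accountant.AddConsumedAmount(node_cost);  // charge the accepted node
}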

@@ -199,6 +199,7 @@ static const char* const kNodePartitionConfigFile = "session.node_partition_conf
 
 /// "number > 0": enables Capacity Aware Partitioning for Cuda EP. The EP will place nodes on device
 /// until the device memory usage reaches the specified threshold in Kb. The default value is 0.
 /// "0": disables Capacity Aware Partitioning for Cuda EP. The EP will place nodes on device based on the default policy.
+/// "auto": enables automatic resource accounting; the EP derives the threshold itself instead of using a fixed limit.
static const char* const kOrtSessionOptionsConfigPartitionSetCudaMemoryLimitKb = "session.node_partition_cuda_memory_limit_kb";
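
As a usage sketch (not part of this commit; it assumes the public C++ API's AddConfigEntry), the option can be set per session like so:

#include <onnxruntime_cxx_api.h>

Ort::SessionOptions session_options;
// Fixed budget: stop placing nodes on the CUDA EP once roughly 1 GiB
// (1048576 KiB) of device memory has been accounted for.
session_options.AddConfigEntry("session.node_partition_cuda_memory_limit_kb",
                               "1048576");
// Or, with this commit, let the EP derive the threshold automatically:
// session_options.AddConfigEntry("session.node_partition_cuda_memory_limit_kb",
//                                "auto");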

32 changes: 14 additions & 18 deletions onnxruntime/core/framework/graph_partitioner.cc
@@ -823,8 +823,7 @@ static Status PartitionOnnxFormatModel(const PartitionParams& partition_params,
 
 static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_params,
                                           KernelRegistryManager& kernel_registry_mgr,
-                                          IExecutionProvider& current_ep,
-                                          IResourceAccountant* resource_accountant) {
+                                          IExecutionProvider& current_ep) {
   // handle testing edge case where optimizers or constant lifting results in graph with no nodes.
   // doing it here saves all providers checking for this in GetCapability
   auto& graph = partition_params.graph.get();
@@ -839,7 +838,7 @@ static Status PartitionOrtFormatModelImpl(const PartitionParam
       PartitionParams subgraph_partition_params = partition_params;
       subgraph_partition_params.graph = std::ref(subgraph);
       ORT_RETURN_IF_ERROR(PartitionOrtFormatModelImpl(subgraph_partition_params, kernel_registry_mgr,
-                                                      current_ep, resource_accountant));
+                                                      current_ep));
     }
   }

Expand All @@ -855,7 +854,7 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param
std::cref(partition_params.transform_layout_function),
std::cref(partition_params.debug_graph_fn),
#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
resource_accountant
nullptr
};
// clang-format on

@@ -947,17 +946,10 @@ static Status PartitionOrtFormatModelImpl(const PartitionParam
 // Simplified partitioning where custom EPs may produce compiled nodes.
 static Status PartitionOrtFormatModel(const PartitionParams& partition_params,
                                       const ExecutionProviders& execution_providers,
-                                      KernelRegistryManager& kernel_registry_manager,
-                                      const ResourceAccountantMap& acc_map) {
+                                      KernelRegistryManager& kernel_registry_manager) {
   // process full graph with each EP
   for (const auto& ep : execution_providers) {
-    IResourceAccountant* resource_accountant = nullptr;
-    auto hit = acc_map.find(ep->Type());
-    if (hit != acc_map.end()) {
-      resource_accountant = hit->second.get();
-    }
-    ORT_RETURN_IF_ERROR(PartitionOrtFormatModelImpl(partition_params, kernel_registry_manager, *ep,
-                                                    resource_accountant));
+    ORT_RETURN_IF_ERROR(PartitionOrtFormatModelImpl(partition_params, kernel_registry_manager, *ep));
   }
 
   return Status::OK();
@@ -1056,11 +1048,15 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
   // We use this only if Resource Aware Partitioning is enabled for any of the EPs
   ResourceAccountantMap ep_acc_map;
   // Zero, it is disabled by default
-  std::string cuda_memory_limit_config = config_options.GetConfigOrDefault(kOrtSessionOptionsConfigPartitionSetCudaMemoryLimitKb, "0");
+  const std::string cuda_memory_limit_config = config_options.GetConfigOrDefault(kOrtSessionOptionsConfigPartitionSetCudaMemoryLimitKb, "0");
   if (cuda_memory_limit_config != "0") {
-    SafeInt<size_t> cuda_memory_limit = std::stoi(cuda_memory_limit_config);
-    cuda_memory_limit *= 1024;
-    ep_acc_map[kCudaExecutionProvider] = std::make_unique<SizeTAccountant>(cuda_memory_limit);
+    if (cuda_memory_limit_config == "auto") {
+      ep_acc_map[kCudaExecutionProvider] = std::make_unique<SizeTAccountant>();
+    } else {
+      SafeInt<size_t> cuda_memory_limit = std::stoi(cuda_memory_limit_config);
+      cuda_memory_limit *= 1024;
+      ep_acc_map[kCudaExecutionProvider] = std::make_unique<SizeTAccountant>(cuda_memory_limit);
+    }
   }
 
   if (mode == Mode::kNormal || mode == Mode::kAssignOnly) {
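
One caveat worth noting in the parsing above, shown in isolation (illustrative only; the SafeInt include path is assumed): the configured value is KiB, the accountant works in bytes, and std::stoi caps the parsable value at INT_MAX. The "auto" branch matters here, since std::stoi would throw std::invalid_argument on a non-numeric value.

#include <string>

#include "core/common/safeint.h"  // assumed location of SafeInt

size_t ParseCudaLimitKbToBytes(const std::string& config_value) {
  // std::stoi parses into int, so budgets above INT_MAX KiB (~2 TiB)
  // throw std::out_of_range; std::stoull would lift that cap.
  SafeInt<size_t> bytes = std::stoi(config_value);
  bytes *= 1024;  // the config key is expressed in KiB
  return bytes;
}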
@@ -1082,7 +1078,7 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
 #endif  //! defined(ORT_MINIMAL_BUILD)
   } else {
     ORT_RETURN_IF_ERROR(PartitionOrtFormatModel(partition_params,
-                                                providers_, kernel_registry_mgr_, ep_acc_map));
+                                                providers_, kernel_registry_mgr_));
   }
 
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
28 changes: 18 additions & 10 deletions onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -2582,6 +2582,11 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
   size_t memory_threshold = std::numeric_limits<size_t>::max();
   SafeInt<size_t> consumed_memory = 0;
   if (resource_accountant != nullptr) {
+    if (resource_accountant->IsStopIssued()) {
+      LOGS(logger, WARNING) << "CUDA_EP returning due to Stop Set";
+      return result;
+    }
+
     auto threshold = resource_accountant->GetThreshold();
     if (!threshold.has_value()) {
       // info_.gpu_mem_limit is for BFC arena
@@ -2596,11 +2601,6 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
     }
 
     consumed_memory = std::get<0>(resource_accountant->GetConsumedAmount());
-    // Return early if already over the limit
-    if (static_cast<size_t>(consumed_memory) > memory_threshold) {
-      LOGS(logger, INFO) << "CUDA EP returning early due to capacity threshold";
-      return result;
-    }
   }
 
   InlinedHashSet<NodeIndex> previously_assigned_nodes;
@@ -2677,15 +2677,22 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
 
     // Previously assigned nodes have been accounted before
     if (previously_assigned_nodes.count(node_index) > 0 || resource_accountant == nullptr) {
+      // XXX: For demo only
+      constexpr const size_t kNodeCountThreshold = 800;
+      static std::atomic_size_t nodes_assigned = 0;
+      if (nodes_assigned.fetch_add(1) > kNodeCountThreshold) {
+        ORT_THROW("CUDA EP is running out of memory");
+      }
+      /// XXX: End of DEMO
       auto sub_graph = IndexedSubGraph::Create();
       sub_graph->Nodes().push_back(node_index);
       result.push_back(ComputeCapability::Create(std::move(sub_graph)));
     } else {
       auto resource_count = std::get<0>(resource_accountant->ComputeResourceCount(graph.GetGraph(), node_index));
       const auto would_be_consumed = resource_count + consumed_memory;
-      LOGS(logger, VERBOSE) << "Node: " << node_index << " Memory usage : " << resource_count
-                            << " would be consumed " << static_cast<size_t>(would_be_consumed)
-                            << " threshold: " << memory_threshold;
+      LOGS(logger, INFO) << "CUDA_EP Node: " << node_index << " Memory usage : " << resource_count
+                         << " would be consumed " << static_cast<size_t>(would_be_consumed)
+                         << " threshold: " << memory_threshold;
       if (would_be_consumed < memory_threshold) {
         consumed_memory = would_be_consumed;
         auto sub_graph = IndexedSubGraph::Create();
@@ -2697,9 +2704,10 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
         // We break here so we do not have patches of CUDA assigned nodes.
         auto* node = graph.GetNode(node_index);
         if (node != nullptr) {
-          LOGS(logger, INFO) << "CUDA EP Halting assignment due to capacity threshold at node: "
-                             << node->Name() << " index: " << node_index;
+          LOGS(logger, WARNING) << "CUDA_EP Halting assignment due to capacity threshold at node: "
+                                << node->Name() << " index: " << node_index;
         }
+        resource_accountant->SetStopAssignment();
         break;
       }
     }
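
Taken together, the CUDA-side changes make the halt sticky and whole: the pass that would cross the threshold issues a stop and breaks, and every later GetCapability pass returns empty immediately instead of producing patches of assigned nodes. A compressed, self-contained model of that control flow (all types and names here are stand-ins, not ORT declarations):

#include <cstddef>
#include <vector>

// Stand-in for the accountant's stop flag and byte accounting.
struct ToyAccountant {
  bool stop = false;
  std::size_t used = 0;
  std::size_t limit = 1u << 20;  // 1 MiB budget for the example
};

// One GetCapability-style pass over per-node memory costs; returns the
// indices of the nodes this pass would take.
std::vector<std::size_t> AssignNodes(ToyAccountant& acc,
                                     const std::vector<std::size_t>& costs) {
  std::vector<std::size_t> assigned;
  if (acc.stop) return assigned;       // IsStopIssued(): bail out early
  for (std::size_t i = 0; i < costs.size(); ++i) {
    if (acc.used + costs[i] >= acc.limit) {
      acc.stop = true;                 // SetStopAssignment()
      break;                           // no patchy assignment after this
    }
    acc.used += costs[i];
    assigned.push_back(i);
  }
  return assigned;
}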
