Commit b3b819b

DEMO changes

yuslepukhin committed Nov 21, 2024
1 parent 42e7bf6 commit b3b819b
Showing 4 changed files with 41 additions and 28 deletions.
8 changes: 8 additions & 0 deletions include/onnxruntime/core/framework/resource_accountant.h
@@ -37,11 +37,19 @@ class IResourceAccountant {
   virtual void AddConsumedAmount(const ResourceCount& amount) = 0;
   virtual void RemoveConsumedAmount(const ResourceCount& amount) = 0;
   virtual ResourceCount ComputeResourceCount(const Graph&, size_t node_index) const = 0;
 
   std::optional<ResourceCount> GetThreshold() const {
     return threshold_;
   }
 
+  void SetStopAssignment() {
+    stop_assignment_ = true;
+  }
+
+  bool IsStopIssued() const noexcept { return stop_assignment_; }
+
  private:
+  bool stop_assignment_ = false;
   std::optional<ResourceCount> threshold_;
 };
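
For orientation, here is a minimal caller-side sketch of how the new flag is meant to be used. It is not part of this commit; only the IResourceAccountant methods shown above are real, and the helper name and threshold decision are illustrative.

// Illustrative helper: account for a node, or stop assignment for good.
void AccountNodeOrStop(IResourceAccountant& accountant,
                       const ResourceCount& node_cost,
                       bool would_exceed_threshold) {
  if (accountant.IsStopIssued()) {
    return;  // an earlier pass already hit the limit; take no more nodes
  }
  if (would_exceed_threshold) {
    accountant.SetStopAssignment();  // sticky: halts this and later passes
    return;
  }
  accountant.AddConsumedAmount(node_cost);  // charge the accepted node
}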

@@ -199,6 +199,7 @@ static const char* const kNodePartitionConfigFile = "session.node_partition_conf
 
 /// "number > 0": enables Capacity Aware Partitioning for Cuda EP. The EP will place nodes on device
 /// until the device memory usage reaches the specified threshold in Kb. The default value is 0.
 /// "0": disables Capacity Aware Partitioning for Cuda EP. The EP will place nodes on device based on the default policy.
+/// "auto": enables automatic resource accounting; the EP derives the threshold itself instead of using a fixed limit.
static const char* const kOrtSessionOptionsConfigPartitionSetCudaMemoryLimitKb = "session.node_partition_cuda_memory_limit_kb";
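
As a usage sketch (not part of this commit; it assumes the public C++ API's AddConfigEntry), the option can be set per session like so:

#include <onnxruntime_cxx_api.h>

Ort::SessionOptions session_options;
// Fixed budget: stop placing nodes on the CUDA EP once roughly 1 GiB
// (1048576 KiB) of device memory has been accounted for.
session_options.AddConfigEntry("session.node_partition_cuda_memory_limit_kb",
                               "1048576");
// Or, with this commit, let the EP derive the threshold automatically:
// session_options.AddConfigEntry("session.node_partition_cuda_memory_limit_kb",
//                                "auto");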

32 changes: 14 additions & 18 deletions onnxruntime/core/framework/graph_partitioner.cc
@@ -823,8 +823,7 @@ static Status PartitionOnnxFormatModel(const PartitionParams& partition_params,
 
 static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_params,
                                           KernelRegistryManager& kernel_registry_mgr,
-                                          IExecutionProvider& current_ep,
-                                          IResourceAccountant* resource_accountant) {
+                                          IExecutionProvider& current_ep) {
   // handle testing edge case where optimizers or constant lifting results in graph with no nodes.
   // doing it here saves all providers checking for this in GetCapability
   auto& graph = partition_params.graph.get();
@@ -839,7 +838,7 @@ static Status PartitionOrtFormatModelImpl(const PartitionParam
       PartitionParams subgraph_partition_params = partition_params;
       subgraph_partition_params.graph = std::ref(subgraph);
       ORT_RETURN_IF_ERROR(PartitionOrtFormatModelImpl(subgraph_partition_params, kernel_registry_mgr,
-                                                      current_ep, resource_accountant));
+                                                      current_ep));
     }
   }

Expand All @@ -855,7 +854,7 @@ static Status PartitionOrtFormatModelImpl(const PartitionParams& partition_param
std::cref(partition_params.transform_layout_function),
std::cref(partition_params.debug_graph_fn),
#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
resource_accountant
nullptr
};
// clang-format on

@@ -947,17 +946,10 @@ static Status PartitionOrtFormatModelImpl(const PartitionParam
 // Simplified partitioning where custom EPs may produce compiled nodes.
 static Status PartitionOrtFormatModel(const PartitionParams& partition_params,
                                       const ExecutionProviders& execution_providers,
-                                      KernelRegistryManager& kernel_registry_manager,
-                                      const ResourceAccountantMap& acc_map) {
+                                      KernelRegistryManager& kernel_registry_manager) {
   // process full graph with each EP
   for (const auto& ep : execution_providers) {
-    IResourceAccountant* resource_accountant = nullptr;
-    auto hit = acc_map.find(ep->Type());
-    if (hit != acc_map.end()) {
-      resource_accountant = hit->second.get();
-    }
-    ORT_RETURN_IF_ERROR(PartitionOrtFormatModelImpl(partition_params, kernel_registry_manager, *ep,
-                                                    resource_accountant));
+    ORT_RETURN_IF_ERROR(PartitionOrtFormatModelImpl(partition_params, kernel_registry_manager, *ep));
   }
 
   return Status::OK();
@@ -1056,11 +1048,15 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
   // We use this only if Resource Aware Partitioning is enabled for any of the EPs
   ResourceAccountantMap ep_acc_map;
   // Zero, it is disabled by default
-  std::string cuda_memory_limit_config = config_options.GetConfigOrDefault(kOrtSessionOptionsConfigPartitionSetCudaMemoryLimitKb, "0");
+  const std::string cuda_memory_limit_config = config_options.GetConfigOrDefault(kOrtSessionOptionsConfigPartitionSetCudaMemoryLimitKb, "0");
   if (cuda_memory_limit_config != "0") {
-    SafeInt<size_t> cuda_memory_limit = std::stoi(cuda_memory_limit_config);
-    cuda_memory_limit *= 1024;
-    ep_acc_map[kCudaExecutionProvider] = std::make_unique<SizeTAccountant>(cuda_memory_limit);
+    if (cuda_memory_limit_config == "auto") {
+      ep_acc_map[kCudaExecutionProvider] = std::make_unique<SizeTAccountant>();
+    } else {
+      SafeInt<size_t> cuda_memory_limit = std::stoi(cuda_memory_limit_config);
+      cuda_memory_limit *= 1024;
+      ep_acc_map[kCudaExecutionProvider] = std::make_unique<SizeTAccountant>(cuda_memory_limit);
+    }
   }
 
   if (mode == Mode::kNormal || mode == Mode::kAssignOnly) {
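
One caveat worth noting in the parsing above, shown in isolation (illustrative only; the SafeInt include path is assumed): the configured value is KiB, the accountant works in bytes, and std::stoi caps the parsable value at INT_MAX. The "auto" branch matters here, since std::stoi would throw std::invalid_argument on a non-numeric value.

#include <string>

#include "core/common/safeint.h"  // assumed location of SafeInt

size_t ParseCudaLimitKbToBytes(const std::string& config_value) {
  // std::stoi parses into int, so budgets above INT_MAX KiB (~2 TiB)
  // throw std::out_of_range; std::stoull would lift that cap.
  SafeInt<size_t> bytes = std::stoi(config_value);
  bytes *= 1024;  // the config key is expressed in KiB
  return bytes;
}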
@@ -1082,7 +1078,7 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
 #endif  //! defined(ORT_MINIMAL_BUILD)
   } else {
     ORT_RETURN_IF_ERROR(PartitionOrtFormatModel(partition_params,
-                                                providers_, kernel_registry_mgr_, ep_acc_map));
+                                                providers_, kernel_registry_mgr_));
   }
 
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
28 changes: 18 additions & 10 deletions onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -2582,6 +2582,11 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
   size_t memory_threshold = std::numeric_limits<size_t>::max();
   SafeInt<size_t> consumed_memory = 0;
   if (resource_accountant != nullptr) {
+    if (resource_accountant->IsStopIssued()) {
+      LOGS(logger, WARNING) << "CUDA_EP returning due to Stop Set";
+      return result;
+    }
+
     auto threshold = resource_accountant->GetThreshold();
     if (!threshold.has_value()) {
       // info_.gpu_mem_limit is for BFC arena
@@ -2596,11 +2601,6 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
     }
 
     consumed_memory = std::get<0>(resource_accountant->GetConsumedAmount());
-    // Return early if already over the limit
-    if (static_cast<size_t>(consumed_memory) > memory_threshold) {
-      LOGS(logger, INFO) << "CUDA EP returning early due to capacity threshold";
-      return result;
-    }
   }
 
   InlinedHashSet<NodeIndex> previously_assigned_nodes;
@@ -2677,15 +2677,22 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
 
     // Previously assigned nodes have been accounted before
     if (previously_assigned_nodes.count(node_index) > 0 || resource_accountant == nullptr) {
+      // XXX: For demo only
+      constexpr const size_t kNodeCountThreshold = 800;
+      static std::atomic_size_t nodes_assigned = 0;
+      if (nodes_assigned.fetch_add(1) > kNodeCountThreshold) {
+        ORT_THROW("CUDA EP is running out of memory");
+      }
+      /// XXX: End of DEMO
       auto sub_graph = IndexedSubGraph::Create();
       sub_graph->Nodes().push_back(node_index);
       result.push_back(ComputeCapability::Create(std::move(sub_graph)));
     } else {
       auto resource_count = std::get<0>(resource_accountant->ComputeResourceCount(graph.GetGraph(), node_index));
       const auto would_be_consumed = resource_count + consumed_memory;
-      LOGS(logger, VERBOSE) << "Node: " << node_index << " Memory usage : " << resource_count
-                            << " would be consumed " << static_cast<size_t>(would_be_consumed)
-                            << " threshold: " << memory_threshold;
+      LOGS(logger, INFO) << "CUDA_EP Node: " << node_index << " Memory usage : " << resource_count
+                         << " would be consumed " << static_cast<size_t>(would_be_consumed)
+                         << " threshold: " << memory_threshold;
       if (would_be_consumed < memory_threshold) {
         consumed_memory = would_be_consumed;
         auto sub_graph = IndexedSubGraph::Create();
@@ -2697,9 +2704,10 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
         // We break here so we do not have patches of CUDA assigned nodes.
         auto* node = graph.GetNode(node_index);
         if (node != nullptr) {
-          LOGS(logger, INFO) << "CUDA EP Halting assignment due to capacity threshold at node: "
-                             << node->Name() << " index: " << node_index;
+          LOGS(logger, WARNING) << "CUDA_EP Halting assignment due to capacity threshold at node: "
+                                << node->Name() << " index: " << node_index;
         }
+        resource_accountant->SetStopAssignment();
         break;
       }
     }
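
Taken together, the CUDA-side changes make the halt sticky and whole: the pass that would cross the threshold issues a stop and breaks, and every later GetCapability pass returns empty immediately instead of producing patches of assigned nodes. A compressed, self-contained model of that control flow (all types and names here are stand-ins, not ORT declarations):

#include <cstddef>
#include <vector>

// Stand-in for the accountant's stop flag and byte accounting.
struct ToyAccountant {
  bool stop = false;
  std::size_t used = 0;
  std::size_t limit = 1u << 20;  // 1 MiB budget for the example
};

// One GetCapability-style pass over per-node memory costs; returns the
// indices of the nodes this pass would take.
std::vector<std::size_t> AssignNodes(ToyAccountant& acc,
                                     const std::vector<std::size_t>& costs) {
  std::vector<std::size_t> assigned;
  if (acc.stop) return assigned;       // IsStopIssued(): bail out early
  for (std::size_t i = 0; i < costs.size(); ++i) {
    if (acc.used + costs[i] >= acc.limit) {
      acc.stop = true;                 // SetStopAssignment()
      break;                           // no patchy assignment after this
    }
    acc.used += costs[i];
    assigned.push_back(i);
  }
  return assigned;
}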
