Commit 38e1bbc
Place shape related compute nodes in CPU (#4940) (#5350)
* Place shape related nodes in CPU
* Visit candidates in topological order
* Make CPU node placement a utility function
* Skip placing on CPU if the data type is float16 or bfloat16

Co-authored-by: Sherlock <[email protected]>
1 parent 5de47af commit 38e1bbc

3 files changed: +184 -56 lines changed

include/onnxruntime/core/graph/graph_viewer.h

Lines changed: 12 additions & 0 deletions
@@ -125,6 +125,18 @@ class GraphViewer {
   /** Get the Node containing this Graph if IsSubgraph is true. Returns nullptr otherwise. */
   const Node* ParentNode() const noexcept { return graph_->ParentNode(); }

+#if !defined(ORT_MINIMAL_BUILD)
+  /** Get the consumer nodes of a node arg */
+  std::vector<const Node*> GetConsumerNodes(const std::string& node_arg_name) const {
+    return graph_->GetConsumerNodes(node_arg_name);
+  }
+
+  /** Get the producer node of a node arg */
+  const Node* GetProducerNode(const std::string& node_arg_name) const {
+    return graph_->GetProducerNode(node_arg_name);
+  }
+#endif
+
  private:
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(GraphViewer);
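For orientation, here is a brief editorial sketch (not part of the commit) of how the two accessors added above can be used to walk from a tensor name to its producer and consumers. It assumes a valid GraphViewer in a build where ORT_MINIMAL_BUILD is not defined; the DumpProducerAndConsumers helper name is made up for illustration.

// Illustrative sketch only: walk from a node arg name to its producer and consumers
// using the accessors added in this commit.
#include <iostream>
#include <string>
#include "core/graph/graph_viewer.h"

void DumpProducerAndConsumers(const onnxruntime::GraphViewer& graph,
                              const std::string& node_arg_name) {
  // Producer: the node that writes this node arg (may be nullptr, e.g. for graph inputs or initializers).
  if (const onnxruntime::Node* producer = graph.GetProducerNode(node_arg_name)) {
    std::cout << node_arg_name << " is produced by " << producer->Name() << "\n";
  }
  // Consumers: every node that reads this node arg.
  for (const onnxruntime::Node* consumer : graph.GetConsumerNodes(node_arg_name)) {
    std::cout << node_arg_name << " is consumed by " << consumer->Name() << "\n";
  }
}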

Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once
#include "core/graph/graph_viewer.h"
#include "onnx/defs/data_type_utils.h"
#include <queue>

using namespace ONNX_NAMESPACE::Utils;

namespace onnxruntime {

namespace {
const int64_t Small_Initializer_Threshold = 100;

bool IsSmallInitializerWithSingleConsumer(const onnxruntime::GraphViewer& graph, const NodeArg* arg) {
  const ONNX_NAMESPACE::TensorProto* initializer_tensor;
  if (!graph.GetInitializedTensor(arg->Name(), initializer_tensor))
    return false;
  int64_t size = 1;
  for (auto& dim : initializer_tensor->dims()) {
    size *= dim;
  }
  return size <= Small_Initializer_Threshold &&
         graph.GetConsumerNodes(arg->Name()).size() == 1;
}
}  // namespace

/**
  Returns a list of nodes that are preferred on CPU.
  They are commonly shape-related computation subgraphs.
  @param graph Graph viewer
  @param provider_type The target execution provider type
  @param kernel_registries Kernel registries for the target EP
  @param tentative_nodes Nodes that are tentatively placed on the target EP
*/
std::unordered_set<NodeIndex> GetCpuPreferedNodes(const onnxruntime::GraphViewer& graph,
                                                  const std::string& provider_type,
                                                  const std::vector<const KernelRegistry*>& kernel_registries,
                                                  const std::vector<NodeIndex>& tentative_nodes) {
  const std::vector<NodeIndex>& ordered_nodes = graph.GetNodesInTopologicalOrder();
  std::vector<size_t> node_id_to_order_map(graph.MaxNodeIndex());
  for (size_t id = 0; id < ordered_nodes.size(); ++id) {
    const NodeIndex& node_id = ordered_nodes[id];
    node_id_to_order_map[node_id] = id;
  }

  // If this returns false, n1 will be output first; if it returns true, n2 will be output first.
  auto greater_order_comp = [&](const NodeIndex n1, const NodeIndex n2) {
    return node_id_to_order_map[n1] > node_id_to_order_map[n2];
  };

  std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates(greater_order_comp);
  std::unordered_set<NodeIndex> visited;

  std::unordered_set<const NodeArg*> cpu_output_args;
  std::unordered_set<NodeIndex> provider_nodes;
  std::unordered_map<NodeIndex, const KernelCreateInfo*> node_to_kernel;

  for (auto& node_id : tentative_nodes) {
    provider_nodes.insert(node_id);
    const Node* node = graph.GetNode(node_id);

    const KernelCreateInfo* kernel_info = nullptr;
    for (auto registry : kernel_registries) {
      auto st = registry->TryFindKernel(*node, provider_type, &kernel_info);
      if (st.IsOK())
        break;
    }
    // at least one registry has a target provider's kernel for this node
    ORT_ENFORCE(kernel_info != nullptr);
    node_to_kernel.insert({node_id, kernel_info});

    // first, find all the direct consumers of cpu tensors.
    ORT_THROW_IF_ERROR(node->ForEachWithIndex(
        node->OutputDefs(),
        [&](const NodeArg& node_arg, size_t out_index) {
          if (kernel_info->kernel_def->IsOutputOnCpu(out_index)) {
            cpu_output_args.insert(&node_arg);
            auto consumer_nodes = graph.GetConsumerNodes(node_arg.Name());
            for (auto& consumer_node : consumer_nodes) {
              candidates.push(consumer_node->Index());
              LOGS_DEFAULT(INFO) << "Candidate for fallback CPU execution: " << consumer_node->Name();
            }
          }
          return Status::OK();
        }));
  }

  const std::vector<const NodeArg*>& graph_inputs = graph.GetInputs();
  std::unordered_set<NodeIndex> cpu_nodes;
  // The algorithm below tries to identify a subgraph that only depends on cpu tensors.
  // Usually it is a subgraph that does shape calculation based on a GPU tensor, then reshapes it back.
  // In detail: for each candidate, if one of its inputs is a cpu tensor and the non-CPU kernel
  // doesn't mark it as a cpu input, force the node to CPU to avoid a memory copy, and add its
  // outputs to the set of cpu tensors.
  while (!candidates.empty()) {
    NodeIndex cur = candidates.top();
    candidates.pop();
    if (visited.count(cur) != 0)
      continue;
    visited.insert(cur);

    if (provider_nodes.find(cur) == provider_nodes.end())
      continue;

    auto* node = graph.GetNode(cur);
    bool place_in_cpu = true;
    for (size_t i = 0; i < node->InputDefs().size(); ++i) {
      auto* input = node->InputDefs()[i];

      // skip placing on CPU if the data type is float16 or bfloat16
      if (input->Type() == DataTypeUtils::ToType("float16") ||
          input->Type() == DataTypeUtils::ToType("bfloat16")) {
        place_in_cpu = false;
        break;
      }

      // allow placing on CPU if it's a small initializer or graph input
      if (IsSmallInitializerWithSingleConsumer(graph, input) ||
          std::find(graph_inputs.begin(), graph_inputs.end(), input) != graph_inputs.end()) {
        continue;
      }

      // the input is not a CPU tensor
      if (cpu_output_args.find(input) == cpu_output_args.end()) {
        place_in_cpu = false;
        break;
      }

      // the input is a CPU tensor, but it's intended to be consumed as a CPU input by the target EP
      if (node_to_kernel[cur]->kernel_def->IsInputOnCpu(i)) {
        place_in_cpu = false;
        break;
      }
    }

    if (place_in_cpu) {
      cpu_nodes.insert(cur);
      LOGS_DEFAULT(WARNING) << "Force fallback to CPU execution for node: " << node->Name();
      for (auto* output : node->OutputDefs()) {
        cpu_output_args.insert(output);
      }
      for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) {
        candidates.push((*it).Index());
      }
    }
  }

  return cpu_nodes;
}

}  // namespace onnxruntime
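The candidate traversal above is driven by a std::priority_queue ordered by topological position rather than by raw NodeIndex, which is what the commit message means by visiting candidates in topological order. The following self-contained, editorial snippet (not from the commit; plain size_t values stand in for NodeIndex) shows how a comparator like greater_order_comp makes the queue pop candidates in topological order regardless of push order.

// Standalone illustration: candidates come out of the priority queue in topological order.
#include <cstddef>
#include <iostream>
#include <queue>
#include <vector>

int main() {
  // ordered_nodes[i] holds the node id at topological position i,
  // mirroring GraphViewer::GetNodesInTopologicalOrder().
  std::vector<size_t> ordered_nodes = {7, 3, 5, 0, 2};

  // node_id_to_order_map[node_id] = topological position of that node.
  std::vector<size_t> node_id_to_order_map(8);
  for (size_t i = 0; i < ordered_nodes.size(); ++i)
    node_id_to_order_map[ordered_nodes[i]] = i;

  // Same shape as greater_order_comp above: the node with the smaller
  // topological position has higher priority.
  auto greater_order_comp = [&](size_t n1, size_t n2) {
    return node_id_to_order_map[n1] > node_id_to_order_map[n2];
  };
  std::priority_queue<size_t, std::vector<size_t>, decltype(greater_order_comp)> candidates(greater_order_comp);

  // Push in arbitrary order...
  for (size_t id : {2, 7, 5})
    candidates.push(id);

  // ...pop in topological order: prints "7 5 2".
  while (!candidates.empty()) {
    std::cout << candidates.top() << " ";
    candidates.pop();
  }
  std::cout << "\n";
  return 0;
}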

onnxruntime/core/providers/cuda/cuda_execution_provider.cc

Lines changed: 19 additions & 56 deletions
@@ -7,6 +7,7 @@
 #include "cuda_allocator.h"
 #include "core/framework/kernel_registry.h"
 #include "core/framework/compute_capability.h"
+#include "core/framework/fallback_cpu_capability.h"
 #include "core/framework/memcpy.h"
 #include "core/graph/graph_utils.h"
 #include "core/providers/cuda/gpu_data_transfer.h"

@@ -1822,9 +1823,7 @@ std::unique_ptr<onnxruntime::IDataTransfer> CUDAExecutionProvider::GetDataTransf
 std::vector<std::unique_ptr<ComputeCapability>>
 CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
                                      const std::vector<const KernelRegistry*>& kernel_registries) const {
-  std::vector<std::unique_ptr<ComputeCapability>> result;
-  std::unordered_set<const NodeArg*> defs_outside_cuda;
-
+  std::vector<NodeIndex> candidates;
   for (auto& node_index : graph.GetNodesInTopologicalOrder()) {
     const auto* p_node = graph.GetNode(node_index);
     if (p_node == nullptr)

@@ -1833,7 +1832,6 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
     const auto& node = *p_node;
     const KernelCreateInfo* cuda_kernel_def = nullptr;
     if (!node.GetExecutionProviderType().empty()) {
-      defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
       continue;
     }

@@ -1847,14 +1845,10 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,

     // none of the provided registries has a CUDA kernel for this node
     if (cuda_kernel_def == nullptr) {
-      // node is not in cuda exeuction provider if no kernel def found,
-      // or if other execution provider already assigned to it
-      defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
       continue;
     }

     bool not_supported = false;
-    bool force_outside = false;
     bool force_inside = false;  // for some compute heavy ops, we'll force it to run inside CUDA
     if ("LSTM" == node.OpType()) {
       // the supported activations covers the bidirectional mode

@@ -1877,60 +1871,29 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
       // cast is not compute heavy, and may be placed outside
     }

-    //Below rule only works for inference, for training, we can't do constant folding.
-    //We need find a better solution.
-    //Temporary disable the check here, the cost is all the cast will be on GPU now.
-#ifndef ENABLE_TRAINING
-    if (!not_supported && !force_inside) {
-      // Note that nodes with only inputs from initializer would not be place on CUDA
-      // Ideally, those nodes should be eliminated in constant folding
-      bool should_force_outside = true;
-      bool all_inputs_are_initializers = true;
-      ORT_THROW_IF_ERROR(node.ForEachWithIndex(node.InputDefs(),
-                                               [&](const NodeArg& def, size_t index) {
-                                                 // The input is not a initializer and the input is from CPU
-                                                 // or the input declared as CPU memory and is from CPU
-                                                 // in that case we should still keep the node on CUDA
-                                                 bool initializer_input = graph.IsConstantInitializer(def.Name(), /*check_outer_scope*/ true);
-                                                 bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0;
-                                                 if ((!initializer_input && !input_is_on_cpu) ||
-                                                     (input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index))) {
-                                                   should_force_outside = false;
-                                                 }
-
-                                                 if (!initializer_input) {
-                                                   all_inputs_are_initializers = false;
-                                                 }
-                                                 return Status::OK();
-                                               }));
-
-      // If all the inputs are initializers, we shouldn't force it to CPU
-      if (should_force_outside && !all_inputs_are_initializers) {
-        force_outside = true;
-      }
-    }
-#endif
-    if (!force_inside && (not_supported || force_outside)) {
-      defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
+    if (!force_inside && not_supported) {
       if (not_supported) {
         LOGS_DEFAULT(WARNING) << "CUDA kernel not supported. Fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name();
-      } else if (force_outside) {
-        LOGS_DEFAULT(INFO) << "Force fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name();
       }
     } else {
-      // for nodes placed on CUDA, check if its output is on CPU
-      ORT_THROW_IF_ERROR(node.ForEachWithIndex(
-          node.OutputDefs(),
-          [&](const NodeArg& def, size_t out_index) {
-            if (cuda_kernel_def->kernel_def->OutputMemoryType(out_index) != OrtMemTypeDefault)
-              defs_outside_cuda.insert(&def);
-            return Status::OK();
-          }));
-      std::unique_ptr<IndexedSubGraph> sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
-      sub_graph->nodes.push_back(node.Index());
-      result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
+      candidates.push_back(node.Index());
     }
   }
+
+  // For the CUDA EP, exclude the subgraphs that are preferred to be placed on CPU.
+  // These are usually shape related computation subgraphs.
+  // The following logic can be extended to other EPs.
+  std::unordered_set<NodeIndex> cpu_nodes = GetCpuPreferedNodes(graph, Type(), kernel_registries, candidates);
+
+  std::vector<std::unique_ptr<ComputeCapability>> result;
+  for (auto& node_index : candidates) {
+    if (cpu_nodes.count(node_index) > 0)
+      continue;
+
+    std::unique_ptr<IndexedSubGraph> sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
+    sub_graph->nodes.push_back(node_index);
+    result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
+  }
   return result;
 }
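As the added comment notes, the same logic can be extended to other EPs. Below is a hedged editorial sketch, not part of the commit, of how another execution provider's GetCapability could reuse GetCpuPreferedNodes from the "core/framework/fallback_cpu_capability.h" header included above. MyExecutionProvider and the HasKernelFor helper are hypothetical names; the rest mirrors the CUDA change.

// Sketch only: reusing the fallback utility from another EP's GetCapability.
#include "core/framework/compute_capability.h"
#include "core/framework/fallback_cpu_capability.h"

std::vector<std::unique_ptr<ComputeCapability>>
MyExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
                                   const std::vector<const KernelRegistry*>& kernel_registries) const {
  // 1. Collect every node this EP could tentatively run.
  std::vector<NodeIndex> candidates;
  for (auto& node_index : graph.GetNodesInTopologicalOrder()) {
    const auto* p_node = graph.GetNode(node_index);
    if (p_node != nullptr && HasKernelFor(*p_node, kernel_registries))  // hypothetical helper
      candidates.push_back(node_index);
  }

  // 2. Let the shared utility pick out shape-related subgraphs that should stay on CPU.
  std::unordered_set<NodeIndex> cpu_nodes = GetCpuPreferedNodes(graph, Type(), kernel_registries, candidates);

  // 3. Claim only the candidates that were not pulled back to CPU.
  std::vector<std::unique_ptr<ComputeCapability>> result;
  for (auto& node_index : candidates) {
    if (cpu_nodes.count(node_index) > 0)
      continue;
    auto sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
    sub_graph->nodes.push_back(node_index);
    result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
  }
  return result;
}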
