 #include "cuda_allocator.h"
 #include "core/framework/kernel_registry.h"
 #include "core/framework/compute_capability.h"
+#include "core/framework/fallback_cpu_capability.h"
 #include "core/framework/memcpy.h"
 #include "core/graph/graph_utils.h"
 #include "core/providers/cuda/gpu_data_transfer.h"
@@ -1822,9 +1823,7 @@ std::unique_ptr<onnxruntime::IDataTransfer> CUDAExecutionProvider::GetDataTransf
 std::vector<std::unique_ptr<ComputeCapability>>
 CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
                                      const std::vector<const KernelRegistry*>& kernel_registries) const {
-  std::vector<std::unique_ptr<ComputeCapability>> result;
-  std::unordered_set<const NodeArg*> defs_outside_cuda;
-
+  std::vector<NodeIndex> candidates;
   for (auto& node_index : graph.GetNodesInTopologicalOrder()) {
     const auto* p_node = graph.GetNode(node_index);
     if (p_node == nullptr)
@@ -1833,7 +1832,6 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
     const auto& node = *p_node;
     const KernelCreateInfo* cuda_kernel_def = nullptr;
     if (!node.GetExecutionProviderType().empty()) {
-      defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
       continue;
     }
 
@@ -1847,14 +1845,10 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
 
     // none of the provided registries has a CUDA kernel for this node
     if (cuda_kernel_def == nullptr) {
-      // node is not in cuda exeuction provider if no kernel def found,
-      // or if other execution provider already assigned to it
-      defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
       continue;
     }
 
     bool not_supported = false;
-    bool force_outside = false;
     bool force_inside = false;  // for some compute heavy ops, we'll force it to run inside CUDA
     if ("LSTM" == node.OpType()) {
       // the supported activations covers the bidirectional mode
@@ -1877,60 +1871,29 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
       // cast is not compute heavy, and may be placed outside
     }
 
-    // Below rule only works for inference, for training, we can't do constant folding.
-    // We need find a better solution.
-    // Temporary disable the check here, the cost is all the cast will be on GPU now.
-#ifndef ENABLE_TRAINING
-    if (!not_supported && !force_inside) {
-      // Note that nodes with only inputs from initializer would not be place on CUDA
-      // Ideally, those nodes should be eliminated in constant folding
-      bool should_force_outside = true;
-      bool all_inputs_are_initializers = true;
-      ORT_THROW_IF_ERROR(node.ForEachWithIndex(node.InputDefs(),
-                                               [&](const NodeArg& def, size_t index) {
-                                                 // The input is not a initializer and the input is from CPU
-                                                 // or the input declared as CPU memory and is from CPU
-                                                 // in that case we should still keep the node on CUDA
-                                                 bool initializer_input = graph.IsConstantInitializer(def.Name(), /*check_outer_scope*/ true);
-                                                 bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0;
-                                                 if ((!initializer_input && !input_is_on_cpu) ||
-                                                     (input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index))) {
-                                                   should_force_outside = false;
-                                                 }
-
-                                                 if (!initializer_input) {
-                                                   all_inputs_are_initializers = false;
-                                                 }
-                                                 return Status::OK();
-                                               }));
-
-      // If all the inputs are initializers, we shouldn't force it to CPU
-      if (should_force_outside && !all_inputs_are_initializers) {
-        force_outside = true;
-      }
-    }
-#endif
-    if (!force_inside && (not_supported || force_outside)) {
-      defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
+    if (!force_inside && not_supported) {
       if (not_supported) {
        LOGS_DEFAULT(WARNING) << "CUDA kernel not supported. Fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name();
-      } else if (force_outside) {
-        LOGS_DEFAULT(INFO) << "Force fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name();
      }
    } else {
-      // for nodes placed on CUDA, check if its output is on CPU
-      ORT_THROW_IF_ERROR(node.ForEachWithIndex(
-          node.OutputDefs(),
-          [&](const NodeArg& def, size_t out_index) {
-            if (cuda_kernel_def->kernel_def->OutputMemoryType(out_index) != OrtMemTypeDefault)
-              defs_outside_cuda.insert(&def);
-            return Status::OK();
-          }));
-      std::unique_ptr<IndexedSubGraph> sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
-      sub_graph->nodes.push_back(node.Index());
-      result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
+      candidates.push_back(node.Index());
    }
  }
+
+  // For CUDA EP, exclude the subgraph that is preferred to be placed in CPU
+  // These are usually shape related computation subgraphs
+  // Following logic can be extended for other EPs
+  std::unordered_set<NodeIndex> cpu_nodes = GetCpuPreferedNodes(graph, Type(), kernel_registries, candidates);
+
+  std::vector<std::unique_ptr<ComputeCapability>> result;
+  for (auto& node_index : candidates) {
+    if (cpu_nodes.count(node_index) > 0)
+      continue;
+
+    std::unique_ptr<IndexedSubGraph> sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
+    sub_graph->nodes.push_back(node_index);
+    result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
+  }
  return result;
}
 
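For reference, the diff turns GetCapability into a two-pass claim: first collect every node the CUDA EP has a kernel for, then drop whatever the shared CPU-fallback helper flags as CPU-preferred, and only then emit one single-node ComputeCapability per remaining node. Below is a minimal standalone sketch of that pattern, not ONNX Runtime code: node indices are plain size_t, and IsShapeRelated()/GetCpuPreferredSubset() are hypothetical stand-ins for the heuristic the real GetCpuPreferedNodes() in fallback_cpu_capability.h implements.

```cpp
#include <cstddef>
#include <iostream>
#include <unordered_set>
#include <vector>

using NodeIndex = std::size_t;

// Hypothetical heuristic: pretend even-numbered nodes are cheap, shape-related
// computation that is better left on the CPU.
bool IsShapeRelated(NodeIndex n) { return n % 2 == 0; }

// Mirrors the shape of the shared helper: given all nodes the GPU EP *could*
// take, return the subset it should decline.
std::unordered_set<NodeIndex> GetCpuPreferredSubset(const std::vector<NodeIndex>& candidates) {
  std::unordered_set<NodeIndex> cpu_nodes;
  for (NodeIndex n : candidates) {
    if (IsShapeRelated(n)) cpu_nodes.insert(n);
  }
  return cpu_nodes;
}

int main() {
  // Pass 1: collect every node the (pretend) GPU EP has a kernel for.
  std::vector<NodeIndex> candidates = {1, 2, 3, 4, 5};

  // Pass 2: ask the shared helper which candidates are better left on CPU.
  std::unordered_set<NodeIndex> cpu_nodes = GetCpuPreferredSubset(candidates);

  // Pass 3: claim only the remaining nodes, one per capability.
  for (NodeIndex n : candidates) {
    if (cpu_nodes.count(n) > 0) continue;
    std::cout << "claimed node " << n << "\n";
  }
  return 0;
}
```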