Commit f16036b
[DML EP] Prefer MatMulInteger over MatMulIntegerToFloat in case of (#22469)
Description: Skip the `MatMulIntegerToFloat` fusion on the DML EP for cases where the model applies quantization (DynamicQuantizeLinear) before `MatMulInteger` and the tensors involved are large. This is mainly done to be resource efficient, and DML has better `MatMulInteger` Metacommand coverage, which computes in the int data type.
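For orientation, here is a minimal Python sketch of the gate this commit adds to the fusion pass. It is an illustrative restatement, not the optimizer code itself; all names in it are hypothetical, and the broadcast adjustment of the element counts performed by the real check (CheckMatMulLargeTensors, in the diff below) is omitted.

# Illustrative sketch of the new fusion gate; names here are hypothetical, not ORT APIs.
UINT32_MAX = 2**32 - 1

def should_skip_matmul_integer_to_float_fusion(is_dml_ep: bool,
                                               has_dynamic_quantize_parent: bool,
                                               a_num_elements: int,
                                               b_num_elements: int,
                                               cast_output_is_fp16: bool) -> bool:
    # On the DML EP, when DynamicQuantizeLinear feeds MatMulInteger and the operands
    # are large, keep MatMulInteger unfused so the int-typed Metacommand path is used.
    if not (is_dml_ep and has_dynamic_quantize_parent):
        return False
    bytes_per_element = 2 if cast_output_is_fp16 else 4
    total_bytes = (a_num_elements + b_num_elements) * bytes_per_element
    return total_bytes > UINT32_MAX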
1 parent a436b3a commit f16036b

File tree: 4 files changed (+160 −0 lines)

onnxruntime/core/optimizer/matmul_integer_to_float.cc

Lines changed: 54 additions & 0 deletions
@@ -49,6 +49,49 @@ bool HasElementDataType(const NodeArg& node_arg, int32_t data_type) {
   return data_type == actual_data_type;
 }
 
+// Return the total number of elements.
+static uint64_t NumElements(const TensorShapeProto* tensor_shape) {
+  if (nullptr == tensor_shape || tensor_shape->dim_size() < 1) {
+    return 0;
+  }
+  uint64_t num_elements = 1;
+
+  for (int i = 0; i < tensor_shape->dim_size(); i++) {
+    num_elements *= tensor_shape->dim(i).dim_value();
+  }
+  return num_elements;
+}
+
+bool CheckMatMulLargeTensors(const Node& matmulinteger_node, const Node& cast_node) {
+  const auto a_def = matmulinteger_node.InputDefs()[0];
+  const auto b_def = matmulinteger_node.InputDefs()[1];
+  const int a_dim_size = a_def->Shape()->dim_size();
+  const int b_dim_size = b_def->Shape()->dim_size();
+  uint64_t a_num_elements = NumElements(a_def->Shape());
+  uint64_t b_num_elements = NumElements(b_def->Shape());
+
+  if (a_dim_size != b_dim_size) {
+    bool a_is_broadcasted = a_dim_size < b_dim_size;
+    if (a_is_broadcasted) {
+      for (int i = 0; i < b_dim_size - a_dim_size; i++) {
+        a_num_elements *= b_def->Shape()->dim(i).dim_value();
+      }
+    } else {
+      for (int i = 0; i < a_dim_size - b_dim_size; i++) {
+        b_num_elements *= a_def->Shape()->dim(i).dim_value();
+      }
+    }
+  }
+
+  int output_data_type = HasElementDataType(*cast_node.OutputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) ? 2 : 4;
+  uint64_t total_bytes = (a_num_elements + b_num_elements) * output_data_type;
+
+  if (total_bytes > UINT32_MAX) {
+    return true;
+  }
+  return false;
+}
+
 /**
 MatMulIntegerToFloatFusion will fuse subgraph like below into MatMulIntegerToFloat:

@@ -114,6 +157,17 @@ Status MatMulIntegerToFloatFusion::ApplyImpl(Graph& graph, bool& modified, int g
       continue;
     }
 
+    const Node* p_dynamicquantize_node = graph_utils::FirstParentByType(*p_matmulinteger_node, "DynamicQuantizeLinear");
+
+    // Check whether the MatMulInteger node's input comes from DynamicQuantizeLinear.
+    // For larger tensors DynamicQuantizeLinear -> MatMulInteger is preferred to be resource efficient,
+    // and DML has better MatMulInteger Metacommand coverage.
+    if (is_dml_ep && p_dynamicquantize_node) {
+      if (CheckMatMulLargeTensors(matmulinteger_node, cast_node)) {
+        continue;
+      }
+    }
+
     // Find bias node
     Node* p_add_node = nullptr;
     if (optimizer_utils::CheckOutputEdges(graph, mul_node, 1)) {
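To make the threshold concrete: with the shapes used by the new test model added below (A of shape 16x32x1280x1280 quantized by DynamicQuantizeLinear, B of shape 1280x1280, Cast to float32), the estimated size comfortably exceeds UINT32_MAX, so the fusion is skipped. The short Python check below just reproduces that arithmetic; it is illustrative only, not part of the commit.

# Reproduce CheckMatMulLargeTensors' arithmetic for the test model's shapes (illustrative only).
a_num_elements = 16 * 32 * 1280 * 1280        # A, as quantized by DynamicQuantizeLinear
b_num_elements = 1280 * 1280                  # B is 2-D, so it is broadcast over A's leading dims
b_num_elements *= 16 * 32                     # broadcast adjustment, mirroring the C++ code above
bytes_per_element = 4                         # the Cast in the test model targets float32
total_bytes = (a_num_elements + b_num_elements) * bytes_per_element
print(total_bytes, total_bytes > 2**32 - 1)   # 6710886400 True -> fusion skipped on DML

That is why the new GraphTransformationTests.MatMulIntegerToFloatLargeTensorTest below expects zero com.microsoft.MatMulIntegerToFloat nodes after the Level2 transformers run.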

onnxruntime/test/optimizer/graph_transform_test.cc

Lines changed: 16 additions & 0 deletions
@@ -5859,6 +5859,22 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) {
   std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
   EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1);
 }
+
+TEST_F(GraphTransformationTests, MatMulIntegerToFloatLargeTensorTest) {
+  constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/matmul_integer_to_float_large_tensor.onnx";
+  std::shared_ptr<Model> p_model;
+  ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_));
+  Graph& graph = p_model->MainGraph();
+
+  for (auto& node : graph.Nodes()) {
+    node.SetExecutionProviderType(kDmlExecutionProvider);
+  }
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::make_unique<MatMulIntegerToFloatFusion>(), TransformerLevel::Level2));
+  ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_));
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 0);
+}
 #endif // USE_DML
 
 #endif

onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float_large_tensor.onnx

Lines changed: 41 additions & 0 deletions
(Binary ONNX protobuf; content not rendered. The model contains the DynamicQuantizeLinear -> MatMulInteger -> Cast -> Mul pattern produced by the generator script below.)
Python generator script for the test model above (file path not shown in this rendering)

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+from enum import Enum  # noqa: F401
+
+import onnx
+from onnx import TensorProto, helper
+
+
+def GenerateModel(model_name):  # noqa: N802
+    inputs = []
+    outputs = []
+    initializers = []
+    nodes = []
+
+    inputs.append(helper.make_tensor_value_info("inputA", TensorProto.FLOAT, [16, 32, 1280, 1280]))
+    inputs.append(helper.make_tensor_value_info("inputB", TensorProto.INT8, [1280, 1280]))
+    inputs.append(helper.make_tensor_value_info("inputBZP", TensorProto.INT8, [1]))
+    inputs.append(helper.make_tensor_value_info("inputBScale", TensorProto.FLOAT, [1]))
+
+    nodes = [  # construct graph
+        helper.make_node(
+            "DynamicQuantizeLinear",
+            ["inputA"],
+            ["a_quantized", "a_scale", "a_zp"],
+            "DynamicQuantizeLinear",
+        ),
+        helper.make_node(
+            "MatMulInteger",
+            ["a_quantized", "inputB", "a_zp", "inputBZP"],
+            ["matmulinteger_output"],
+            "MatMulInteger",
+        ),
+        helper.make_node("Mul", ["a_scale", "inputBScale"], ["mul_1"], "mul_right"),
+        helper.make_node("Cast", ["matmulinteger_output"], ["cast_output"], "cast", to=1),
+        helper.make_node("Mul", ["mul_1", "cast_output"], ["output"], "mul_bottom"),
+    ]
+
+    graph = helper.make_graph(
+        nodes,
+        "matmul_integer_to_float_large_tensor_fusion",  # name
+        inputs,
+        outputs,
+        initializers,
+    )
+
+    model = helper.make_model(graph)
+    onnx.save(model, model_name)
+
+
+if __name__ == "__main__":
+    GenerateModel("matmul_integer_to_float_large_tensor.onnx")
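Since the committed .onnx is binary, a quick sanity check like the one below (not part of the commit; it assumes the file name passed to GenerateModel above) can be run after regenerating the test data to confirm the graph still contains the pattern the fusion pass inspects.

# Quick sanity check after regenerating the test model (not part of this commit).
import onnx

model = onnx.load("matmul_integer_to_float_large_tensor.onnx")
print([node.op_type for node in model.graph.node])
# Expected: ['DynamicQuantizeLinear', 'MatMulInteger', 'Mul', 'Cast', 'Mul']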
