54 changes: 54 additions & 0 deletions onnxruntime/core/optimizer/matmul_integer_to_float.cc
@@ -49,6 +49,49 @@
  return data_type == actual_data_type;
}

// Returns the total number of elements described by the tensor shape.
static uint64_t NumElements(const TensorShapeProto* tensor_shape) {
  if (nullptr == tensor_shape || tensor_shape->dim_size() < 1) {
    return 0;
  }
  uint64_t num_elements = 1;
  // Multiply the sizes of all dimensions together.
  for (int i = 0; i < tensor_shape->dim_size(); i++) {
    num_elements *= tensor_shape->dim(i).dim_value();
  }
  return num_elements;
}

bool CheckMatMulLargeTensors(const Node& matmulinteger_node, const Node& cast_node) {
  const auto a_def = matmulinteger_node.InputDefs()[0];
  const auto b_def = matmulinteger_node.InputDefs()[1];
  const int a_dim_size = a_def->Shape()->dim_size();
  const int b_dim_size = b_def->Shape()->dim_size();
  uint64_t input_a_elements = NumElements(a_def->Shape());
  uint64_t input_b_elements = NumElements(b_def->Shape());

  // If the ranks differ, the lower-rank input is broadcast: scale its element
  // count by the leading dimensions of the higher-rank input.
  if (a_dim_size != b_dim_size) {
    const bool a_is_broadcast = a_dim_size < b_dim_size;
    if (a_is_broadcast) {
      for (int i = 0; i < b_dim_size - a_dim_size; i++) {
        input_a_elements *= b_def->Shape()->dim(i).dim_value();
      }
    } else {
      for (int i = 0; i < a_dim_size - b_dim_size; i++) {
        input_b_elements *= a_def->Shape()->dim(i).dim_value();
      }
    }
  }

  // 2 bytes per element if the Cast output is float16, else 4 (float32).
  const int bytes_per_output_element =
      HasElementDataType(*cast_node.OutputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) ? 2 : 4;
  const uint64_t total_bytes = (input_a_elements + input_b_elements) * bytes_per_output_element;

  return total_bytes > UINT32_MAX;
}

/**
MatMulIntegerToFloatFusion will fuse subgraph like below into MatMulIntegerToFloat:
@@ -114,6 +157,17 @@
      continue;
    }

    const Node* p_dynamicquantize_node = graph_utils::FirstParentByType(*p_matmulinteger_node, "DynamicQuantizeLinear");

    // Check whether the MatMulInteger node's input comes from DynamicQuantizeLinear.
    // For large tensors, keeping DynamicQuantizeLinear -> MatMulInteger unfused is more
    // resource-efficient, and DML has better Metacommand coverage for MatMulInteger.
    if (is_dml_ep && p_dynamicquantize_node) {
      if (CheckMatMulLargeTensors(matmulinteger_node, cast_node)) {
        continue;
      }
    }

    // Find bias node
    Node* p_add_node = nullptr;
    if (optimizer_utils::CheckOutputEdges(graph, mul_node, 1)) {
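To see why the new test model trips this guard, here is a minimal Python sketch of the same arithmetic, using the shapes from matmul_integer_to_float_large_tensor.onnx added below (names here are illustrative only):

import math

UINT32_MAX = 2**32 - 1

a_shape = [16, 32, 1280, 1280]  # inputA
b_shape = [1280, 1280]  # inputB

a_elements = math.prod(a_shape)  # 838,860,800
b_elements = math.prod(b_shape)  # 1,638,400
# inputB is broadcast over inputA's two leading dimensions, as in the C++ above.
b_elements *= math.prod(a_shape[: len(a_shape) - len(b_shape)])  # x 512

bytes_per_element = 4  # the model Casts to float32 (to=1); float16 would give 2
total_bytes = (a_elements + b_elements) * bytes_per_element

print(total_bytes, total_bytes > UINT32_MAX)  # 6710886400 True -> fusion is skipped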
16 changes: 16 additions & 0 deletions onnxruntime/test/optimizer/graph_transform_test.cc
@@ -5695,6 +5695,22 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) {
  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
  EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1);
}

TEST_F(GraphTransformationTests, MatMulIntegerToFloatLargeTensorTest) {
  constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/matmul_integer_to_float_large_tensor.onnx";
  std::shared_ptr<Model> p_model;
  ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_));
  Graph& graph = p_model->MainGraph();

  for (auto& node : graph.Nodes()) {
    node.SetExecutionProviderType(kDmlExecutionProvider);
  }
  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
  ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::make_unique<MatMulIntegerToFloatFusion>(), TransformerLevel::Level2));
  ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_));
  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);

  // The fusion must be skipped for large tensors on the DML EP, so no fused node is expected.
  EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 0);
}
#endif // USE_DML

#endif
@@ -0,0 +1,41 @@
[Binary ONNX test asset: the model the new test loads as MODEL_FOLDER "fusion/matmul_integer_to_float_large_tensor.onnx". The serialized graph, matmul_integer_to_float_large_tensor_fusion, chains DynamicQuantizeLinear -> MatMulInteger -> Cast -> Mul, plus a Mul combining a_scale and inputBScale, over inputs inputA, inputB, inputBZP, and inputBScale.]
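A quick way to inspect the binary asset above, assuming the onnx Python package is installed and the file is available locally:

import onnx

model = onnx.load("matmul_integer_to_float_large_tensor.onnx")
print([n.op_type for n in model.graph.node])
# expected per the generator script below:
# ['DynamicQuantizeLinear', 'MatMulInteger', 'Mul', 'Cast', 'Mul']
for vi in model.graph.input:
    print(vi.name, [d.dim_value for d in vi.type.tensor_type.shape.dim])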
@@ -0,0 +1,49 @@
import onnx
from onnx import TensorProto, helper


def GenerateModel(model_name):  # noqa: N802
    inputs = []
    outputs = []
    initializers = []

    inputs.append(helper.make_tensor_value_info("inputA", TensorProto.FLOAT, [16, 32, 1280, 1280]))
    inputs.append(helper.make_tensor_value_info("inputB", TensorProto.INT8, [1280, 1280]))
    inputs.append(helper.make_tensor_value_info("inputBZP", TensorProto.INT8, [1]))
    inputs.append(helper.make_tensor_value_info("inputBScale", TensorProto.FLOAT, [1]))

    # The final Mul writes "output"; declare it so the graph has a well-formed
    # output (shape inferred from the MatMul broadcast).
    outputs.append(helper.make_tensor_value_info("output", TensorProto.FLOAT, [16, 32, 1280, 1280]))

    nodes = [  # construct graph
        helper.make_node(
            "DynamicQuantizeLinear",
            ["inputA"],
            ["a_quantized", "a_scale", "a_zp"],
            "DynamicQuantizeLinear",
        ),
        helper.make_node(
            "MatMulInteger",
            ["a_quantized", "inputB", "a_zp", "inputBZP"],
            ["matmulinteger_output"],
            "MatMulInteger",
        ),
        helper.make_node("Mul", ["a_scale", "inputBScale"], ["mul_1"], "mul_right"),
        helper.make_node("Cast", ["matmulinteger_output"], ["cast_output"], "cast", to=1),
        helper.make_node("Mul", ["mul_1", "cast_output"], ["output"], "mul_bottom"),
    ]

    graph = helper.make_graph(
        nodes,
        "matmul_integer_to_float_large_tensor_fusion",  # name
        inputs,
        outputs,
        initializers,
    )

    model = helper.make_model(graph)
    onnx.save(model, model_name)


if __name__ == "__main__":
    GenerateModel("matmul_integer_to_float_large_tensor.onnx")
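After running the script, a short sanity check with the ONNX checker (a sketch, assuming the onnx package; check_model raises if the graph is malformed):

import onnx

model = onnx.load("matmul_integer_to_float_large_tensor.onnx")
onnx.checker.check_model(model)
assert model.graph.output[0].name == "output"  # the output declared above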