Commit f16036b
[DML EP] Prefer MatMulInteger over MatMulIntegerToFloat in case of (#22469)
Description: Skip the `MatMulIntegerToFloat` fusion on the DML EP for cases where the model applies quantization (DynamicQuantizeLinear) before `MatMulInteger` and the tensors involved are large. This is mainly done to be resource efficient, and DML has better `MatMulInteger` Metacommand coverage, which computes in the int data type.
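For orientation, here is a minimal Python sketch of the gate this commit adds to the fusion pass. It is an illustrative restatement, not the optimizer code itself; all names in it are hypothetical, and the broadcast adjustment of the element counts performed by the real check (CheckMatMulLargeTensors, in the diff below) is omitted.

# Illustrative sketch of the new fusion gate; names here are hypothetical, not ORT APIs.
UINT32_MAX = 2**32 - 1

def should_skip_matmul_integer_to_float_fusion(is_dml_ep: bool,
                                               has_dynamic_quantize_parent: bool,
                                               a_num_elements: int,
                                               b_num_elements: int,
                                               cast_output_is_fp16: bool) -> bool:
    # On the DML EP, when DynamicQuantizeLinear feeds MatMulInteger and the operands
    # are large, keep MatMulInteger unfused so the int-typed Metacommand path is used.
    if not (is_dml_ep and has_dynamic_quantize_parent):
        return False
    bytes_per_element = 2 if cast_output_is_fp16 else 4
    total_bytes = (a_num_elements + b_num_elements) * bytes_per_element
    return total_bytes > UINT32_MAX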
1 parent a436b3a commit f16036b

File tree: 4 files changed (+160 −0 lines)

onnxruntime/core/optimizer/matmul_integer_to_float.cc

Lines changed: 54 additions & 0 deletions
@@ -49,6 +49,49 @@ bool HasElementDataType(const NodeArg& node_arg, int32_t data_type) {
   return data_type == actual_data_type;
 }
 
+// Return the total number of elements.
+static uint64_t NumElements(const TensorShapeProto* tensor_shape) {
+  if (nullptr == tensor_shape || tensor_shape->dim_size() < 1) {
+    return 0;
+  }
+  uint64_t num_elements = 1;
+
+  for (int i = 0; i < tensor_shape->dim_size(); i++) {
+    num_elements *= tensor_shape->dim(i).dim_value();
+  }
+  return num_elements;
+}
+
+bool CheckMatMulLargeTensors(const Node& matmulinteger_node, const Node& cast_node) {
+  const auto a_def = matmulinteger_node.InputDefs()[0];
+  const auto b_def = matmulinteger_node.InputDefs()[1];
+  const int a_dim_size = a_def->Shape()->dim_size();
+  const int b_dim_size = b_def->Shape()->dim_size();
+  uint64_t a_num_elements = NumElements(a_def->Shape());
+  uint64_t b_num_elements = NumElements(b_def->Shape());
+
+  if (a_dim_size != b_dim_size) {
+    bool a_is_broadcasted = a_dim_size < b_dim_size;
+    if (a_is_broadcasted) {
+      for (int i = 0; i < b_dim_size - a_dim_size; i++) {
+        a_num_elements *= b_def->Shape()->dim(i).dim_value();
+      }
+    } else {
+      for (int i = 0; i < a_dim_size - b_dim_size; i++) {
+        b_num_elements *= a_def->Shape()->dim(i).dim_value();
+      }
+    }
+  }
+
+  int output_data_type = HasElementDataType(*cast_node.OutputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) ? 2 : 4;
+  uint64_t total_bytes = (a_num_elements + b_num_elements) * output_data_type;
+
+  if (total_bytes > UINT32_MAX) {
+    return true;
+  }
+  return false;
+}
+
 /**
 MatMulIntegerToFloatFusion will fuse subgraph like below into MatMulIntegerToFloat:

@@ -114,6 +157,17 @@ Status MatMulIntegerToFloatFusion::ApplyImpl(Graph& graph, bool& modified, int g
       continue;
     }
 
+    const Node* p_dynamicquantize_node = graph_utils::FirstParentByType(*p_matmulinteger_node, "DynamicQuantizeLinear");
+
+    // Check whether the MatMulInteger node's input comes from DynamicQuantizeLinear.
+    // For larger tensors DynamicQuantizeLinear -> MatMulInteger is preferred to be resource efficient,
+    // and DML has better MatMulInteger Metacommand coverage.
+    if (is_dml_ep && p_dynamicquantize_node) {
+      if (CheckMatMulLargeTensors(matmulinteger_node, cast_node)) {
+        continue;
+      }
+    }
+
     // Find bias node
     Node* p_add_node = nullptr;
     if (optimizer_utils::CheckOutputEdges(graph, mul_node, 1)) {
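To make the threshold concrete: with the shapes used by the new test model added below (A of shape 16x32x1280x1280 quantized by DynamicQuantizeLinear, B of shape 1280x1280, Cast to float32), the estimated size comfortably exceeds UINT32_MAX, so the fusion is skipped. The short Python check below just reproduces that arithmetic; it is illustrative only, not part of the commit.

# Reproduce CheckMatMulLargeTensors' arithmetic for the test model's shapes (illustrative only).
a_num_elements = 16 * 32 * 1280 * 1280        # A, as quantized by DynamicQuantizeLinear
b_num_elements = 1280 * 1280                  # B is 2-D, so it is broadcast over A's leading dims
b_num_elements *= 16 * 32                     # broadcast adjustment, mirroring the C++ code above
bytes_per_element = 4                         # the Cast in the test model targets float32
total_bytes = (a_num_elements + b_num_elements) * bytes_per_element
print(total_bytes, total_bytes > 2**32 - 1)   # 6710886400 True -> fusion skipped on DML

That is why the new GraphTransformationTests.MatMulIntegerToFloatLargeTensorTest below expects zero com.microsoft.MatMulIntegerToFloat nodes after the Level2 transformers run.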

onnxruntime/test/optimizer/graph_transform_test.cc

Lines changed: 16 additions & 0 deletions
@@ -5859,6 +5859,22 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) {
   std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
   EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1);
 }
+
+TEST_F(GraphTransformationTests, MatMulIntegerToFloatLargeTensorTest) {
+  constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/matmul_integer_to_float_large_tensor.onnx";
+  std::shared_ptr<Model> p_model;
+  ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_));
+  Graph& graph = p_model->MainGraph();
+
+  for (auto& node : graph.Nodes()) {
+    node.SetExecutionProviderType(kDmlExecutionProvider);
+  }
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::make_unique<MatMulIntegerToFloatFusion>(), TransformerLevel::Level2));
+  ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_));
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 0);
+}
 #endif // USE_DML
 
 #endif

onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float_large_tensor.onnx

Lines changed: 41 additions & 0 deletions
(Binary ONNX protobuf; content not rendered. The model contains the DynamicQuantizeLinear -> MatMulInteger -> Cast -> Mul pattern produced by the generator script below.)
Python generator script for the test model above (file path not shown in this rendering)

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+from enum import Enum  # noqa: F401
+
+import onnx
+from onnx import TensorProto, helper
+
+
+def GenerateModel(model_name):  # noqa: N802
+    inputs = []
+    outputs = []
+    initializers = []
+    nodes = []
+
+    inputs.append(helper.make_tensor_value_info("inputA", TensorProto.FLOAT, [16, 32, 1280, 1280]))
+    inputs.append(helper.make_tensor_value_info("inputB", TensorProto.INT8, [1280, 1280]))
+    inputs.append(helper.make_tensor_value_info("inputBZP", TensorProto.INT8, [1]))
+    inputs.append(helper.make_tensor_value_info("inputBScale", TensorProto.FLOAT, [1]))
+
+    nodes = [  # construct graph
+        helper.make_node(
+            "DynamicQuantizeLinear",
+            ["inputA"],
+            ["a_quantized", "a_scale", "a_zp"],
+            "DynamicQuantizeLinear",
+        ),
+        helper.make_node(
+            "MatMulInteger",
+            ["a_quantized", "inputB", "a_zp", "inputBZP"],
+            ["matmulinteger_output"],
+            "MatMulInteger",
+        ),
+        helper.make_node("Mul", ["a_scale", "inputBScale"], ["mul_1"], "mul_right"),
+        helper.make_node("Cast", ["matmulinteger_output"], ["cast_output"], "cast", to=1),
+        helper.make_node("Mul", ["mul_1", "cast_output"], ["output"], "mul_bottom"),
+    ]
+
+    graph = helper.make_graph(
+        nodes,
+        "matmul_integer_to_float_large_tensor_fusion",  # name
+        inputs,
+        outputs,
+        initializers,
+    )
+
+    model = helper.make_model(graph)
+    onnx.save(model, model_name)
+
+
+if __name__ == "__main__":
+    GenerateModel("matmul_integer_to_float_large_tensor.onnx")
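Since the committed .onnx is binary, a quick sanity check like the one below (not part of the commit; it assumes the file name passed to GenerateModel above) can be run after regenerating the test data to confirm the graph still contains the pattern the fusion pass inspects.

# Quick sanity check after regenerating the test model (not part of this commit).
import onnx

model = onnx.load("matmul_integer_to_float_large_tensor.onnx")
print([node.op_type for node in model.graph.node])
# Expected: ['DynamicQuantizeLinear', 'MatMulInteger', 'Mul', 'Cast', 'Mul']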
