
Commit 1211c24

hariharans29 authored and fuhengwu2021 committed
Extend vocab padding for logits MatMul for fp16 GPT2 GreedySearch (microsoft#13842)
1 parent 525205d

File tree

7 files changed: +105 -6 lines

docs/ContribOperators.md

Lines changed: 2 additions & 0 deletions
@@ -1708,6 +1708,8 @@ This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
 <dd>no repeat ngrams size</dd>
 <dt><tt>pad_token_id</tt> : int (required)</dt>
 <dd>The id of the padding token</dd>
+<dt><tt>vocab_size</tt> : int</dt>
+<dd>Size of the vocabulary. If not provided, it will be inferred from the decoder subgraph's output shape</dd>
 </dl>

 #### Inputs (2 - 7)
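For illustration, a minimal sketch of how the new attribute might be set when assembling a GreedySearch node with the ONNX Python helpers; the empty decoder graph and the attribute values below are placeholder assumptions, not values taken from this commit:

```python
# Sketch only: attach the optional vocab_size attribute to a com.microsoft GreedySearch node.
# The trivial decoder graph and the token ids are hypothetical placeholders.
from onnx import helper

decoder_graph = helper.make_graph([], "decoder", [], [])  # stands in for the real decoder subgraph

greedy_search_node = helper.make_node(
    "GreedySearch",
    inputs=["input_ids", "max_length", "min_length", "repetition_penalty"],
    outputs=["sequences"],
    domain="com.microsoft",
    model_type=0,        # 0: decoder-only models such as GPT-2
    decoder=decoder_graph,
    eos_token_id=50256,  # placeholder
    pad_token_id=50256,  # placeholder
    vocab_size=1000,     # actual vocab size; the logits MatMul weight may be padded beyond this
)
```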

onnxruntime/contrib_ops/cpu/transformers/greedy_search_parameters.cc

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ void GreedySearchParameters::ParseFromAttributes(const OpKernelInfo& info) {
   pad_token_id = static_cast<int>(info.GetAttrOrDefault<int64_t>("pad_token_id", -1));
   decoder_start_token_id = static_cast<int>(info.GetAttrOrDefault<int64_t>("decoder_start_token_id", -1));
   no_repeat_ngram_size = static_cast<int>(info.GetAttrOrDefault<int64_t>("no_repeat_ngram_size", 0));
+  vocab_size = static_cast<int>(info.GetAttrOrDefault<int64_t>("vocab_size", -1));
 }

 void GreedySearchParameters::ParseFromInputs(OpKernelContext* context) {

onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc

Lines changed: 14 additions & 4 deletions
@@ -244,7 +244,7 @@ Status ProcessLogits(const OrtValue& logits, //
   // NOTE: `padded_vocab_size` MAY be different from `vocab_size`.
   // But the following implementation should work correctly if they are the same
   // or different.
-  int padded_vocab_size = static_cast<int>(logits_shape[2]);
+  auto padded_vocab_size = static_cast<int>(logits_shape[2]);

   cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(stream);

@@ -475,12 +475,17 @@ Status GreedySearchProcessLogits(
   typedef typename ToCudaType<T>::MappedType CudaT;
   const CudaT* logits_data = reinterpret_cast<const CudaT*>(logits.Get<Tensor>().Data<T>());

-  // Logits has shape (batch_size, input_length, vocab_size),
+  // Logits has shape (batch_size, input_length, padded_vocab_size),
   // where input_length equals to parameters_->sequence_length for first subgraph call, and 1 for the remaining calls.
   const TensorShape& logits_shape = logits.Get<Tensor>().Shape();
   ORT_ENFORCE(logits_shape.NumDimensions() == 3);
   auto input_length = logits_shape[1];

+  // NOTE: `padded_vocab_size` MAY be different from `vocab_size`.
+  // But the following implementation should work correctly if they are the same
+  // or different.
+  auto padded_vocab_size = static_cast<int>(logits_shape[2]);
+
   cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(stream);

   // Get logits for the last token:
@@ -489,13 +494,18 @@ Status GreedySearchProcessLogits(
   gsl::span<T>& next_token_scores = greedy_state->next_token_scores;

   // TODO(tianleiwu): use one kernel to replace a loop of memory copy.
-  const CudaT* current_logits = logits_data + (input_length - 1) * vocab_size;
+  // Move the pointer in increments of padded_vocab_size to account for any padding
+  // in the logits weight of the MatMul.
+  const CudaT* current_logits = logits_data + (input_length - 1) * padded_vocab_size;
   for (int i = 0; i < batch_beam_size; i++) {
+    // We only copy what is relevant (i.e. vocab_size entries), as padded_vocab_size may include
+    // logits corresponding to the "padded" portion of the vocabulary, which we ignore
+    // for token generation.
     gsl::span<const T> source(reinterpret_cast<const T*>(current_logits), vocab_size);
     gsl::span<T> target = next_token_scores.subspan(i * vocab_size, vocab_size);
     CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target.data(), source.data(), sizeof(T) * vocab_size,
                                          cudaMemcpyDeviceToDevice, cuda_stream));
-    current_logits += input_length * vocab_size;
+    current_logits += input_length * padded_vocab_size;
   }

 #ifdef DEBUG_GENERATION
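To make the indexing change above concrete, here is a small NumPy sketch of the same idea (shapes and values are illustrative, and this mirrors the pointer arithmetic rather than the CUDA code itself): the logits buffer is strided by padded_vocab_size, but only the first vocab_size scores of the last token are kept for token generation.

```python
# Illustration only: last-token logits extraction when the vocab dimension is padded.
import numpy as np

batch_beam_size, input_length = 2, 4
vocab_size, padded_vocab_size = 1000, 1600  # padded for tensor-core-friendly fp16 MatMul

# Logits produced by the decoder subgraph: the innermost dimension is the *padded* vocab.
logits = np.random.rand(batch_beam_size, input_length, padded_vocab_size).astype(np.float16)

# Advance by padded_vocab_size per token (the stride fix), then keep only the
# first vocab_size scores per sequence (the copy width), ignoring padded entries.
next_token_scores = logits[:, input_length - 1, :vocab_size]
assert next_token_scores.shape == (batch_beam_size, vocab_size)
```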

onnxruntime/core/graph/contrib_ops/contrib_defs.cc

Lines changed: 4 additions & 0 deletions
@@ -1087,6 +1087,10 @@ ONNX_MS_OPERATOR_SET_SCHEMA(GreedySearch, 1,
       .Attr("model_type", "model type: 0 for decoder only like GPT-2; 1 for encoder decoder like Bart", AttributeProto::INT, static_cast<int64_t>(0))
       .Attr("encoder", "The subgraph for initialization of encoder and decoder. It will be called once before decoder subgraph.", AttributeProto::GRAPH, OPTIONAL_VALUE)
       .Attr("decoder", "Decoder subgraph to execute in a loop.", AttributeProto::GRAPH)
+      .Attr("vocab_size",
+            "Size of the vocabulary. "
+            "If not provided, it will be inferred from the decoder subgraph's output shape",
+            AttributeProto::INT, static_cast<int64_t>(-1))
       .Input(0, "input_ids", "The sequence used as a prompt for the generation. Shape is (batch_size, sequence_length)", "I")
       .Input(1, "max_length", "The maximum length of the sequence to be generated. Shape is (1)", "I")
       .Input(2, "min_length", "The minimum length below which the score of eos_token_id is set to -Inf. Shape is (1)", "I", OpSchema::Optional)

onnxruntime/python/tools/transformers/convert_generation.py

Lines changed: 2 additions & 2 deletions
@@ -906,14 +906,14 @@ def convert_generation_model(args: argparse.Namespace, generation_type: Generati
     # We only want to pad the logits MatMul weight in the decoder for fp16 models.
     # The inherent assumption is that fp16 models run on GPU for which all
     # dims need to be a multiple of 8 to leverage tensor cores.
-    # NOTE: We currently only support padding the MatMul logits weight for GPT2 BeamSearch.
+    # NOTE: We currently only support padding the MatMul logits weight for GPT2 GreedySearch/BeamSearch.
     # This can be expanded to other models/decoding strategies later
     logits_matmul_weight_padded = False
     if (
         args.pad_vocab_size
         and args.precision == Precision.FLOAT16
         and is_gpt2
-        and generation_type == GenerationType.BEAMSEARCH
+        and (generation_type == GenerationType.BEAMSEARCH or generation_type == GenerationType.GREEDYSEARCH)
     ):
         logger.info(
             f"Pad logits MatMul weights for optimal MatMul perf in fp16 on {args.decoder_onnx}. "
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <memory>
+#include <vector>
+#include "gtest/gtest.h"
+#include "core/common/gsl.h"
+#include "core/session/onnxruntime_cxx_api.h"
+#include "test/common/cuda_op_test_utils.h"
+
+extern std::unique_ptr<Ort::Env> ort_env;
+
+namespace onnxruntime {
+namespace test {
+
+TEST(GreedySearchTest, GptGreedySearchFp16_VocabPadded) {
+  std::vector<int64_t> input_ids_shape{2, 4};
+  std::vector<int32_t> input_ids{
+      0, 0, 0, 52, 0, 0, 195, 731};
+
+  std::vector<int64_t> parameter_shape{1};
+  std::vector<int32_t> max_length{10};
+  std::vector<int32_t> min_length{1};
+  std::vector<float> repetition_penalty{1.0f};
+
+  std::vector<int64_t> expected_output_shape{input_ids_shape[0], max_length[0]};
+
+  std::vector<int32_t> expected_output{
+      0, 0, 0, 52, 204, 204, 204, 204, 204, 204,
+      0, 0, 195, 731, 731, 114, 114, 114, 114, 114};
+
+  Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
+  auto input_ids_tensor = Ort::Value::CreateTensor(
+      info, input_ids.data(), input_ids.size(), input_ids_shape.data(), input_ids_shape.size());
+
+  auto max_length_tensor = Ort::Value::CreateTensor(
+      info, max_length.data(), max_length.size(), parameter_shape.data(), parameter_shape.size());
+
+  auto min_length_tensor = Ort::Value::CreateTensor(
+      info, min_length.data(), min_length.size(), parameter_shape.data(), parameter_shape.size());
+
+  auto repetition_penalty_tensor = Ort::Value::CreateTensor(
+      info, repetition_penalty.data(), repetition_penalty.size(), parameter_shape.data(), parameter_shape.size());
+
+  std::vector<Ort::Value> ort_inputs;
+  ort_inputs.push_back(std::move(input_ids_tensor));
+  ort_inputs.push_back(std::move(max_length_tensor));
+  ort_inputs.push_back(std::move(min_length_tensor));
+  ort_inputs.push_back(std::move(repetition_penalty_tensor));
+  const char* input_names[] = {"input_ids", "max_length", "min_length", "repetition_penalty"};
+  const char* const output_names[] = {"sequences"};
+
+  constexpr int min_cuda_architecture = 530;
+  if (HasCudaEnvironment(min_cuda_architecture)) {
+    Ort::SessionOptions session_options;
+#ifdef USE_CUDA
+    Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
+#endif
+
+    // The following model was obtained from testdata/transformers/tiny_gpt2_beamsearch_fp16.onnx by setting beam_size == 1
+    // and padding the vocabulary size from 1000 to 1600 (just for illustrative and testing purposes) to check that the
+    // greedy search implementation can handle such a scenario.
+    Ort::Session session(*ort_env, ORT_TSTR("testdata/transformers/tiny_gpt2_greedysearch_fp16_padded_vocab.onnx"), session_options);
+
+    auto ort_outputs = session.Run(Ort::RunOptions{}, input_names, ort_inputs.data(), ort_inputs.size(),
+                                   output_names, 1);
+
+    ASSERT_EQ(ort_outputs.size(), 1U);
+    const auto& sequences = ort_outputs[0];
+    ASSERT_TRUE(sequences.IsTensor());
+
+    auto result_ts = sequences.GetTensorTypeAndShapeInfo();
+    ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, result_ts.GetElementType());
+
+    ASSERT_EQ(expected_output_shape, result_ts.GetShape());
+    const auto* result_vals = sequences.GetTensorData<int32_t>();
+    auto result_span = gsl::make_span(result_vals, expected_output.size());
+    ASSERT_TRUE(std::equal(expected_output.cbegin(), expected_output.cend(), result_span.begin(), result_span.end()));
+  }
+}
+}  // namespace test
+}  // namespace onnxruntime
Binary file not shown (343 KB).
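For reference, roughly the same check as the gtest above can be run from Python against the padded-vocab test model (a sketch, assuming a CUDA-enabled onnxruntime build; the input values come from the C++ test):

```python
# Run the padded-vocab GPT-2 GreedySearch test model via the ONNX Runtime Python API.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession(
    "testdata/transformers/tiny_gpt2_greedysearch_fp16_padded_vocab.onnx",
    providers=["CUDAExecutionProvider"],  # fp16 path assumes a CUDA device (SM >= 5.3)
)

inputs = {
    "input_ids": np.array([[0, 0, 0, 52], [0, 0, 195, 731]], dtype=np.int32),
    "max_length": np.array([10], dtype=np.int32),
    "min_length": np.array([1], dtype=np.int32),
    "repetition_penalty": np.array([1.0], dtype=np.float32),
}

sequences = session.run(["sequences"], inputs)[0]
print(sequences)  # expected to match the sequences asserted in the C++ test
```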
