@@ -120,19 +120,12 @@ class MatMulNBits final : public OpKernel {
   Status Compute(OpKernelContext* context) const override;

   Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
-                 bool save_prepacked_initializers,
                  /*out*/ bool& is_packed,
                  /*out*/ PrePackedWeights* prepacked_weights) override;

-  void ConvertPrepackWeightIntoTensor(const onnxruntime::Tensor& tensor, int input_idx);
-
   Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers, int input_idx,
                                    /*out*/ bool& used_shared_buffers) override;

-  std::optional<Tensor> GetPrePackTensor(int /*input_idx*/) override;
-
-  Status SetPrePackTensor(int input_idx, const Tensor& pre_packed_tensor) override;
-
  private:
   const size_t K_;
   const size_t N_;
@@ -147,8 +140,6 @@ class MatMulNBits final : public OpKernel {
   size_t packed_b_size_{0};
   IAllocatorUniquePtr<float> scales_fp32_{};
   IAllocatorUniquePtr<float> bias_fp32_{};
-  std::optional<Tensor> packed_tensor_{std::nullopt};
-  MLDataType prepack_tensor_data_type_;

   bool has_zp_input_{false};

@@ -176,22 +167,8 @@ class MatMulNBits final : public OpKernel {
                         const MatMulComputeHelper& helper) const;
 };

-template <typename T1>
-void MatMulNBits<T1>::ConvertPrepackWeightIntoTensor(const onnxruntime::Tensor& tensor, int input_idx) {
-  if (input_idx == InputIndex::B) {
-    prepack_tensor_data_type_ = tensor.DataType();
-  }
-
-  TensorShapeVector weights_dims = {static_cast<int64_t>((packed_b_size_ - 1) / prepack_tensor_data_type_->Size()) + 1};
-  packed_tensor_ = Tensor(prepack_tensor_data_type_,
-                          TensorShape(weights_dims),
-                          packed_b_.get(),
-                          OrtMemoryInfo(CPU, OrtAllocatorType::OrtDeviceAllocator));
-}
-
 template <typename T1>
 Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
-                                bool save_prepacked_initializers,
                                 /*out*/ bool& is_packed,
                                 /*out*/ PrePackedWeights* prepacked_weights) {
   ORT_UNUSED_PARAMETER(prepacked_weights);
@@ -227,18 +204,13 @@ Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ All
 #endif  // MLAS_TARGET_AMD64_IX86
   }

-  if (save_prepacked_initializers) {
-    ConvertPrepackWeightIntoTensor(tensor, input_idx);
-  }
-
   return Status::OK();
 }

 #if !defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || !defined(MLAS_TARGET_ARM64)
 // Non-ARM-with-fp16-intrinsics fall back fp16 to fp32.
 template <>
 Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
-                                       bool save_prepacked_initializers,
                                        /*out*/ bool& is_packed,
                                        /*out*/ PrePackedWeights* prepacked_weights) {
   ORT_UNUSED_PARAMETER(prepacked_weights);
@@ -288,34 +260,6 @@ Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*ou
 #endif  // MLAS_TARGET_AMD64_IX86
   }

-  if (save_prepacked_initializers) {
-    ConvertPrepackWeightIntoTensor(tensor, input_idx);
-  }
-
-  return Status::OK();
-}
-
-template <typename T1>
-std::optional<Tensor> MatMulNBits<T1>::GetPrePackTensor(int input_idx) {
-  // For this kernel, prepacking is performed on input_B and possibly scales and zero_points.
-  // During compute, scales and zero_points are used as-is; only input_B is replaced by the
-  // prepacked buffer.
-  // To match this logic, we return the latest prepacked buffer and serialize only that one,
-  // so packed_tensor_ is always returned here, not only for input_B.
-  ORT_UNUSED_PARAMETER(input_idx);
-  return std::move(packed_tensor_);
-}
-
-template <typename T1>
-Status MatMulNBits<T1>::SetPrePackTensor(int input_idx, const Tensor& pre_packed_tensor) {
-  if (input_idx == 1) {
-    // pre_packed_tensor is a constant initialized tensor whose lifecycle is managed by
-    // session_state, which will release its memory. packed_b_ must not release that memory,
-    // so an empty/default buffer deleter is passed here.
-    // The const_cast is temporary and will be fixed in a follow-up PR.
-    packed_b_ = BufferUniquePtr(const_cast<void*>(pre_packed_tensor.DataRaw()), BufferDeleter());
-  }
-
   return Status::OK();
 }
 #endif  // end !MLAS_F16VEC_INTRINSICS_SUPPORTED || !MLAS_TARGET_ARM64
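A note on the byte-to-element arithmetic in the removed ConvertPrepackWeightIntoTensor: `(packed_b_size_ - 1) / prepack_tensor_data_type_->Size() + 1` is ceiling division, sizing a 1-D tensor with just enough elements of the weight's data type to cover `packed_b_size_` bytes, so the pre-packed buffer can be exposed as a Tensor without copying. A minimal standalone sketch of that computation (the helper name and example values are illustrative, not from the source):

    #include <cstddef>
    #include <cstdio>

    // Ceiling division: the smallest element count whose combined size covers
    // `bytes`. Mirrors the removed `(packed_b_size_ - 1) / dtype->Size() + 1`.
    // Precondition: bytes >= 1 and element_size >= 1.
    static std::size_t ElementsCovering(std::size_t bytes, std::size_t element_size) {
      return (bytes - 1) / element_size + 1;
    }

    int main() {
      // 10 packed bytes viewed as 4-byte elements need 3 elements (12 bytes >= 10).
      std::printf("%zu\n", ElementsCovering(10, 4));  // prints 3
      // 12 bytes fit exactly into 3 elements; no over-allocation.
      std::printf("%zu\n", ElementsCovering(12, 4));  // prints 3
      return 0;
    }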