
Commit c44f92f

yuslepukhin authored and ankitm3k committed
Revert "enable serialize prepacked weights into data file (microsoft#22256)" (microsoft#22788)
This reverts commit c5b6be0.

### Description
Revert.

### Motivation and Context
This needs a simpler and more robust approach.
1 parent 0045291 commit c44f92f

72 files changed: +137 additions, -872 deletions


include/onnxruntime/core/framework/op_kernel.h

Lines changed: 0 additions & 22 deletions

@@ -79,7 +79,6 @@ class OpKernel {
   // the allocator tied to the session if the kernel owns the pre-packed buffer or an
   // allocator shared between sessions if the pre-packed buffer is to be shared across sessions
   // (i.e.) the kernel does not own the buffer.
-  // @param save_prepacked_initializers: Set it to true if intend to save prepacked initializers to external data file.
   // @param is_packed: Set it to true if the kernel packed the tensor or to false
   // The kernel is responsible for keeping the packed data and related metadata if is_packed is true,
   // and the original initialized constant tensor will be released and not accessible anymore in
@@ -89,7 +88,6 @@ class OpKernel {

   virtual Status
   PrePack(const Tensor& /*tensor*/, int /*input_idx*/, AllocatorPtr /*alloc*/,
-          bool, /*save_prepacked_initializers*/
          /*out*/ bool& is_packed, /*out*/ PrePackedWeights* /*prepacked_weights*/) {
     is_packed = false;
     return Status::OK();
@@ -131,26 +129,6 @@ class OpKernel {
     return Status::OK();
   }

-  // Override this function to get pre-packed tensors from this kernel.
-  // Only useful for models run on PC with CPU so ORT could load prepacked weights directly from
-  // ONNX data file with mmap and no need to do prepacking on fly to save a lot of heap memory.
-  // @param input_idx : The index of input we prepacked before and intend to get packed tensor back.
-  // Please refer to matmul_nbits kernel for a complete example.
-  virtual std::optional<Tensor> GetPrePackTensor(int /*input_idx*/) {
-    return std::nullopt;
-  }
-
-  // Override this function to set pre-packed tensors to this kernel and restore prepacked weight buffer.
-  // Only useful for models run on PC with CPU so ORT could load prepacked weights directly from
-  // ONNX data file with mmap and no need to do prepacking on fly to save a lot of heap memory.
-  // Please refer to matmul_nbits kernel for a complete example.
-  // @param input_idx : The input index of the tensor in this kernel.
-  // @param pre_packed_tensor: The prepacked tensor read from onnx data file and use the prepacked tensor
-  //                           to restore prepacked weight buffer.
-  virtual Status SetPrePackTensor(int /*input_idx*/, const Tensor& /*pre_packed_tensor*/) {
-    return Status::OK();
-  }
-
   const OrtDevice GetDevice(OrtMemType mem_type) const;
   const OpKernelInfo& Info() const {
     return *op_kernel_info_;
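
Note: after this revert, kernels are back on the two-output PrePack signature shown above. A minimal sketch of an override under that contract; the kernel class, the weight input index, and the copy-only "packing" here are hypothetical, for illustration only:

// Hypothetical kernel sketch; assumes the usual onnxruntime framework headers.
class MyPackedKernel final : public OpKernel {
 public:
  explicit MyPackedKernel(const OpKernelInfo& info) : OpKernel(info) {}
  Status Compute(OpKernelContext* context) const override;  // elided

  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
                 /*out*/ bool& is_packed,
                 /*out*/ PrePackedWeights* prepacked_weights) override {
    ORT_UNUSED_PARAMETER(prepacked_weights);  // no cross-session sharing in this sketch
    is_packed = false;
    if (input_idx != 1) {  // assume input 1 is the constant weight
      return Status::OK();
    }
    // A real kernel would rearrange the weight into a compute-friendly layout;
    // this sketch only copies it into a kernel-owned buffer.
    const size_t bytes = tensor.SizeInBytes();
    packed_w_ = BufferUniquePtr(alloc->Alloc(bytes), BufferDeleter(std::move(alloc)));
    memcpy(packed_w_.get(), tensor.DataRaw(), bytes);
    is_packed = true;  // ORT may now release the original initializer
    return Status::OK();
  }

 private:
  BufferUniquePtr packed_w_;
};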

include/onnxruntime/core/graph/graph.h

Lines changed: 2 additions & 27 deletions

@@ -1148,11 +1148,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   void FinalizeFuseSubGraph(const IndexedSubGraph& sub_graph, Node& fused_node);
 #endif

-  // Since one constant initializer could be used by different kernels
-  // and prepacked differently, use an unordered_map to store prepacked
-  // initializer in format of <[initializer_name], <[node_name], [prepacked_initializer]>>
-  typedef std::unordered_map<std::string, std::unordered_map<std::string, ONNX_NAMESPACE::TensorProto>> PrePackedTensorProtoToSave;
-
 #if !defined(ORT_MINIMAL_BUILD)
   /** Gets the GraphProto representation of this Graph. */
   const ONNX_NAMESPACE::GraphProto& ToGraphProto();
@@ -1187,26 +1182,18 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   @param initializer_size_threshold initializers larger or equal to this threshold (in bytes) are saved
   in the external file. Initializer smaller than this threshold are included in the onnx file.
   @param align_info offset alignment info.
-  @param save_prepacked_constant_initializers whether to save prepacked initializer into external data file.
-  If set false to this boolean, prepacked initializer will not be saved into onnxruntime data file,
-  we keep constant initializer as it is.
-  @param pre_packed_initializers struct used to store all the prepacked initializers.
   @returns GraphProto serialization of the graph.
   */
   ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
                                                                   const std::filesystem::path& model_file_path,
                                                                   size_t initializer_size_threshold,
-                                                                  const OffsetAlignmentInfo& align_info,
-                                                                  bool save_prepacked_constant_initializers,
-                                                                  PrePackedTensorProtoToSave& pre_packed_initializers) const;
+                                                                  const OffsetAlignmentInfo& align_info) const;

   ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
                                                                   const std::filesystem::path& model_file_path,
                                                                   size_t initializer_size_threshold) const {
     OffsetAlignmentInfo default_options;
-    PrePackedTensorProtoToSave pre_packed_initializers;
-    return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options,
-                                                false, pre_packed_initializers);
+    return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
   }

   /** Gets the ISchemaRegistry instances being used with this Graph. */
@@ -1521,18 +1508,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
  private:
   void InitializeStateFromModelFileGraphProto();

-  // Private method used to setup external initializer properly during model save,
-  // this external initializer could be oroginal initializer or prepacked initializer.
-  static void SetUpExternalInitializer(const Graph::OffsetAlignmentInfo& align_info,
-                                       size_t tensor_bytes_size,
-                                       int64_t& external_offset,
-                                       std::ofstream& external_stream,
-                                       gsl::span<const uint8_t> raw_data,
-                                       ONNX_NAMESPACE::TensorProto& output_proto,
-                                       const std::filesystem::path& external_file_path,
-                                       const ONNX_NAMESPACE::TensorProto& initializer,
-                                       bool is_prepacked);
-
   // Add node with specified <node_proto>.
   Node& AddNode(const ONNX_NAMESPACE::NodeProto& node_proto,
                 const ArgNameToTypeMap& name_to_type);
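
Note: the four-argument overload above is the full form after the revert; the three-argument overload forwards to it with default alignment. A minimal usage sketch, assuming a loaded onnxruntime::Graph named graph; the paths and threshold are illustrative:

// Serialize `graph`, spilling initializers of at least 1024 bytes into an
// external data file next to the model.
Graph::OffsetAlignmentInfo align_info;  // default offset alignment
ONNX_NAMESPACE::GraphProto graph_proto =
    graph.ToGraphProtoWithExternalInitializers(
        std::filesystem::path("model.onnx.data"),  // external_file_path
        std::filesystem::path("model.onnx"),       // model_file_path
        1024,                                      // initializer_size_threshold
        align_info);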

include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h

Lines changed: 0 additions & 6 deletions

@@ -246,12 +246,6 @@ static const char* const kOrtSessionOptionsDisableCPUEPFallback = "session.disab
 static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFileName =
     "session.optimized_model_external_initializers_file_name";

-// Use this config when save prepacked constant initializers to onnx external data file.
-// Default is not save prepacked initializers to onnx data file.
-// Sample usage: sess_options.add_session_config_entry('session.save_prepacked_constant_initializers', "1")
-static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
-    "session.save_prepacked_constant_initializers";
-
 // Use this config to control the minimum size of the initializer when externalizing it during serialization
 static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
     "session.optimized_model_external_initializers_min_size_in_bytes";

onnxruntime/contrib_ops/cpu/bert/attention.cc

Lines changed: 0 additions & 2 deletions

@@ -30,7 +30,6 @@ class Attention : public OpKernel, public AttentionCPUBase {
   Status Compute(OpKernelContext* context) const override;

   Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
-                 bool save_prepacked_initializers,
                  /*out*/ bool& is_packed,
                  /*out*/ PrePackedWeights* prepacked_weights) override;

@@ -102,7 +101,6 @@ bool Attention<T>::IsPackWeightsSuccessful(int qkv_index,

 template <typename T>
 Status Attention<T>::PrePack(const Tensor& weights, int input_idx, AllocatorPtr alloc,
-                             bool /*save_prepacked_initializers*/,
                              /*out*/ bool& is_packed,
                              /*out*/ PrePackedWeights* prepacked_weights) {
   /* The PrePack() massages the weights to speed up Compute(), there is an option to

onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc

Lines changed: 0 additions & 2 deletions

@@ -24,7 +24,6 @@ class QAttention : public OpKernel, public AttentionCPUBase {
   Status Compute(OpKernelContext* context) const override;

   Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
-                 bool save_prepacked_initializers,
                  bool& /*out*/ is_packed,
                  /*out*/ PrePackedWeights* prepacked_weights) override;

@@ -59,7 +58,6 @@ QAttention<T>::QAttention(const OpKernelInfo& info) : OpKernel(info), AttentionC

 template <typename T>
 Status QAttention<T>::PrePack(const Tensor& weights, int input_idx, AllocatorPtr alloc,
-                              bool /*save_prepacked_initializers*/,
                               /*out*/ bool& is_packed,
                               /*out*/ PrePackedWeights* prepacked_weights) {
   if (1 != input_idx) {

onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc

Lines changed: 1 addition & 2 deletions

@@ -13,7 +13,7 @@ class DynamicQuantizeLSTM : public OpKernel, public LSTMBase {
   DynamicQuantizeLSTM(const OpKernelInfo& info) : OpKernel(info), LSTMBase(info) {}

   Status PrePack(const Tensor& tensor, int input_idx,
-                 AllocatorPtr alloc, bool save_prepacked_initializers, /*out*/ bool& is_packed,
+                 AllocatorPtr alloc, /*out*/ bool& is_packed,
                  /*out*/ PrePackedWeights* prepacked_weights) override;

   Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers,
@@ -91,7 +91,6 @@ static void UseSharedPrePackedBuffersImpl(std::vector<BufferUniquePtr>& prepacke
 }

 Status DynamicQuantizeLSTM::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
-                                    bool /*save_prepacked_initializers*/,
                                     /*out*/ bool& is_packed,
                                     /*out*/ PrePackedWeights* prepacked_weights) {
   is_packed = false;

onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc

Lines changed: 0 additions & 56 deletions

@@ -120,19 +120,12 @@ class MatMulNBits final : public OpKernel {
   Status Compute(OpKernelContext* context) const override;

   Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
-                 bool save_prepacked_initializers,
                  /*out*/ bool& is_packed,
                  /*out*/ PrePackedWeights* prepacked_weights) override;

-  void ConvertPrepackWeightIntoTensor(const onnxruntime::Tensor& tensor, int input_idx);
-
   Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers, int input_idx,
                                    /*out*/ bool& used_shared_buffers) override;

-  std::optional<Tensor> GetPrePackTensor(int /*input_idx*/) override;
-
-  Status SetPrePackTensor(int input_idx, const Tensor& pre_packed_tensor) override;
-
  private:
   const size_t K_;
   const size_t N_;
@@ -147,8 +140,6 @@ class MatMulNBits final : public OpKernel {
   size_t packed_b_size_{0};
   IAllocatorUniquePtr<float> scales_fp32_{};
   IAllocatorUniquePtr<float> bias_fp32_{};
-  std::optional<Tensor> packed_tensor_{std::nullopt};
-  MLDataType prepack_tensor_data_type_;

   bool has_zp_input_{false};

@@ -176,22 +167,8 @@ class MatMulNBits final : public OpKernel {
                              const MatMulComputeHelper& helper) const;
 };

-template <typename T1>
-void MatMulNBits<T1>::ConvertPrepackWeightIntoTensor(const onnxruntime::Tensor& tensor, int input_idx) {
-  if (input_idx == InputIndex::B) {
-    prepack_tensor_data_type_ = tensor.DataType();
-  }
-
-  TensorShapeVector weights_dims = {static_cast<int64_t>((packed_b_size_ - 1) / prepack_tensor_data_type_->Size()) + 1};
-  packed_tensor_ = Tensor(prepack_tensor_data_type_,
-                          TensorShape(weights_dims),
-                          packed_b_.get(),
-                          OrtMemoryInfo(CPU, OrtAllocatorType::OrtDeviceAllocator));
-}
-
 template <typename T1>
 Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
-                                bool save_prepacked_initializers,
                                 /*out*/ bool& is_packed,
                                 /*out*/ PrePackedWeights* prepacked_weights) {
   ORT_UNUSED_PARAMETER(prepacked_weights);
@@ -227,18 +204,13 @@ Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ All
 #endif  // MLAS_TARGET_AMD64_IX86
   }

-  if (save_prepacked_initializers) {
-    ConvertPrepackWeightIntoTensor(tensor, input_idx);
-  }
-
   return Status::OK();
 }

 #if !defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || !defined(MLAS_TARGET_ARM64)
 // Non-ARM-with-fp16-intrinsics fall back fp16 to fp32.
 template <>
 Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
-                                       bool save_prepacked_initializers,
                                        /*out*/ bool& is_packed,
                                        /*out*/ PrePackedWeights* prepacked_weights) {
   ORT_UNUSED_PARAMETER(prepacked_weights);
@@ -288,34 +260,6 @@ Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*ou
 #endif  // MLAS_TARGET_AMD64_IX86
   }

-  if (save_prepacked_initializers) {
-    ConvertPrepackWeightIntoTensor(tensor, input_idx);
-  }
-
-  return Status::OK();
-}
-
-template <typename T1>
-std::optional<Tensor> MatMulNBits<T1>::GetPrePackTensor(int input_idx) {
-  // For this kernel, prepack is performed on input_B, and possibly scales, zeros_points.
-  // During compute process, scales and zeros_points will keep as it is and only use prepacked
-  // buffer to replace input_B.
-  // Inorder to cope with this logic, we need to return latest prepacked buffer and only serialize
-  // the latest one. So, we need to always return packed_tensor_ here not only for input_B.
-  ORT_UNUSED_PARAMETER(input_idx);
-  return std::move(packed_tensor_);
-}
-
-template <typename T1>
-Status MatMulNBits<T1>::SetPrePackTensor(int input_idx, const Tensor& pre_packed_tensor) {
-  if (input_idx == 1) {
-    // pre_packed_tensor is constant initialized tensor and its lifecycle is managed by session_state,
-    // session_state will release memory from pre_packed_tensor. packed_b_ will not release memory so
-    // pass empty/default buffer deleter here.
-    // const_cast here is temporary, will fix in follow up PR.
-    packed_b_ = BufferUniquePtr(const_cast<void*>(pre_packed_tensor.DataRaw()), BufferDeleter());
-  }
-
   return Status::OK();
 }
 #endif  // end !MLAS_F16VEC_INTRINSICS_SUPPORTED || !MLAS_TARGET_ARM64

onnxruntime/contrib_ops/cpu/skip_layer_norm.cc

Lines changed: 0 additions & 1 deletion

@@ -252,7 +252,6 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const {

 template <typename T, bool simplified>
 Status SkipLayerNorm<T, simplified>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
-                                             bool /*save_prepacked_initializers*/,
                                              bool& is_packed, PrePackedWeights* prepacked_weights) {
   ORT_UNUSED_PARAMETER(prepacked_weights);
   is_packed = false;

onnxruntime/contrib_ops/cpu/skip_layer_norm.h

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ class SkipLayerNorm final : public OpKernel {
   SkipLayerNorm(const OpKernelInfo& op_kernel_info);
   Status Compute(OpKernelContext* p_op_kernel_context) const override;

-  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool save_prepacked_initializers,
+  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
                  bool& is_packed, PrePackedWeights* prepacked_weights) override;

 private:

onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc

Lines changed: 0 additions & 1 deletion

@@ -95,7 +95,6 @@ GroupNorm::GroupNorm(const OpKernelInfo& op_info) : CudaKernel(op_info) {
 }

 Status GroupNorm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr /*alloc*/,
-                          bool /*save_prepacked_initializers*/,
                           bool& is_packed, PrePackedWeights* /*prepacked_weights*/) {
   is_packed = false;
