Commit c5b6be0

enable serialize prepacked weights into data file (microsoft#22256)
### Description

Part of microsoft#21448. This change is intended to save CPU memory during model load for inference. It adds the session option `save_prepacked_constant_initializers`; with it turned on:

1. Optimize the model with an inference session; prepacked external initializers are saved into the data file.
2. Load the optimized model and the external data file containing the prepacked initializers; no prepacking is needed.
3. Run inference with the optimized model and data file.

Tested with the model Phi-3-mini-instruct-onnx.

With ORT 1.12.0:

![image](https://github.com/user-attachments/assets/3c0337be-f340-4bb7-8f9f-30f3552072ef)

With this change:

![image](https://github.com/user-attachments/assets/23282990-2e1e-4a1f-92de-afa8ed7e6a43)

Peak memory usage dropped from **5.438 GB to 2.726 GB**. This change takes advantage of the fact that ORT loads external initializers with mmap on CPU; prepacking uses extra heap memory (roughly the same size as the external initializers), so omitting the prepack step at load time saves that memory.

Next step: update all the CPU kernels that implement the PrePack method and test them properly. Will do in a follow-up PR.
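As a rough sketch of that three-step workflow in the C++ API (file names here are placeholders; the config keys are the ones added and used by this commit, set through the public `Ort::SessionOptions::AddConfigEntry`):

```cpp
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "prepack_demo");

  // Step 1: create a session that optimizes the model and serializes the
  // prepacked constant initializers into the external data file.
  Ort::SessionOptions opts;
  opts.AddConfigEntry("session.save_prepacked_constant_initializers", "1");
  opts.AddConfigEntry("session.optimized_model_external_initializers_file_name",
                      "model_opt.onnx.data");
  opts.SetOptimizedModelFilePath(ORT_TSTR("model_opt.onnx"));
  Ort::Session optimize_pass(env, ORT_TSTR("model.onnx"), opts);

  // Steps 2 and 3: load the optimized model; the prepacked weights are mmapped
  // straight from model_opt.onnx.data, so no prepacking happens here, and the
  // session runs inference without the extra heap copy.
  Ort::SessionOptions run_opts;
  Ort::Session session(env, ORT_TSTR("model_opt.onnx"), run_opts);
  return 0;
}
```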
1 parent 4ed5bec commit c5b6be0

File tree: 72 files changed, +872 −137 lines


include/onnxruntime/core/framework/op_kernel.h

Lines changed: 22 additions & 0 deletions

```diff
@@ -79,6 +79,7 @@ class OpKernel {
   // the allocator tied to the session if the kernel owns the pre-packed buffer or an
   // allocator shared between sessions if the pre-packed buffer is to be shared across sessions
   // (i.e.) the kernel does not own the buffer.
+  // @param save_prepacked_initializers: Set it to true if intend to save prepacked initializers to external data file.
   // @param is_packed: Set it to true if the kernel packed the tensor or to false
   // The kernel is responsible for keeping the packed data and related metadata if is_packed is true,
   // and the original initialized constant tensor will be released and not accessible anymore in
@@ -88,6 +89,7 @@ class OpKernel {
 
   virtual Status
   PrePack(const Tensor& /*tensor*/, int /*input_idx*/, AllocatorPtr /*alloc*/,
+          bool, /*save_prepacked_initializers*/
          /*out*/ bool& is_packed, /*out*/ PrePackedWeights* /*prepacked_weights*/) {
     is_packed = false;
     return Status::OK();
@@ -129,6 +131,26 @@ class OpKernel {
     return Status::OK();
   }
 
+  // Override this function to get pre-packed tensors from this kernel.
+  // Only useful for models run on PC with CPU so ORT could load prepacked weights directly from
+  // ONNX data file with mmap and no need to do prepacking on fly to save a lot of heap memory.
+  // @param input_idx : The index of input we prepacked before and intend to get packed tensor back.
+  // Please refer to matmul_nbits kernel for a complete example.
+  virtual std::optional<Tensor> GetPrePackTensor(int /*input_idx*/) {
+    return std::nullopt;
+  }
+
+  // Override this function to set pre-packed tensors to this kernel and restore prepacked weight buffer.
+  // Only useful for models run on PC with CPU so ORT could load prepacked weights directly from
+  // ONNX data file with mmap and no need to do prepacking on fly to save a lot of heap memory.
+  // Please refer to matmul_nbits kernel for a complete example.
+  // @param input_idx : The input index of the tensor in this kernel.
+  // @param pre_packed_tensor: The prepacked tensor read from onnx data file and use the prepacked tensor
+  //                           to restore prepacked weight buffer.
+  virtual Status SetPrePackTensor(int /*input_idx*/, const Tensor& /*pre_packed_tensor*/) {
+    return Status::OK();
+  }
+
   const OrtDevice GetDevice(OrtMemType mem_type) const;
   const OpKernelInfo& Info() const {
     return *op_kernel_info_;
```
include/onnxruntime/core/graph/graph.h

Lines changed: 27 additions & 2 deletions

```diff
@@ -1148,6 +1148,11 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   void FinalizeFuseSubGraph(const IndexedSubGraph& sub_graph, Node& fused_node);
 #endif
 
+  // Since one constant initializer could be used by different kernels
+  // and prepacked differently, use an unordered_map to store prepacked
+  // initializer in format of <[initializer_name], <[node_name], [prepacked_initializer]>>
+  typedef std::unordered_map<std::string, std::unordered_map<std::string, ONNX_NAMESPACE::TensorProto>> PrePackedTensorProtoToSave;
+
 #if !defined(ORT_MINIMAL_BUILD)
   /** Gets the GraphProto representation of this Graph. */
   const ONNX_NAMESPACE::GraphProto& ToGraphProto();
@@ -1182,18 +1187,26 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   @param initializer_size_threshold initializers larger or equal to this threshold (in bytes) are saved
   in the external file. Initializer smaller than this threshold are included in the onnx file.
   @param align_info offset alignment info.
+  @param save_prepacked_constant_initializers whether to save prepacked initializer into external data file.
+  If set false to this boolean, prepacked initializer will not be saved into onnxruntime data file,
+  we keep constant initializer as it is.
+  @param pre_packed_initializers struct used to store all the prepacked initializers.
   @returns GraphProto serialization of the graph.
   */
   ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
                                                                   const std::filesystem::path& model_file_path,
                                                                   size_t initializer_size_threshold,
-                                                                  const OffsetAlignmentInfo& align_info) const;
+                                                                  const OffsetAlignmentInfo& align_info,
+                                                                  bool save_prepacked_constant_initializers,
+                                                                  PrePackedTensorProtoToSave& pre_packed_initializers) const;
 
   ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
                                                                   const std::filesystem::path& model_file_path,
                                                                   size_t initializer_size_threshold) const {
     OffsetAlignmentInfo default_options;
-    return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
+    PrePackedTensorProtoToSave pre_packed_initializers;
+    return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options,
+                                                false, pre_packed_initializers);
   }
 
   /** Gets the ISchemaRegistry instances being used with this Graph. */
@@ -1508,6 +1521,18 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
  private:
   void InitializeStateFromModelFileGraphProto();
 
+  // Private method used to setup external initializer properly during model save,
+  // this external initializer could be oroginal initializer or prepacked initializer.
+  static void SetUpExternalInitializer(const Graph::OffsetAlignmentInfo& align_info,
+                                       size_t tensor_bytes_size,
+                                       int64_t& external_offset,
+                                       std::ofstream& external_stream,
+                                       gsl::span<const uint8_t> raw_data,
+                                       ONNX_NAMESPACE::TensorProto& output_proto,
+                                       const std::filesystem::path& external_file_path,
+                                       const ONNX_NAMESPACE::TensorProto& initializer,
+                                       bool is_prepacked);
+
   // Add node with specified <node_proto>.
   Node& AddNode(const ONNX_NAMESPACE::NodeProto& node_proto,
                 const ArgNameToTypeMap& name_to_type);
```
include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h

Lines changed: 6 additions & 0 deletions

```diff
@@ -246,6 +246,12 @@ static const char* const kOrtSessionOptionsDisableCPUEPFallback = "session.disab
 static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFileName =
     "session.optimized_model_external_initializers_file_name";
 
+// Use this config when save prepacked constant initializers to onnx external data file.
+// Default is not save prepacked initializers to onnx data file.
+// Sample usage: sess_options.add_session_config_entry('session.save_prepacked_constant_initializers', "1")
+static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
+    "session.save_prepacked_constant_initializers";
+
 // Use this config to control the minimum size of the initializer when externalizing it during serialization
 static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
     "session.optimized_model_external_initializers_min_size_in_bytes";
```

onnxruntime/contrib_ops/cpu/bert/attention.cc

Lines changed: 2 additions & 0 deletions

```diff
@@ -30,6 +30,7 @@ class Attention : public OpKernel, public AttentionCPUBase {
   Status Compute(OpKernelContext* context) const override;
 
   Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                  /*out*/ bool& is_packed,
                  /*out*/ PrePackedWeights* prepacked_weights) override;
 
@@ -101,6 +102,7 @@ bool Attention<T>::IsPackWeightsSuccessful(int qkv_index,
 
 template <typename T>
 Status Attention<T>::PrePack(const Tensor& weights, int input_idx, AllocatorPtr alloc,
+                             bool /*save_prepacked_initializers*/,
                              /*out*/ bool& is_packed,
                              /*out*/ PrePackedWeights* prepacked_weights) {
   /* The PrePack() massages the weights to speed up Compute(), there is an option to
```

onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc

Lines changed: 2 additions & 0 deletions

```diff
@@ -24,6 +24,7 @@ class QAttention : public OpKernel, public AttentionCPUBase {
   Status Compute(OpKernelContext* context) const override;
 
   Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                  bool& /*out*/ is_packed,
                  /*out*/ PrePackedWeights* prepacked_weights) override;
 
@@ -58,6 +59,7 @@ QAttention<T>::QAttention(const OpKernelInfo& info) : OpKernel(info), AttentionC
 
 template <typename T>
 Status QAttention<T>::PrePack(const Tensor& weights, int input_idx, AllocatorPtr alloc,
+                              bool /*save_prepacked_initializers*/,
                               /*out*/ bool& is_packed,
                               /*out*/ PrePackedWeights* prepacked_weights) {
   if (1 != input_idx) {
```

onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_lstm.cc

Lines changed: 2 additions & 1 deletion

```diff
@@ -13,7 +13,7 @@ class DynamicQuantizeLSTM : public OpKernel, public LSTMBase {
   DynamicQuantizeLSTM(const OpKernelInfo& info) : OpKernel(info), LSTMBase(info) {}
 
   Status PrePack(const Tensor& tensor, int input_idx,
-                 AllocatorPtr alloc, /*out*/ bool& is_packed,
+                 AllocatorPtr alloc, bool save_prepacked_initializers, /*out*/ bool& is_packed,
                  /*out*/ PrePackedWeights* prepacked_weights) override;
 
   Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers,
@@ -91,6 +91,7 @@ static void UseSharedPrePackedBuffersImpl(std::vector<BufferUniquePtr>& prepacke
 }
 
 Status DynamicQuantizeLSTM::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                                    bool /*save_prepacked_initializers*/,
                                     /*out*/ bool& is_packed,
                                     /*out*/ PrePackedWeights* prepacked_weights) {
   is_packed = false;
```

onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc

Lines changed: 56 additions & 0 deletions

```diff
@@ -98,12 +98,19 @@ class MatMulNBits final : public OpKernel {
   Status Compute(OpKernelContext* context) const override;
 
   Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 bool save_prepacked_initializers,
                  /*out*/ bool& is_packed,
                  /*out*/ PrePackedWeights* prepacked_weights) override;
 
+  void ConvertPrepackWeightIntoTensor(const onnxruntime::Tensor& tensor, int input_idx);
+
   Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers, int input_idx,
                                    /*out*/ bool& used_shared_buffers) override;
 
+  std::optional<Tensor> GetPrePackTensor(int /*input_idx*/) override;
+
+  Status SetPrePackTensor(int input_idx, const Tensor& pre_packed_tensor) override;
+
  private:
   const size_t K_;
   const size_t N_;
@@ -119,6 +126,8 @@ class MatMulNBits final : public OpKernel {
   size_t packed_b_size_{0};
   IAllocatorUniquePtr<float> scales_fp32_{};
   IAllocatorUniquePtr<float> bias_fp32_{};
+  std::optional<Tensor> packed_tensor_{std::nullopt};
+  MLDataType prepack_tensor_data_type_;
 
   bool has_zp_input_{false};
 
@@ -148,8 +157,22 @@ class MatMulNBits final : public OpKernel {
   }
 };
 
+template <typename T1>
+void MatMulNBits<T1>::ConvertPrepackWeightIntoTensor(const onnxruntime::Tensor& tensor, int input_idx) {
+  if (input_idx == InputIndex::B) {
+    prepack_tensor_data_type_ = tensor.DataType();
+  }
+
+  TensorShapeVector weights_dims = {static_cast<int64_t>((packed_b_size_ - 1) / prepack_tensor_data_type_->Size()) + 1};
+  packed_tensor_ = Tensor(prepack_tensor_data_type_,
+                          TensorShape(weights_dims),
+                          packed_b_.get(),
+                          OrtMemoryInfo(CPU, OrtAllocatorType::OrtDeviceAllocator));
+}
+
 template <typename T1>
 Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
+                                bool save_prepacked_initializers,
                                 /*out*/ bool& is_packed,
                                 /*out*/ PrePackedWeights* prepacked_weights) {
   ORT_UNUSED_PARAMETER(prepacked_weights);
@@ -185,11 +208,16 @@ Status MatMulNBits<T1>::PrePack(const Tensor& tensor, int input_idx, /*out*/ All
 #endif  // MLAS_TARGET_AMD64_IX86
   }
 
+  if (save_prepacked_initializers) {
+    ConvertPrepackWeightIntoTensor(tensor, input_idx);
+  }
+
   return Status::OK();
 }
 
 template <>
 Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc,
+                                       bool save_prepacked_initializers,
                                        /*out*/ bool& is_packed,
                                        /*out*/ PrePackedWeights* prepacked_weights) {
   ORT_UNUSED_PARAMETER(prepacked_weights);
@@ -239,6 +267,34 @@ Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*ou
 #endif  // MLAS_TARGET_AMD64_IX86
   }
 
+  if (save_prepacked_initializers) {
+    ConvertPrepackWeightIntoTensor(tensor, input_idx);
+  }
+
+  return Status::OK();
+}
+
+template <typename T1>
+std::optional<Tensor> MatMulNBits<T1>::GetPrePackTensor(int input_idx) {
+  // For this kernel, prepack is performed on input_B, and possibly scales, zeros_points.
+  // During compute process, scales and zeros_points will keep as it is and only use prepacked
+  // buffer to replace input_B.
+  // Inorder to cope with this logic, we need to return latest prepacked buffer and only serialize
+  // the latest one. So, we need to always return packed_tensor_ here not only for input_B.
+  ORT_UNUSED_PARAMETER(input_idx);
+  return std::move(packed_tensor_);
+}
+
+template <typename T1>
+Status MatMulNBits<T1>::SetPrePackTensor(int input_idx, const Tensor& pre_packed_tensor) {
+  if (input_idx == 1) {
+    // pre_packed_tensor is constant initialized tensor and its lifecycle is managed by session_state,
+    // session_state will release memory from pre_packed_tensor. packed_b_ will not release memory so
+    // pass empty/default buffer deleter here.
+    // const_cast here is temporary, will fix in follow up PR.
+    packed_b_ = BufferUniquePtr(const_cast<void*>(pre_packed_tensor.DataRaw()), BufferDeleter());
+  }
+
   return Status::OK();
 }
 
```
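The shape computed in `ConvertPrepackWeightIntoTensor` above, `(packed_b_size_ - 1) / Size() + 1`, is a ceiling division: the packed byte buffer is exposed as a whole number of elements of the weight's data type. A standalone sanity check of that arithmetic (illustrative values, not ORT code):

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

int main() {
  // ceil(bytes / elem_size) via (bytes - 1) / elem_size + 1, valid for bytes >= 1.
  auto ceil_elems = [](size_t bytes, size_t elem_size) {
    return static_cast<int64_t>((bytes - 1) / elem_size) + 1;
  };
  assert(ceil_elems(16, 2) == 8);  // exact multiple: 16 bytes of fp16 -> 8 elements
  assert(ceil_elems(17, 2) == 9);  // a trailing byte still occupies one more element
  assert(ceil_elems(1, 4) == 1);   // any nonzero size is at least one element
  return 0;
}
```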

onnxruntime/contrib_ops/cpu/skip_layer_norm.cc

Lines changed: 1 addition & 0 deletions

```diff
@@ -278,6 +278,7 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const {
 
 template <typename T, bool simplified>
 Status SkipLayerNorm<T, simplified>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                                             bool /*save_prepacked_initializers*/,
                                              bool& is_packed, PrePackedWeights* prepacked_weights) {
   ORT_UNUSED_PARAMETER(prepacked_weights);
```

onnxruntime/contrib_ops/cpu/skip_layer_norm.h

Lines changed: 1 addition & 1 deletion

```diff
@@ -16,7 +16,7 @@ class SkipLayerNorm final : public OpKernel {
   SkipLayerNorm(const OpKernelInfo& op_kernel_info);
   Status Compute(OpKernelContext* p_op_kernel_context) const override;
 
-  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool save_prepacked_initializers,
                  bool& is_packed, PrePackedWeights* prepacked_weights) override;
 
  private:
```

onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc

Lines changed: 1 addition & 0 deletions

```diff
@@ -95,6 +95,7 @@ GroupNorm::GroupNorm(const OpKernelInfo& op_info) : CudaKernel(op_info) {
 }
 
 Status GroupNorm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr /*alloc*/,
+                          bool /*save_prepacked_initializers*/,
                           bool& is_packed, PrePackedWeights* /*prepacked_weights*/) {
   is_packed = false;
```
