From 3661f9c0c34ef3b381286621428d908955f1b1e1 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Mon, 7 Jul 2025 08:22:21 -0300 Subject: [PATCH 1/2] feat: support specifying quant types for specific tensors on conversion Uses a regex syntax similar to the llama.cpp's tensor overrides. --- examples/cli/main.cpp | 14 +++++++++- model.cpp | 65 +++++++++++++++++++++++++++++++++++++++---- model.h | 2 +- stable-diffusion.h | 2 +- 4 files changed, 74 insertions(+), 9 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index c7db3708b..bfd2091e0 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -87,6 +87,7 @@ struct SDParams { std::string stacked_id_embeddings_path; std::string input_id_images_path; sd_type_t wtype = SD_TYPE_COUNT; + std::string tensor_wtype; std::string lora_model_dir; std::string output_path = "output.png"; std::string input_path; @@ -223,6 +224,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n"); printf(" --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\n"); printf(" If not specified, the default is the type of the weight file\n"); + printf(" --tensor-type [EXPRESSION] weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")\n"); printf(" --lora-model-dir [DIR] lora model directory\n"); printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n"); printf(" --mask [MASK] path to the mask image, required by img2img with mask\n"); @@ -404,6 +406,12 @@ void parse_args(int argc, const char** argv, SDParams& params) { valid_types.c_str()); exit(1); } + } else if (arg == "--tensor-type") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.tensor_wtype = argv[i]; } else if (arg == "--lora-model-dir") { if (++i >= argc) { invalid_arg = true; @@ -733,6 +741,10 @@ void parse_args(int argc, const char** argv, SDParams& params) { exit(1); } + if (params.mode != CONVERT && params.tensor_wtype.size() > 0) { + fprintf(stderr, "warning: --tensor-type is currently supported only for conversion\n"); + } + if (params.seed < 0) { srand((int)time(NULL)); params.seed = rand(); @@ -845,7 +857,7 @@ int main(int argc, const char* argv[]) { } if (params.mode == CONVERT) { - bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype); + bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype, params.tensor_wtype.c_str()); if (!success) { fprintf(stderr, "convert '%s'/'%s' to '%s' failed\n", diff --git a/model.cpp b/model.cpp index 85c959057..40dc68d3c 100644 --- a/model.cpp +++ b/model.cpp @@ -2088,6 +2088,45 @@ bool ModelLoader::load_tensors(std::map& tenso return true; } +std::vector > parse_quant_overrides (const std::string & overrides) +{ + std::vector > result; + for (const auto & item : splitString(overrides, ',')) { + if (item.size() == 0) + continue; + std::string::size_type pos = item.find('='); + if (pos == std::string::npos) { + LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str()); + continue; + } + std::string tensor_pattern = item.substr(0, pos); + std::string quant_name = item.substr(pos + 1); + + ggml_type over_type = GGML_TYPE_COUNT; + + if (quant_name == "f32") { + over_type = GGML_TYPE_F32; + } + else { + for (size_t i = 0; i < SD_TYPE_COUNT; i++) { + auto trait = ggml_get_type_traits((ggml_type)i); + if (trait->to_float && trait->type_size && 
quant_name == trait->type_name) { + over_type = (ggml_type)i; + } + } + } + + if (over_type != GGML_TYPE_COUNT) { + result.emplace_back(tensor_pattern, over_type); + } + else { + LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str()); + } + + } + return result; +} + bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type) { const std::string& name = tensor_storage.name; if (type != GGML_TYPE_COUNT) { @@ -2119,7 +2158,7 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage return false; } -bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) { +bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type, const char * overrides) { auto backend = ggml_backend_cpu_init(); size_t mem_size = 1 * 1024 * 1024; // for padding mem_size += tensor_storages.size() * ggml_tensor_overhead(); @@ -2129,12 +2168,25 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type gguf_context* gguf_ctx = gguf_init_empty(); + if (overrides == nullptr) + overrides = ""; + auto quant_overrides = parse_quant_overrides(overrides); + auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { const std::string& name = tensor_storage.name; - ggml_type tensor_type = tensor_storage.type; - if (tensor_should_be_converted(tensor_storage, type)) { - tensor_type = type; + ggml_type change_type = type; + + for (const auto & quant_override : quant_overrides) { + std::regex pattern(quant_override.first); + if (std::regex_search(name, pattern)) { + change_type = quant_override.second; + break; + } + } + + if (tensor_should_be_converted(tensor_storage, change_type)) { + tensor_type = change_type; } ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); @@ -2193,7 +2245,8 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type) return mem_size; } -bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) { +bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type, + const char * output_tensor_type) { ModelLoader model_loader; if (!model_loader.init_from_file(input_path)) { @@ -2207,6 +2260,6 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa return false; } } - bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type); + bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, output_tensor_type); return success; } diff --git a/model.h b/model.h index 82885dd96..5780e26bb 100644 --- a/model.h +++ b/model.h @@ -222,7 +222,7 @@ class ModelLoader { ggml_backend_t backend, std::set ignore_tensors = {}); - bool save_to_gguf_file(const std::string& file_path, ggml_type type); + bool save_to_gguf_file(const std::string& file_path, ggml_type type, const char * overrides); bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type); int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT); ~ModelLoader() = default; diff --git a/stable-diffusion.h b/stable-diffusion.h index b4d6fc327..92338e1d7 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -257,7 +257,7 @@ SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); -SD_API 
bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type); +SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type, const char * output_tensor_type); SD_API uint8_t* preprocess_canny(uint8_t* img, int width, From 4dc1aa9dba774e65d06a5d5aa47524487bfc928f Mon Sep 17 00:00:00 2001 From: leejet Date: Tue, 8 Jul 2025 00:07:37 +0800 Subject: [PATCH 2/2] unify code style --- examples/cli/main.cpp | 14 +++++----- model.cpp | 64 +++++++++++++++++++------------------------ model.h | 2 +- stable-diffusion.h | 2 +- 4 files changed, 37 insertions(+), 45 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index bfd2091e0..bb695c3bb 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -87,7 +87,7 @@ struct SDParams { std::string stacked_id_embeddings_path; std::string input_id_images_path; sd_type_t wtype = SD_TYPE_COUNT; - std::string tensor_wtype; + std::string tensor_type_rules; std::string lora_model_dir; std::string output_path = "output.png"; std::string input_path; @@ -224,7 +224,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n"); printf(" --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\n"); printf(" If not specified, the default is the type of the weight file\n"); - printf(" --tensor-type [EXPRESSION] weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")\n"); + printf(" --tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")\n"); printf(" --lora-model-dir [DIR] lora model directory\n"); printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n"); printf(" --mask [MASK] path to the mask image, required by img2img with mask\n"); @@ -406,12 +406,12 @@ void parse_args(int argc, const char** argv, SDParams& params) { valid_types.c_str()); exit(1); } - } else if (arg == "--tensor-type") { + } else if (arg == "--tensor-type-rules") { if (++i >= argc) { invalid_arg = true; break; } - params.tensor_wtype = argv[i]; + params.tensor_type_rules = argv[i]; } else if (arg == "--lora-model-dir") { if (++i >= argc) { invalid_arg = true; @@ -741,8 +741,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { exit(1); } - if (params.mode != CONVERT && params.tensor_wtype.size() > 0) { - fprintf(stderr, "warning: --tensor-type is currently supported only for conversion\n"); + if (params.mode != CONVERT && params.tensor_type_rules.size() > 0) { + fprintf(stderr, "warning: --tensor-type-rules is currently supported only for conversion\n"); } if (params.seed < 0) { @@ -857,7 +857,7 @@ int main(int argc, const char* argv[]) { } if (params.mode == CONVERT) { - bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype, params.tensor_wtype.c_str()); + bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype, params.tensor_type_rules.c_str()); if (!success) { fprintf(stderr, "convert '%s'/'%s' to '%s' failed\n", diff --git a/model.cpp b/model.cpp index 40dc68d3c..559c876c6 100644 --- a/model.cpp +++ b/model.cpp @@ -100,7 +100,7 @@ const char* unused_tensors[] = { "model_ema.diffusion_model", "embedding_manager", "denoiser.sigmas", - "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight", // only used during training + 
"text_encoders.t5xxl.transformer.encoder.embed_tokens.weight", // only used during training }; bool is_unused_tensor(std::string name) { @@ -1169,7 +1169,6 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const n_dims = 1; } - TensorStorage tensor_storage(prefix + name, type, ne, n_dims, file_index, ST_HEADER_SIZE_LEN + header_size_ + begin); tensor_storage.reverse_ne(); @@ -1914,7 +1913,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend }; int tensor_count = 0; int64_t t1 = ggml_time_ms(); - bool partial = false; + bool partial = false; for (auto& tensor_storage : processed_tensor_storages) { if (tensor_storage.file_index != file_index) { ++tensor_count; @@ -1997,9 +1996,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } } size_t tensor_max = processed_tensor_storages.size(); - int64_t t2 = ggml_time_ms(); + int64_t t2 = ggml_time_ms(); pretty_progress(++tensor_count, tensor_max, (t2 - t1) / 1000.0f); - t1 = t2; + t1 = t2; partial = tensor_count != tensor_max; } @@ -2088,10 +2087,9 @@ bool ModelLoader::load_tensors(std::map& tenso return true; } -std::vector > parse_quant_overrides (const std::string & overrides) -{ - std::vector > result; - for (const auto & item : splitString(overrides, ',')) { +std::vector> parse_tensor_type_rules(const std::string& tensor_type_rules) { + std::vector> result; + for (const auto& item : splitString(tensor_type_rules, ',')) { if (item.size() == 0) continue; std::string::size_type pos = item.find('='); @@ -2100,29 +2098,26 @@ std::vector > parse_quant_overrides (const std: continue; } std::string tensor_pattern = item.substr(0, pos); - std::string quant_name = item.substr(pos + 1); + std::string type_name = item.substr(pos + 1); - ggml_type over_type = GGML_TYPE_COUNT; + ggml_type tensor_type = GGML_TYPE_COUNT; - if (quant_name == "f32") { - over_type = GGML_TYPE_F32; - } - else { + if (type_name == "f32") { + tensor_type = GGML_TYPE_F32; + } else { for (size_t i = 0; i < SD_TYPE_COUNT; i++) { auto trait = ggml_get_type_traits((ggml_type)i); - if (trait->to_float && trait->type_size && quant_name == trait->type_name) { - over_type = (ggml_type)i; + if (trait->to_float && trait->type_size && type_name == trait->type_name) { + tensor_type = (ggml_type)i; } } } - if (over_type != GGML_TYPE_COUNT) { - result.emplace_back(tensor_pattern, over_type); - } - else { + if (tensor_type != GGML_TYPE_COUNT) { + result.emplace_back(tensor_pattern, tensor_type); + } else { LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str()); } - } return result; } @@ -2158,7 +2153,7 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage return false; } -bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type, const char * overrides) { +bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules_str) { auto backend = ggml_backend_cpu_init(); size_t mem_size = 1 * 1024 * 1024; // for padding mem_size += tensor_storages.size() * ggml_tensor_overhead(); @@ -2168,25 +2163,23 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type gguf_context* gguf_ctx = gguf_init_empty(); - if (overrides == nullptr) - overrides = ""; - auto quant_overrides = parse_quant_overrides(overrides); + auto tensor_type_rules = parse_tensor_type_rules(tensor_type_rules_str); auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> 
bool { const std::string& name = tensor_storage.name; - ggml_type tensor_type = tensor_storage.type; - ggml_type change_type = type; + ggml_type tensor_type = tensor_storage.type; + ggml_type dst_type = type; - for (const auto & quant_override : quant_overrides) { - std::regex pattern(quant_override.first); + for (const auto& tensor_type_rule : tensor_type_rules) { + std::regex pattern(tensor_type_rule.first); if (std::regex_search(name, pattern)) { - change_type = quant_override.second; + dst_type = tensor_type_rule.second; break; } } - if (tensor_should_be_converted(tensor_storage, change_type)) { - tensor_type = change_type; + if (tensor_should_be_converted(tensor_storage, dst_type)) { + tensor_type = dst_type; } ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); @@ -2245,8 +2238,7 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type) return mem_size; } -bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type, - const char * output_tensor_type) { +bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type, const char* tensor_type_rules) { ModelLoader model_loader; if (!model_loader.init_from_file(input_path)) { @@ -2260,6 +2252,6 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa return false; } } - bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, output_tensor_type); + bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, tensor_type_rules); return success; } diff --git a/model.h b/model.h index 5780e26bb..95c66319d 100644 --- a/model.h +++ b/model.h @@ -222,7 +222,7 @@ class ModelLoader { ggml_backend_t backend, std::set ignore_tensors = {}); - bool save_to_gguf_file(const std::string& file_path, ggml_type type, const char * overrides); + bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules); bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type); int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT); ~ModelLoader() = default; diff --git a/stable-diffusion.h b/stable-diffusion.h index 92338e1d7..212e1c918 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -257,7 +257,7 @@ SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); -SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type, const char * output_tensor_type); +SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type, const char* tensor_type_rules); SD_API uint8_t* preprocess_canny(uint8_t* img, int width,
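
---
Note (editorial, not part of the patches): the commit message and the `--tensor-type-rules` help text describe comma-separated `pattern=type` rules applied with `std::regex_search`, first match wins, falling back to the global `--type` when no rule matches. Below is a minimal standalone C++ sketch of that matching logic only; the rule strings and tensor names are illustrative assumptions, not values taken from the patches.

```cpp
// Sketch of the first-match-wins rule selection used by the patched
// save_to_gguf_file callback. Rules and tensor names here are made up
// for illustration; real rules come from --tensor-type-rules.
#include <cstdio>
#include <regex>
#include <string>
#include <utility>
#include <vector>

int main() {
    // Hypothetical parsed rules, e.g. from "^vae\\.=f16,model\\.=q8_0".
    std::vector<std::pair<std::string, std::string>> rules = {
        {"^vae\\.", "f16"},   // keep VAE weights in f16
        {"model\\.", "q8_0"}, // quantize diffusion-model weights to q8_0
    };

    const char* tensor_names[] = {
        "vae.decoder.conv_in.weight",
        "model.diffusion_model.input_blocks.0.0.weight",
        "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight",
    };

    for (const char* raw : tensor_names) {
        const std::string name = raw;
        std::string chosen = "(default --type)";
        for (const auto& rule : rules) {
            std::regex pattern(rule.first);
            if (std::regex_search(name, pattern)) { // first matching rule wins
                chosen = rule.second;
                break;
            }
        }
        std::printf("%s -> %s\n", name.c_str(), chosen.c_str());
    }
    return 0;
}
```

As in the patch, a tensor that matches no rule keeps the type requested by `--type` (subject to `tensor_should_be_converted`), and an entry whose type name is not recognized is skipped with a warning rather than aborting the conversion.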