From 3661f9c0c34ef3b381286621428d908955f1b1e1 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Mon, 7 Jul 2025 08:22:21 -0300 Subject: [PATCH 1/2] feat: support specifying quant types for specific tensors on conversion Uses a regex syntax similar to the llama.cpp's tensor overrides. --- examples/cli/main.cpp | 14 +++++++++- model.cpp | 65 +++++++++++++++++++++++++++++++++++++++---- model.h | 2 +- stable-diffusion.h | 2 +- 4 files changed, 74 insertions(+), 9 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index c7db3708b..bfd2091e0 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -87,6 +87,7 @@ struct SDParams { std::string stacked_id_embeddings_path; std::string input_id_images_path; sd_type_t wtype = SD_TYPE_COUNT; + std::string tensor_wtype; std::string lora_model_dir; std::string output_path = "output.png"; std::string input_path; @@ -223,6 +224,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n"); printf(" --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\n"); printf(" If not specified, the default is the type of the weight file\n"); + printf(" --tensor-type [EXPRESSION] weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")\n"); printf(" --lora-model-dir [DIR] lora model directory\n"); printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n"); printf(" --mask [MASK] path to the mask image, required by img2img with mask\n"); @@ -404,6 +406,12 @@ void parse_args(int argc, const char** argv, SDParams& params) { valid_types.c_str()); exit(1); } + } else if (arg == "--tensor-type") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.tensor_wtype = argv[i]; } else if (arg == "--lora-model-dir") { if (++i >= argc) { invalid_arg = true; @@ -733,6 +741,10 @@ void parse_args(int argc, const char** argv, SDParams& params) { exit(1); } + if (params.mode != CONVERT && params.tensor_wtype.size() > 0) { + fprintf(stderr, "warning: --tensor-type is currently supported only for conversion\n"); + } + if (params.seed < 0) { srand((int)time(NULL)); params.seed = rand(); @@ -845,7 +857,7 @@ int main(int argc, const char* argv[]) { } if (params.mode == CONVERT) { - bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype); + bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype, params.tensor_wtype.c_str()); if (!success) { fprintf(stderr, "convert '%s'/'%s' to '%s' failed\n", diff --git a/model.cpp b/model.cpp index 85c959057..40dc68d3c 100644 --- a/model.cpp +++ b/model.cpp @@ -2088,6 +2088,45 @@ bool ModelLoader::load_tensors(std::map& tenso return true; } +std::vector > parse_quant_overrides (const std::string & overrides) +{ + std::vector > result; + for (const auto & item : splitString(overrides, ',')) { + if (item.size() == 0) + continue; + std::string::size_type pos = item.find('='); + if (pos == std::string::npos) { + LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str()); + continue; + } + std::string tensor_pattern = item.substr(0, pos); + std::string quant_name = item.substr(pos + 1); + + ggml_type over_type = GGML_TYPE_COUNT; + + if (quant_name == "f32") { + over_type = GGML_TYPE_F32; + } + else { + for (size_t i = 0; i < SD_TYPE_COUNT; i++) { + auto trait = ggml_get_type_traits((ggml_type)i); + if (trait->to_float && trait->type_size && 
quant_name == trait->type_name) { + over_type = (ggml_type)i; + } + } + } + + if (over_type != GGML_TYPE_COUNT) { + result.emplace_back(tensor_pattern, over_type); + } + else { + LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str()); + } + + } + return result; +} + bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type) { const std::string& name = tensor_storage.name; if (type != GGML_TYPE_COUNT) { @@ -2119,7 +2158,7 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage return false; } -bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) { +bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type, const char * overrides) { auto backend = ggml_backend_cpu_init(); size_t mem_size = 1 * 1024 * 1024; // for padding mem_size += tensor_storages.size() * ggml_tensor_overhead(); @@ -2129,12 +2168,25 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type gguf_context* gguf_ctx = gguf_init_empty(); + if (overrides == nullptr) + overrides = ""; + auto quant_overrides = parse_quant_overrides(overrides); + auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { const std::string& name = tensor_storage.name; - ggml_type tensor_type = tensor_storage.type; - if (tensor_should_be_converted(tensor_storage, type)) { - tensor_type = type; + ggml_type change_type = type; + + for (const auto & quant_override : quant_overrides) { + std::regex pattern(quant_override.first); + if (std::regex_search(name, pattern)) { + change_type = quant_override.second; + break; + } + } + + if (tensor_should_be_converted(tensor_storage, change_type)) { + tensor_type = change_type; } ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); @@ -2193,7 +2245,8 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type) return mem_size; } -bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) { +bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type, + const char * output_tensor_type) { ModelLoader model_loader; if (!model_loader.init_from_file(input_path)) { @@ -2207,6 +2260,6 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa return false; } } - bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type); + bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, output_tensor_type); return success; } diff --git a/model.h b/model.h index 82885dd96..5780e26bb 100644 --- a/model.h +++ b/model.h @@ -222,7 +222,7 @@ class ModelLoader { ggml_backend_t backend, std::set ignore_tensors = {}); - bool save_to_gguf_file(const std::string& file_path, ggml_type type); + bool save_to_gguf_file(const std::string& file_path, ggml_type type, const char * overrides); bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type); int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT); ~ModelLoader() = default; diff --git a/stable-diffusion.h b/stable-diffusion.h index b4d6fc327..92338e1d7 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -257,7 +257,7 @@ SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); -SD_API 
bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type); +SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type, const char * output_tensor_type); SD_API uint8_t* preprocess_canny(uint8_t* img, int width, From 4dc1aa9dba774e65d06a5d5aa47524487bfc928f Mon Sep 17 00:00:00 2001 From: leejet Date: Tue, 8 Jul 2025 00:07:37 +0800 Subject: [PATCH 2/2] unify code style --- examples/cli/main.cpp | 14 +++++----- model.cpp | 64 +++++++++++++++++++------------------------ model.h | 2 +- stable-diffusion.h | 2 +- 4 files changed, 37 insertions(+), 45 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index bfd2091e0..bb695c3bb 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -87,7 +87,7 @@ struct SDParams { std::string stacked_id_embeddings_path; std::string input_id_images_path; sd_type_t wtype = SD_TYPE_COUNT; - std::string tensor_wtype; + std::string tensor_type_rules; std::string lora_model_dir; std::string output_path = "output.png"; std::string input_path; @@ -224,7 +224,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n"); printf(" --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\n"); printf(" If not specified, the default is the type of the weight file\n"); - printf(" --tensor-type [EXPRESSION] weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")\n"); + printf(" --tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")\n"); printf(" --lora-model-dir [DIR] lora model directory\n"); printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n"); printf(" --mask [MASK] path to the mask image, required by img2img with mask\n"); @@ -406,12 +406,12 @@ void parse_args(int argc, const char** argv, SDParams& params) { valid_types.c_str()); exit(1); } - } else if (arg == "--tensor-type") { + } else if (arg == "--tensor-type-rules") { if (++i >= argc) { invalid_arg = true; break; } - params.tensor_wtype = argv[i]; + params.tensor_type_rules = argv[i]; } else if (arg == "--lora-model-dir") { if (++i >= argc) { invalid_arg = true; @@ -741,8 +741,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { exit(1); } - if (params.mode != CONVERT && params.tensor_wtype.size() > 0) { - fprintf(stderr, "warning: --tensor-type is currently supported only for conversion\n"); + if (params.mode != CONVERT && params.tensor_type_rules.size() > 0) { + fprintf(stderr, "warning: --tensor-type-rules is currently supported only for conversion\n"); } if (params.seed < 0) { @@ -857,7 +857,7 @@ int main(int argc, const char* argv[]) { } if (params.mode == CONVERT) { - bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype, params.tensor_wtype.c_str()); + bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype, params.tensor_type_rules.c_str()); if (!success) { fprintf(stderr, "convert '%s'/'%s' to '%s' failed\n", diff --git a/model.cpp b/model.cpp index 40dc68d3c..559c876c6 100644 --- a/model.cpp +++ b/model.cpp @@ -100,7 +100,7 @@ const char* unused_tensors[] = { "model_ema.diffusion_model", "embedding_manager", "denoiser.sigmas", - "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight", // only used during training + 
"text_encoders.t5xxl.transformer.encoder.embed_tokens.weight", // only used during training }; bool is_unused_tensor(std::string name) { @@ -1169,7 +1169,6 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const n_dims = 1; } - TensorStorage tensor_storage(prefix + name, type, ne, n_dims, file_index, ST_HEADER_SIZE_LEN + header_size_ + begin); tensor_storage.reverse_ne(); @@ -1914,7 +1913,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend }; int tensor_count = 0; int64_t t1 = ggml_time_ms(); - bool partial = false; + bool partial = false; for (auto& tensor_storage : processed_tensor_storages) { if (tensor_storage.file_index != file_index) { ++tensor_count; @@ -1997,9 +1996,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } } size_t tensor_max = processed_tensor_storages.size(); - int64_t t2 = ggml_time_ms(); + int64_t t2 = ggml_time_ms(); pretty_progress(++tensor_count, tensor_max, (t2 - t1) / 1000.0f); - t1 = t2; + t1 = t2; partial = tensor_count != tensor_max; } @@ -2088,10 +2087,9 @@ bool ModelLoader::load_tensors(std::map& tenso return true; } -std::vector > parse_quant_overrides (const std::string & overrides) -{ - std::vector > result; - for (const auto & item : splitString(overrides, ',')) { +std::vector> parse_tensor_type_rules(const std::string& tensor_type_rules) { + std::vector> result; + for (const auto& item : splitString(tensor_type_rules, ',')) { if (item.size() == 0) continue; std::string::size_type pos = item.find('='); @@ -2100,29 +2098,26 @@ std::vector > parse_quant_overrides (const std: continue; } std::string tensor_pattern = item.substr(0, pos); - std::string quant_name = item.substr(pos + 1); + std::string type_name = item.substr(pos + 1); - ggml_type over_type = GGML_TYPE_COUNT; + ggml_type tensor_type = GGML_TYPE_COUNT; - if (quant_name == "f32") { - over_type = GGML_TYPE_F32; - } - else { + if (type_name == "f32") { + tensor_type = GGML_TYPE_F32; + } else { for (size_t i = 0; i < SD_TYPE_COUNT; i++) { auto trait = ggml_get_type_traits((ggml_type)i); - if (trait->to_float && trait->type_size && quant_name == trait->type_name) { - over_type = (ggml_type)i; + if (trait->to_float && trait->type_size && type_name == trait->type_name) { + tensor_type = (ggml_type)i; } } } - if (over_type != GGML_TYPE_COUNT) { - result.emplace_back(tensor_pattern, over_type); - } - else { + if (tensor_type != GGML_TYPE_COUNT) { + result.emplace_back(tensor_pattern, tensor_type); + } else { LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str()); } - } return result; } @@ -2158,7 +2153,7 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage return false; } -bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type, const char * overrides) { +bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules_str) { auto backend = ggml_backend_cpu_init(); size_t mem_size = 1 * 1024 * 1024; // for padding mem_size += tensor_storages.size() * ggml_tensor_overhead(); @@ -2168,25 +2163,23 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type gguf_context* gguf_ctx = gguf_init_empty(); - if (overrides == nullptr) - overrides = ""; - auto quant_overrides = parse_quant_overrides(overrides); + auto tensor_type_rules = parse_tensor_type_rules(tensor_type_rules_str); auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> 
bool { const std::string& name = tensor_storage.name; - ggml_type tensor_type = tensor_storage.type; - ggml_type change_type = type; + ggml_type tensor_type = tensor_storage.type; + ggml_type dst_type = type; - for (const auto & quant_override : quant_overrides) { - std::regex pattern(quant_override.first); + for (const auto& tensor_type_rule : tensor_type_rules) { + std::regex pattern(tensor_type_rule.first); if (std::regex_search(name, pattern)) { - change_type = quant_override.second; + dst_type = tensor_type_rule.second; break; } } - if (tensor_should_be_converted(tensor_storage, change_type)) { - tensor_type = change_type; + if (tensor_should_be_converted(tensor_storage, dst_type)) { + tensor_type = dst_type; } ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); @@ -2245,8 +2238,7 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type) return mem_size; } -bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type, - const char * output_tensor_type) { +bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type, const char* tensor_type_rules) { ModelLoader model_loader; if (!model_loader.init_from_file(input_path)) { @@ -2260,6 +2252,6 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa return false; } } - bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, output_tensor_type); + bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, tensor_type_rules); return success; } diff --git a/model.h b/model.h index 5780e26bb..95c66319d 100644 --- a/model.h +++ b/model.h @@ -222,7 +222,7 @@ class ModelLoader { ggml_backend_t backend, std::set ignore_tensors = {}); - bool save_to_gguf_file(const std::string& file_path, ggml_type type, const char * overrides); + bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules); bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type); int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT); ~ModelLoader() = default; diff --git a/stable-diffusion.h b/stable-diffusion.h index 92338e1d7..212e1c918 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -257,7 +257,7 @@ SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); -SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type, const char * output_tensor_type); +SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type, const char* tensor_type_rules); SD_API uint8_t* preprocess_canny(uint8_t* img, int width,
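
---
Note (editorial, not part of the patches): the commit message and the `--tensor-type-rules` help text describe comma-separated `pattern=type` rules applied with `std::regex_search`, first match wins, falling back to the global `--type` when no rule matches. Below is a minimal standalone C++ sketch of that matching logic only; the rule strings and tensor names are illustrative assumptions, not values taken from the patches.

```cpp
// Sketch of the first-match-wins rule selection used by the patched
// save_to_gguf_file callback. Rules and tensor names here are made up
// for illustration; real rules come from --tensor-type-rules.
#include <cstdio>
#include <regex>
#include <string>
#include <utility>
#include <vector>

int main() {
    // Hypothetical parsed rules, e.g. from "^vae\\.=f16,model\\.=q8_0".
    std::vector<std::pair<std::string, std::string>> rules = {
        {"^vae\\.", "f16"},   // keep VAE weights in f16
        {"model\\.", "q8_0"}, // quantize diffusion-model weights to q8_0
    };

    const char* tensor_names[] = {
        "vae.decoder.conv_in.weight",
        "model.diffusion_model.input_blocks.0.0.weight",
        "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight",
    };

    for (const char* raw : tensor_names) {
        const std::string name = raw;
        std::string chosen = "(default --type)";
        for (const auto& rule : rules) {
            std::regex pattern(rule.first);
            if (std::regex_search(name, pattern)) { // first matching rule wins
                chosen = rule.second;
                break;
            }
        }
        std::printf("%s -> %s\n", name.c_str(), chosen.c_str());
    }
    return 0;
}
```

As in the patch, a tensor that matches no rule keeps the type requested by `--type` (subject to `tensor_should_be_converted`), and an entry whose type name is not recognized is skipped with a warning rather than aborting the conversion.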