
Commit bb98b31

feat: Add conversion support in GraniteHybrid for non-hybrid (all attn)
This handles a configuration of the hparams in the GraniteHybrid architecture that devolves to the Granite (or GraniteMoe) architecture (i.e., Granite 3.x). It may be used for some models in the Granite 4 family, with the GraniteHybrid architecture acting as a superset arch. Rather than supporting it directly in the C++ graph, we simply coerce the architecture flag back to the correct "granite" or "granitemoe" architecture.

Branch: gabe-l-hart/GraniteNonHybridConversion

Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 432cf43
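The interesting wrinkle is that some GGUF metadata may already carry the old architecture name by the time the coercion happens, so the commit also rewrites any string KV values that embed it. A minimal standalone sketch of that rewrite (the dict and values here are illustrative, not the converter's real state):

# Hypothetical metadata dict, for illustration only; the real writer
# stores GGUFValue objects inside gguf_writer.kv_data.
old_arch, new_arch = "granitehybrid", "granite"
kv = {
    "general.architecture": old_arch,  # string value embedding the old arch
    "general.file_type": 1,            # non-string value, left untouched
}
# Same idea as the loop in the diff below: rewrite any string value
# that embeds the old architecture name.
for key, val in kv.items():
    if isinstance(val, str) and old_arch in val:
        kv[key] = val.replace(old_arch, new_arch)
print(kv["general.architecture"])  # -> "granite"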

File tree

1 file changed: +23 −2 lines changed


convert_hf_to_gguf.py

Lines changed: 23 additions & 2 deletions
@@ -7656,6 +7656,24 @@ def __init__(self, *args, **kwargs):
             if i not in self._attn_layers
         ]

+        # There are some models in this family that are non-hybrid, but keep the
+        # same parent class by setting all layers to "attention." If this is the
+        # case, the model architecture needs to be updated to a standard
+        # "granite" or "granitemoe" model
+        if not self._ssm_layers:
+            old_arch = self.gguf_writer.arch
+            has_experts = self.find_hparam(["num_experts_per_tok"])
+            new_arch = (
+                gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.GRANITE_MOE]
+                if has_experts else
+                gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.GRANITE]
+            )
+            self.gguf_writer.arch = new_arch
+            for kv_entry in self.gguf_writer.kv_data:
+                for kv_val in kv_entry.values():
+                    if isinstance(kv_val.value, str) and old_arch in kv_val.value:
+                        kv_val.value = kv_val.value.replace(old_arch, new_arch)
+
         # n_group and d_inner are used during reshape_tensors for mamba2
         # NOTE: Explicitly include hparam prefix prefix for d_model to
         # disambiguate with top-level head_dim
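Taken in isolation, the selection in the hunk above is a two-way switch on whether the config defines experts. A runnable sketch of the same decision (the function name and arch strings are hypothetical; the converter reads the real values from the HF config via find_hparam):

def pick_arch(ssm_layers: list[int], num_experts_per_tok: int | None) -> str:
    """Mirror of the coercion logic: hybrid stays hybrid, otherwise
    pick the MoE or dense Granite architecture."""
    if ssm_layers:
        return "granitehybrid"  # assumed name for the hybrid arch
    return "granitemoe" if num_experts_per_tok else "granite"

assert pick_arch([], None) == "granite"
assert pick_arch([], 8) == "granitemoe"
assert pick_arch([1, 2, 5], None) == "granitehybrid"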
@@ -7740,8 +7758,11 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_dimension_count(rope_dim)
         self.gguf_writer.add_head_count_kv(head_count_kv_vec)

-        ## If Bamba, use rope, otherwise don't
-        use_rope = "BambaForCausalLM" in self.hparams["architectures"]
+        ## If Bamba or non-hybrid, use rope, otherwise don't
+        use_rope = (
+            "BambaForCausalLM" in self.hparams["architectures"]
+            or not self._ssm_layers
+        )
         self.gguf_writer.add_rope_scaling_finetuned(use_rope)
         if not use_rope:
             self.gguf_writer.add_context_length(2**20)
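A consequence of this second hunk: non-hybrid models now take the rope path, so they keep their trained context length instead of the 2**20-token default applied when rope is off. A small sketch of the predicate (the config values are made up for illustration):

architectures = ["GraniteMoeHybridForCausalLM"]  # hypothetical HF config entry
ssm_layers: list[int] = []                       # all-attention => non-hybrid

use_rope = "BambaForCausalLM" in architectures or not ssm_layers
assert use_rope  # rope stays on; context length is left at the trained value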
