Skip to content

Commit 48ddb75

Browse files
committed
[fix] use the NEOX rope type, and remove the permute & split steps from the conversion process
1 parent 3709961 commit 48ddb75

File tree

2 files changed

+5
-34
lines changed

2 files changed

+5
-34
lines changed

convert_hf_to_gguf.py

Lines changed: 4 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -7828,28 +7828,6 @@ def prepare_tensors(self):
78287828
class BailingMoeV2Model(TextModel):
78297829
model_arch = gguf.MODEL_ARCH.BAILINGMOE_V2
78307830

7831-
@staticmethod
7832-
def permute(
7833-
weights: Tensor, n_head: int, n_head_kv: int | None, rope_dim: int | None
7834-
):
7835-
if n_head_kv is not None and n_head != n_head_kv:
7836-
n_head = n_head_kv
7837-
if rope_dim is None:
7838-
rope_dim = weights.shape[0] // n_head
7839-
weights_rope, weights_nope = weights.reshape(
7840-
n_head, weights.shape[0] // n_head, *weights.shape[1:]
7841-
).split([rope_dim, weights.shape[0] // n_head - rope_dim], dim=1)
7842-
return torch.cat(
7843-
[
7844-
weights_rope.reshape(
7845-
n_head, 2, rope_dim // 2, *weights_rope.shape[2:]
7846-
)
7847-
.swapaxes(1, 2)
7848-
.reshape(weights_rope.shape),
7849-
weights_nope,
7850-
], dim=1
7851-
).reshape(weights.shape)
7852-
78537831
def set_vocab(self):
78547832
self._set_vocab_gpt2()
78557833

@@ -7889,7 +7867,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
78897867
if match and int(match.group(1)) >= block_count:
78907868
return []
78917869

7892-
rope_dim = int(self.hparams['partial_rotary_factor'] * self.hparams['head_dim'])
78937870
if name.endswith("query_key_value.weight"):
78947871
n_head = self.hparams["num_attention_heads"]
78957872
n_kv_head = self.hparams.get("num_key_value_heads")
@@ -7899,18 +7876,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
78997876
q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
79007877

79017878
return [
7902-
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeV2Model.permute(q, n_head, n_head, rope_dim)),
7903-
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeV2Model.permute(k, n_head, n_kv_head, rope_dim)),
7879+
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q),
7880+
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k),
79047881
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v)
79057882
]
7906-
elif "attention.key_layernorm" in name or "attention.query_layernorm" in name:
7907-
mapping = {
7908-
"attention.key_layernorm": "self_attn.key_layernorm",
7909-
"attention.query_layernorm": "self_attn.query_layernorm",
7910-
}
7911-
for k, v in mapping.items():
7912-
name = name.replace(k, v)
7913-
return [(self.map_tensor_name(name), BailingMoeV2Model.permute(data_torch, 1, 1, rope_dim))]
79147883
elif name.find("mlp.experts") != -1:
79157884
n_experts = self.hparams["num_experts"]
79167885
assert bid is not None
@@ -7945,6 +7914,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
79457914
pre_tensor_name_mapping = {
79467915
"attention.dense": "self_attn.dense",
79477916
"mlp.gate.expert_bias": "mlp.gate.e_score_correction.bias",
7917+
"attention.key_layernorm": "self_attn.key_layernorm",
7918+
"attention.query_layernorm": "self_attn.query_layernorm",
79487919
}
79497920
for k, v in pre_tensor_name_mapping.items():
79507921
name = name.replace(k, v)

src/llama-model.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19485,7 +19485,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
1948519485
case LLM_ARCH_GRANITE_HYBRID:
1948619486
case LLM_ARCH_CHAMELEON:
1948719487
case LLM_ARCH_BAILINGMOE:
19488-
case LLM_ARCH_BAILINGMOE_V2:
1948919488
case LLM_ARCH_NEO_BERT:
1949019489
case LLM_ARCH_SMOLLM3:
1949119490
case LLM_ARCH_ARCEE:
@@ -19539,6 +19538,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
1953919538
case LLM_ARCH_SMALLTHINKER:
1954019539
case LLM_ARCH_GLM4_MOE:
1954119540
case LLM_ARCH_SEED_OSS:
19541+
case LLM_ARCH_BAILINGMOE_V2:
1954219542
return LLAMA_ROPE_TYPE_NEOX;
1954319543

1954419544
case LLM_ARCH_QWEN2VL:

0 commit comments

Comments (0)