From 8865bdf390abb5948416fb693831436f73f5432f Mon Sep 17 00:00:00 2001
From: zRzRzRzRzRzRzR <2448370773@qq.com>
Date: Thu, 10 Apr 2025 16:24:52 +0800
Subject: [PATCH 1/7] GLM-4-0414
---
README.md | 1 +
convert_hf_to_gguf.py | 39 ++++++-
convert_hf_to_gguf_update.py | 1 +
examples/server/README.md | 76 ++++++------
gguf-py/gguf/constants.py | 17 +++
gguf-py/gguf/tensor_mapping.py | 4 +-
src/llama-arch.cpp | 20 ++++
src/llama-arch.h | 3 +
src/llama-model.cpp | 204 +++++++++++++++++++++++++++++++++
src/llama-vocab.cpp | 1 +
10 files changed, 325 insertions(+), 41 deletions(-)
diff --git a/README.md b/README.md
index e56042f1cb2c9..a129d27d54c91 100644
--- a/README.md
+++ b/README.md
@@ -97,6 +97,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat)
+- [x] [GLM-4-0414](https://huggingface.co/collections/THUDM/glm-4-0414-67f3cbcb34dd9d252707cb2e)
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 656dc987780c9..1fa2c92c69072 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -717,6 +717,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
# ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
res = "llama4"
+ if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+ # ref: https://huggingface.co/THUDM/glm-4-9b-hf
+ res = "glm4"
if res is None:
logger.warning("\n")
@@ -4882,6 +4885,41 @@ def prepare_tensors(self):
super().prepare_tensors()
self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
+@Model.register("Glm4ForCausalLM")
+class Glm4Model(Model):
+ model_arch = gguf.MODEL_ARCH.GLM4
+
+ def set_vocab(self):
+ self._set_vocab_gpt2()
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+ if self.hparams["rope_scaling"].get("type") == "yarn":
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[Tuple[str, Tensor]]:
+ if "gate_up_proj" in name:
+ match = re.match(r"model\.layers\.(\d+)\.gate_up_proj\.weight", name)
+ if match:
+ bid = int(match.group(1))
+ return [(f"blk.{bid}.ffn_up.weight", data_torch)]
+
+ if "post_self_attn_layernorm" in name:
+ match = re.match(r"model\.layers\.(\d+)\.post_self_attn_layernorm\.weight", name)
+ if match:
+ bid = int(match.group(1))
+ return [(f"blk.{bid}.post_attn_norm.weight", data_torch)]
+
+ if "post_mlp_layernorm" in name:
+ match = re.match(r"model\.layers\.(\d+)\.post_mlp_layernorm\.weight", name)
+ if match:
+ bid = int(match.group(1))
+ return [(f"blk.{bid}.post_mlp_norm.weight", data_torch)]
+
+ return super().modify_tensors(data_torch, name, bid)
@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
class ChatGLMModel(Model):
@@ -5551,7 +5589,6 @@ def main() -> None:
with torch.inference_mode():
output_type = ftype_map[args.outtype]
model_architecture = hparams["architectures"][0]
-
try:
model_class = Model.from_model_architecture(model_architecture)
except NotImplementedError:
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index ce6104da4a0e9..492767ce15fda 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -81,6 +81,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
+ {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
{"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
diff --git a/examples/server/README.md b/examples/server/README.md
index a2a0903261e31..22aa4dc700648 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -132,44 +132,44 @@ The project is under active development, and we are [looking for feedback and co
**Example-specific params**
-| Argument | Explanation |
-| -------- | ----------- |
-| `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
-| `-sp, --special` | special tokens output enabled (default: false) |
-| `--no-warmup` | skip warming up the model with an empty run |
-| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
-| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
-| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
-| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
-| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
-| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
-| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
-| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
-| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
-| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
-| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
-| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
-| `--api-key-file FNAME` | path to file containing API keys (default: none) |
-| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
-| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
-| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
-| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
-| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
-| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
-| `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
-| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
-| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
-| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/>|
-| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
-| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
-| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
-| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
-| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
-| `-devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
-| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
-| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
+| Argument | Explanation |
+| -------- |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
+| `-sp, --special` | special tokens output enabled (default: false) |
+| `--no-warmup` | skip warming up the model with an empty run |
+| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
+| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
+| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
+| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
+| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
+| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
+| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
+| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
+| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
+| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
+| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
+| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
+| `--api-key-file FNAME` | path to file containing API keys (default: none) |
+| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
+| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
+| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
+| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
+| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
+| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
+| `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
+| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
+| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
+| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4,glm-4-0414,chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/>|
+| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
+| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
+| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
+| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
+| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
+| `-devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
+| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
+| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 0410654dde2bd..49c2365e1d731 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -280,6 +280,7 @@ class MODEL_ARCH(IntEnum):
DEEPSEEK = auto()
DEEPSEEK2 = auto()
CHATGLM = auto()
+ GLM4 = auto()
BITNET = auto()
T5 = auto()
T5ENCODER = auto()
@@ -487,6 +488,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.DEEPSEEK: "deepseek",
MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.CHATGLM: "chatglm",
+ MODEL_ARCH.GLM4: "glm4",
MODEL_ARCH.BITNET: "bitnet",
MODEL_ARCH.T5: "t5",
MODEL_ARCH.T5ENCODER: "t5encoder",
@@ -1561,6 +1563,21 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
+ MODEL_ARCH.GLM4 : [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.ROPE_FREQS,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_QKV,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ ],
MODEL_ARCH.BITNET: [
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 50bef12e3dbe7..391b019278256 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -13,7 +13,7 @@ class TensorNameMap:
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
"transformer.word_embeddings", # falcon
"word_embeddings", # bloom
- "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2
+ "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414
"tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert nomic-bert
"language_model.embedding.word_embeddings", # persimmon
@@ -306,7 +306,7 @@ class TensorNameMap:
"h.{bid}.mlp.c_fc", # gpt2
"transformer.h.{bid}.mlp.fc1", # phi2
"model.layers.{bid}.mlp.fc1", # phi2
- "model.layers.{bid}.mlp.gate_up_proj", # phi3
+ "model.layers.{bid}.mlp.gate_up_proj", # phi3 glm-4-0414
"model.layers.layers.{bid}.mlp.up_proj", # plamo
"model.layers.{bid}.feed_forward.w3", # internlm2
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 264f8c5b98ffa..0aa87b1992d85 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -54,6 +54,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_DEEPSEEK, "deepseek" },
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
{ LLM_ARCH_CHATGLM, "chatglm" },
+ { LLM_ARCH_GLM4, "glm4" },
{ LLM_ARCH_BITNET, "bitnet" },
{ LLM_ARCH_T5, "t5" },
{ LLM_ARCH_T5ENCODER, "t5encoder" },
@@ -1152,6 +1153,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
},
},
+ {
+ LLM_ARCH_GLM4,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attn_norm" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_mlp_norm" },
+ },
+ },
{
LLM_ARCH_BITNET,
{
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 2019352812d5c..2c2099b3c3851 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -58,6 +58,7 @@ enum llm_arch {
LLM_ARCH_DEEPSEEK,
LLM_ARCH_DEEPSEEK2,
LLM_ARCH_CHATGLM,
+ LLM_ARCH_GLM4,
LLM_ARCH_BITNET,
LLM_ARCH_T5,
LLM_ARCH_T5ENCODER,
@@ -256,6 +257,8 @@ enum llm_tensor {
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_LAYER_OUT_NORM,
+ LLM_TENSOR_POST_ATTN_NORM,
+ LLM_TENSOR_POST_MLP_NORM,
LLM_TENSOR_SSM_IN,
LLM_TENSOR_SSM_CONV1D,
LLM_TENSOR_SSM_X,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 9e4166a71c641..b9235e389048a 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1205,6 +1205,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_GLM4:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 40: type = LLM_TYPE_9B; break;
+ case 61: type = LLM_TYPE_32B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_BITNET:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -3476,6 +3485,45 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
}
} break;
+ case LLM_ARCH_GLM4:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+ if (layer.wqkv == nullptr) {
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ }
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
+
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
+ } break;
case LLM_ARCH_NEMOTRON:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -10854,6 +10902,157 @@ struct llm_build_chatglm : public llm_graph_context {
}
};
+struct llm_build_glm4 : public llm_graph_context {
+ llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // Pre-attention norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = nullptr;
+ ggml_tensor * Kcur = nullptr;
+ ggml_tensor * Vcur = nullptr;
+
+ if (model.layers[il].wqkv == nullptr) {
+ Qcur = build_lora_mm(model.layers[il].wq, cur);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ }
+ Kcur = build_lora_mm(model.layers[il].wk, cur);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ }
+ Vcur = build_lora_mm(model.layers[il].wv, cur);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ }
+ } else {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+ if (model.layers[il].bqkv) {
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // Post-attention norm (new!)
+ cur = build_norm(cur,
+ model.layers[il].attn_post_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "post_attn_norm", il);
+
+ // Add the input (residual connection after post-attention norm)
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FF
+ {
+ // Pre-MLP norm
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // MLP
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ // Post-MLP norm
+ cur = build_norm(cur,
+ model.layers[il].ffn_post_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "post_mlp_norm", il);
+ }
+
+ // Add residual connection after post-MLP norm
+ inpL = ggml_add(ctx0, cur, ffn_inp);
+ cb(inpL, "l_out", il);
+ }
+
+ // Final norm
+ cur = build_norm(inpL,
+ model.output_norm,
+ NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // Output projection
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+};
+
struct llm_build_nemotron : public llm_graph_context {
llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -12735,6 +12934,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
} break;
+ case LLM_ARCH_GLM4:
+ {
+ llm = std::make_unique<llm_build_glm4>(*this, params, gf);
+ } break;
case LLM_ARCH_BITNET:
{
llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
@@ -12932,6 +13135,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_PLM:
case LLM_ARCH_CHATGLM:
+ case LLM_ARCH_GLM4:
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 0feabd95aaf2b..464ff01e06fe1 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1572,6 +1572,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
clean_spaces = false;
} else if (
+ tokenizer_pre == "glm4" ||
tokenizer_pre == "chatglm-bpe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
special_bos_id = LLAMA_TOKEN_NULL;
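
The `chkhsh` value that patch 1 matches against "glm4" is a SHA-256 over the token IDs the Hugging Face tokenizer produces for a fixed probe string. A minimal sketch of that recipe, assuming the `transformers` package is installed and the THUDM/glm-4-9b-hf repo is reachable; `chktxt` is a placeholder for the long probe string shipped in convert_hf_to_gguf_update.py and is not reproduced here:

    from hashlib import sha256
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-hf")
    chktxt = "..."  # placeholder for the probe text defined in convert_hf_to_gguf_update.py
    # same recipe as convert_hf_to_gguf.py: hash the stringified token IDs of the probe text
    chkhsh = sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()
    print(chkhsh)  # with the real probe text this is the hash matched against "glm4" above
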
From 2190494a00b640b272abd4da06d2249415a7c44a Mon Sep 17 00:00:00 2001
From: zRzRzRzRzRzRzR <2448370773@qq.com>
Date: Thu, 10 Apr 2025 17:27:54 +0800
Subject: [PATCH 2/7] use original one
---
examples/server/README.md | 76 +++++++++++++++++++--------------------
1 file changed, 38 insertions(+), 38 deletions(-)
diff --git a/examples/server/README.md b/examples/server/README.md
index 22aa4dc700648..a2a0903261e31 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -132,44 +132,44 @@ The project is under active development, and we are [looking for feedback and co
**Example-specific params**
-| Argument | Explanation |
-| -------- |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
-| `-sp, --special` | special tokens output enabled (default: false) |
-| `--no-warmup` | skip warming up the model with an empty run |
-| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
-| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
-| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
-| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
-| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
-| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
-| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
-| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
-| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
-| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
-| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
-| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
-| `--api-key-file FNAME` | path to file containing API keys (default: none) |
-| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
-| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
-| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
-| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
-| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
-| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
-| `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
-| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
-| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
-| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4,glm-4-0414,chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/>|
-| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
-| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
-| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
-| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
-| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
-| `-devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
-| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
-| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
+| Argument | Explanation |
+| -------- | ----------- |
+| `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
+| `-sp, --special` | special tokens output enabled (default: false) |
+| `--no-warmup` | skip warming up the model with an empty run |
+| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
+| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
+| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
+| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
+| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
+| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
+| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
+| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
+| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
+| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
+| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
+| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
+| `--api-key-file FNAME` | path to file containing API keys (default: none) |
+| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
+| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
+| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
+| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
+| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
+| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
+| `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
+| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
+| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
+| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/>|
+| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
+| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
+| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
+| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
+| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
+| `-devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
+| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
+| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
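
As a rough illustration of what `Glm4Model.set_gguf_parameters` from patch 1 emits when a checkpoint uses YaRN rope scaling, here is a small sketch against the gguf-py writer API. The `hparams` dict and the output filename are made up for illustration, and nothing is flushed to disk because the writer's write_*_to_file methods are never called:

    import gguf

    # hypothetical excerpt of a GLM-4-0414 config.json ("rope_scaling" key)
    hparams = {
        "rope_scaling": {"type": "yarn", "factor": 4.0, "original_max_position_embeddings": 32768},
    }

    writer = gguf.GGUFWriter("glm4-demo.gguf", "glm4")
    rope = hparams.get("rope_scaling") or {}
    if rope.get("type") == "yarn" and "factor" in rope:
        # mirrors the three keys written by the converter in patch 1
        writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
        writer.add_rope_scaling_factor(rope["factor"])
        writer.add_rope_scaling_orig_ctx_len(rope["original_max_position_embeddings"])
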
From 9f8a7765e941baf5bc21b87c7be57a2ef31ab5d6 Mon Sep 17 00:00:00 2001
From: zRzRzRzRzRzRzR <2448370773@qq.com>
Date: Thu, 10 Apr 2025 17:37:44 +0800
Subject: [PATCH 3/7] Using with tensor map
---
convert_hf_to_gguf.py | 21 ---------------------
gguf-py/gguf/constants.py | 2 ++
gguf-py/gguf/tensor_mapping.py | 4 +++-
3 files changed, 5 insertions(+), 22 deletions(-)
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 1fa2c92c69072..eea1c83b7c913 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4900,27 +4900,6 @@ def set_gguf_parameters(self):
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
- def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[Tuple[str, Tensor]]:
- if "gate_up_proj" in name:
- match = re.match(r"model\.layers\.(\d+)\.gate_up_proj\.weight", name)
- if match:
- bid = int(match.group(1))
- return [(f"blk.{bid}.ffn_up.weight", data_torch)]
-
- if "post_self_attn_layernorm" in name:
- match = re.match(r"model\.layers\.(\d+)\.post_self_attn_layernorm\.weight", name)
- if match:
- bid = int(match.group(1))
- return [(f"blk.{bid}.post_attn_norm.weight", data_torch)]
-
- if "post_mlp_layernorm" in name:
- match = re.match(r"model\.layers\.(\d+)\.post_mlp_layernorm\.weight", name)
- if match:
- bid = int(match.group(1))
- return [(f"blk.{bid}.post_mlp_norm.weight", data_torch)]
-
- return super().modify_tensors(data_torch, name, bid)
-
@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
class ChatGLMModel(Model):
model_arch = gguf.MODEL_ARCH.CHATGLM
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 49c2365e1d731..162070e6e193a 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -1577,6 +1577,8 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
+ MODEL_TENSOR.ATTN_POST_NORM,
+ MODEL_TENSOR.FFN_POST_NORM,
],
MODEL_ARCH.BITNET: [
MODEL_TENSOR.ATTN_Q,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 391b019278256..463db77ca01c6 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -233,7 +233,8 @@ class TensorNameMap:
),
MODEL_TENSOR.ATTN_POST_NORM: (
- "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2
+ "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge
+ "model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414
),
# Rotary embeddings
@@ -269,6 +270,7 @@ class TensorNameMap:
# Post feed-forward norm
MODEL_TENSOR.FFN_POST_NORM: (
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
+ "model.layers.{bid}.post_mlp_layernorm", # glm-4-0414
),
MODEL_TENSOR.FFN_GATE_INP: (
From 0458ff8da03e01aafb0aa1875600e53368ea9d77 Mon Sep 17 00:00:00 2001
From: zRzRzRzRzRzRzR <2448370773@qq.com>
Date: Thu, 10 Apr 2025 17:43:01 +0800
Subject: [PATCH 4/7] fix bug
---
src/llama-arch.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 0aa87b1992d85..a6fddc7fd2e54 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -1168,8 +1168,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
- { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attn_norm" },
- { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_mlp_norm" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
},
},
{
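
The rename in patch 4 lines the C++ LLM_TENSOR_NAMES up with what gguf-py already emits for these tensors: once patch 3 routes the checkpoints through `TensorNameMap`, the HF post-norm weights come out as `blk.N.post_attention_norm` / `blk.N.post_ffw_norm` (the same names gemma2 uses). A quick sketch, assuming the gguf-py tree from this series is importable:

    import gguf

    tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.GLM4, 40)  # 40 layers for the 9B variant
    for hf_name in (
        "model.layers.0.post_self_attn_layernorm.weight",
        "model.layers.0.post_mlp_layernorm.weight",
        "model.layers.0.mlp.gate_up_proj.weight",
    ):
        print(hf_name, "->", tmap.get_name(hf_name, try_suffixes=(".weight", ".bias")))
    # expected: blk.0.post_attention_norm.weight, blk.0.post_ffw_norm.weight, blk.0.ffn_up.weight
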
From 10f43c512440faff1244a4d6de6bcd14b71398e5 Mon Sep 17 00:00:00 2001
From: zRzRzRzRzRzRzR <2448370773@qq.com>
Date: Thu, 10 Apr 2025 21:12:23 +0800
Subject: [PATCH 5/7] change order
---
convert_hf_to_gguf_update.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 492767ce15fda..e41c6a01f99ec 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -81,7 +81,6 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
- {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
{"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
@@ -115,6 +114,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
{"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
+ {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
]
From 24708828a9c25bd65eb6916dd2f527c59987bcc5 Mon Sep 17 00:00:00 2001
From: zRzRzRzRzRzRzR <2448370773@qq.com>
Date: Thu, 10 Apr 2025 21:12:43 +0800
Subject: [PATCH 6/7] change order
---
convert_hf_to_gguf_update.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index e41c6a01f99ec..160c9fe0e616a 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -114,7 +114,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
{"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
- {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
+ {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
]
From 6f2b5b4abae387a089c9bc86b521fbcd39d6cf13 Mon Sep 17 00:00:00 2001
From: zRzRzRzRzRzRzR <2448370773@qq.com>
Date: Fri, 11 Apr 2025 11:29:42 +0800
Subject: [PATCH 7/7] format with flake8
---
convert_hf_to_gguf.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index eea1c83b7c913..9ec7338a7c3a6 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4885,6 +4885,7 @@ def prepare_tensors(self):
super().prepare_tensors()
self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
+
@Model.register("Glm4ForCausalLM")
class Glm4Model(Model):
model_arch = gguf.MODEL_ARCH.GLM4
@@ -4900,6 +4901,7 @@ def set_gguf_parameters(self):
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+
@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
class ChatGLMModel(Model):
model_arch = gguf.MODEL_ARCH.CHATGLM
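
A hypothetical sanity check after running the converter on a GLM-4-0414 checkpoint (the file name below is a placeholder): read the result back with gguf-py and confirm the architecture string plus the two extra per-layer norms wired up by this series.

    import gguf

    reader = gguf.GGUFReader("glm-4-0414-f16.gguf")
    arch = bytes(reader.fields["general.architecture"].parts[-1]).decode("utf-8")
    print(arch)  # expected: glm4
    names = {t.name for t in reader.tensors}
    print("blk.0.post_attention_norm.weight" in names)  # expected: True
    print("blk.0.post_ffw_norm.weight" in names)        # expected: True
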