36 commits
64add82
First attempt
pwilkin Sep 7, 2025
86cfc18
No permute during convert (fixes qk tensors), proper norm application.
pwilkin Sep 8, 2025
8c762a3
RoPE = NeoX
pwilkin Sep 8, 2025
ffdfd1d
Coherence!
pwilkin Sep 8, 2025
74dcf89
Merge branch 'ggml-org:master' into apertus-implementation
pwilkin Sep 9, 2025
ab11d94
Migrate xielu params from tensors to hyperparameters
pwilkin Sep 9, 2025
1b18472
Simple CUDA kernel
pwilkin Sep 13, 2025
1606a3c
Revert stupid LLM refactorings
pwilkin Sep 14, 2025
eec384f
Chat template support
pwilkin Sep 14, 2025
b2a92d0
configchecker / flake8 errors
pwilkin Sep 14, 2025
ef9ef66
Reorder unary.cu
pwilkin Sep 15, 2025
d009194
I do conclude that LLMs are, in fact, stupid.
pwilkin Sep 15, 2025
73bd64f
Merge branch 'master' into apertus-implementation
pwilkin Sep 16, 2025
28f9086
Fix after merge
pwilkin Sep 16, 2025
2f68c03
Final newline
pwilkin Sep 16, 2025
4294dbf
Make xIELU an UNARY_OP
pwilkin Sep 17, 2025
b2aa4fb
Final newline
pwilkin Sep 17, 2025
86a239c
Correctly account for parameter shift
pwilkin Sep 17, 2025
bd19026
Argh.
pwilkin Sep 17, 2025
13cc3be
Update ggml/src/ggml-cpu/unary-ops.cpp
pwilkin Sep 18, 2025
40f2f80
Refactor: remove unused methods, inline and factorize softplus, add c…
pwilkin Sep 18, 2025
dbe0ccb
Merge branch 'master' into apertus-implementation
pwilkin Sep 23, 2025
0d36139
Revert CUDA changes, implement xIELU as a separate OP
pwilkin Sep 23, 2025
fc58d3b
Pesky newline
pwilkin Sep 23, 2025
db9eb29
Add float2half / half2float for F16 inputs/outputs
pwilkin Sep 23, 2025
dc1e4d5
CUDA variants, attempt 2
pwilkin Sep 24, 2025
6c843ce
Actually, attempt 3
pwilkin Sep 24, 2025
f11ab3c
Update ggml/src/ggml-cuda/unary.cu
pwilkin Sep 25, 2025
57e2263
Missing convert header
pwilkin Sep 25, 2025
62401d8
Proper formula and reference for xIELU in the comments.
pwilkin Sep 25, 2025
58e6e0f
Modify unary-ops.cpp to add the functor-based logic besides the templ…
pwilkin Sep 25, 2025
5a02bd4
Apply suggestions from code review
pwilkin Sep 25, 2025
90f052d
Add tensor mappings for Apertus to global list instead
pwilkin Sep 25, 2025
6972404
Fix lazy on scalars
pwilkin Sep 25, 2025
d29cb45
Merge remote-tracking branch 'pwilkin/master' into apertus-implementa…
pwilkin Sep 25, 2025
2ca353b
Update ggml/src/ggml-cuda/unary.cu
pwilkin Sep 25, 2025
29 changes: 29 additions & 0 deletions common/chat-parser.cpp
@@ -75,6 +75,35 @@ bool common_chat_msg_parser::add_tool_calls(const json & arr) {
}
return true;
}

bool common_chat_msg_parser::add_tool_call_short_form(const json & tool_call) {
if (!tool_call.is_object() || tool_call.size() != 1) {
return false;
}

// Get the tool name (the single key in the object)
auto it = tool_call.begin();
std::string name = it.key();

if (name.empty()) {
return false;
}

// Get the arguments (the nested object)
const json & args_json = it.value();
std::string arguments = "";

if (args_json.is_object()) {
arguments = args_json.dump();
} else if (args_json.is_string()) {
arguments = args_json;
} else if (!args_json.is_null()) {
// For other types, convert to string representation
arguments = args_json.dump();
}

return add_tool_call(name, "", arguments);
}
void common_chat_msg_parser::finish() {
if (!is_partial_ && pos_ != input_.size()) {
throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
3 changes: 3 additions & 0 deletions common/chat-parser.h
@@ -64,6 +64,9 @@ class common_chat_msg_parser {
// Adds an array of tool calls using their "name", "id" and "arguments" fields.
bool add_tool_calls(const nlohmann::ordered_json & arr);

// Adds a tool call using the short form: { "tool_name": { "arg1": val, "arg2": val } }
bool add_tool_call_short_form(const nlohmann::ordered_json & tool_call);

void finish();

bool consume_spaces();
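For reference, a minimal Python sketch of the mapping add_tool_call_short_form performs (the tool name and arguments here are hypothetical):

import json

# Hypothetical short-form tool call: a single-key object whose key is the
# tool name and whose value is the arguments object.
tool_call = {"get_weather": {"city": "Bern", "unit": "celsius"}}

name = next(iter(tool_call))             # -> "get_weather"
arguments = json.dumps(tool_call[name])  # -> '{"city": "Bern", "unit": "celsius"}'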
110 changes: 110 additions & 0 deletions common/chat.cpp
@@ -638,6 +638,7 @@ const char * common_chat_format_name(common_chat_format format) {
case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
default:
throw std::runtime_error("Unknown chat format");
}
@@ -801,6 +802,7 @@ static std::string apply(
}
tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
tmpl_inputs.extra_context = inputs.extra_context;
tmpl_inputs.extra_context["enable_thinking"] = inputs.enable_thinking;
if (additional_context) {
tmpl_inputs.extra_context.merge_patch(*additional_context);
}
@@ -1264,6 +1266,75 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
}
return data;
}

static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;

// Generate the prompt using the apply() function with the template
data.prompt = apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_APERTUS;

// Handle thinking tags appropriately based on inputs.enable_thinking
if (string_ends_with(data.prompt, "<|inner_prefix|>")) {
if (!inputs.enable_thinking) {
data.prompt += "<|inner_suffix|>";
} else {
data.thinking_forced_open = true;
}
}

// When tools are present, build grammar for the <|tools_prefix|> format
if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
data.grammar_lazy = true;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
auto schemas = json::array();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
schemas.push_back({
{ "type", "object" },
{ "properties",
{
{ function.at("name"), function.at("parameters") }
} },
{ "required", json::array({ function.at("name") }) },
});
});
auto schema = json{
{ "type", "array" },
{ "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
{ "minItems", 1 },
};
if (!inputs.parallel_tool_calls) {
schema["maxItems"] = 1;
}
builder.add_rule("root",
std::string(data.thinking_forced_open ? "( \"<|inner_suffix|>\" space )? " : "") +
"\"<|tools_prefix|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tools_suffix|>\"");
});
data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
// If thinking_forced_open, then we capture the <|inner_suffix|> tag in the grammar,
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
std::string(data.thinking_forced_open ?
"[\\s\\S]*?(<\\|inner_suffix\\|>\\s*)" :
"(?:<\\|inner_prefix\\|>[\\s\\S]*?<\\|inner_suffix\\|>\\s*)?") +
"(<\\|tools_prefix\\|>)[\\s\\S]*" });
data.preserved_tokens = {
"<|system_start|>",
"<|system_end|>",
"<|developer_start|>",
"<|developer_end|>",
"<|user_start|>",
"<|user_end|>",
"<|assistant_start|>",
"<|assistant_end|>",
"<|inner_prefix|>",
"<|inner_suffix|>",
"<|tools_prefix|>",
"<|tools_suffix|>",
};
}
return data;
}
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
@@ -2304,6 +2375,37 @@ static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
builder.add_content(builder.consume_rest());
}

static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
// Parse thinking tags
builder.try_parse_reasoning("<|inner_prefix|>", "<|inner_suffix|>");
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}

// Look for tool calls
static const common_regex tool_call_regex(regex_escape("<|tools_prefix|>"));
if (auto res = builder.try_find_regex(tool_call_regex)) {
builder.move_to(res->groups[0].end);

auto tool_calls_data = builder.consume_json();
if (tool_calls_data.json.is_array()) {
builder.consume_spaces();
if (!builder.try_consume_literal("<|tools_suffix|>")) {
throw common_chat_msg_partial_exception("Incomplete tool call");
}
for (const auto & value : tool_calls_data.json) {
if (value.is_object()) {
builder.add_tool_call_short_form(value);
}
}
} else {
throw common_chat_msg_partial_exception("Incomplete tool call");
}
}
builder.add_content(builder.consume_rest());
}

static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
// Parse thinking tags first - this handles the main reasoning content
builder.try_parse_reasoning("<seed:think>", "</seed:think>");
@@ -2548,6 +2650,11 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_nemotron_v2(tmpl, params);
}

// Apertus format detection
if (src.find("<|system_start|>") != std::string::npos && src.find("<|tools_prefix|>") != std::string::npos) {
return common_chat_params_init_apertus(tmpl, params);
}

// Use generic handler when mixing tools + JSON schema.
// TODO: support that mix in handlers below.
if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2715,6 +2822,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
case COMMON_CHAT_FORMAT_NEMOTRON_V2:
common_chat_parse_nemotron_v2(builder);
break;
case COMMON_CHAT_FORMAT_APERTUS:
common_chat_parse_apertus(builder);
break;
default:
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
}
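To make the Apertus grammar and parser above concrete, here is a hypothetical assistant turn (tool name and arguments invented for illustration), sketched as a Python string:

# Optional reasoning between <|inner_prefix|> and <|inner_suffix|>, followed by a
# JSON array of short-form tool calls between <|tools_prefix|> and <|tools_suffix|>,
# which is what common_chat_parse_apertus expects to find.
sample = (
    "<|inner_prefix|>Need the forecast before answering.<|inner_suffix|>"
    '<|tools_prefix|>[{"get_weather": {"city": "Bern"}}]<|tools_suffix|>'
)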
1 change: 1 addition & 0 deletions common/chat.h
@@ -114,6 +114,7 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_GPT_OSS,
COMMON_CHAT_FORMAT_SEED_OSS,
COMMON_CHAT_FORMAT_NEMOTRON_V2,
COMMON_CHAT_FORMAT_APERTUS,

COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
39 changes: 38 additions & 1 deletion convert_hf_to_gguf.py
@@ -8940,6 +8940,43 @@ def prepare_tensors(self):
raise ValueError(f"Unprocessed experts: {experts}")


@ModelBase.register("ApertusForCausalLM")
class ApertusModel(LlamaModel):
model_arch = gguf.MODEL_ARCH.APERTUS
undo_permute = False

_alpha_n = {}
_alpha_p = {}
_beta = {}
_eps = {}

def modify_tensors(self, data_torch, name, bid):
# Handle xIELU activation parameters
n_layers = self.hparams["num_hidden_layers"]
if name.endswith(".act_fn.alpha_n"):
self._alpha_n[bid] = data_torch.to("cpu").float().item()
if (len(self._alpha_n) == n_layers):
self.gguf_writer.add_xielu_alpha_n([self._alpha_n[k] for k in sorted(self._alpha_n)])
return []
if name.endswith(".act_fn.alpha_p"):
self._alpha_p[bid] = data_torch.to("cpu").float().item()
if (len(self._alpha_p) == n_layers):
self.gguf_writer.add_xielu_alpha_p([self._alpha_p[k] for k in sorted(self._alpha_p)])
return []
if name.endswith(".act_fn.beta"):
self._beta[bid] = data_torch.to("cpu").float().item()
if (len(self._beta) == n_layers):
self.gguf_writer.add_xielu_beta([self._beta[k] for k in sorted(self._beta)])
return []
if name.endswith(".act_fn.eps"):
self._eps[bid] = data_torch.to("cpu").float().item()
if (len(self._eps) == n_layers):
self.gguf_writer.add_xielu_eps([self._eps[k] for k in sorted(self._eps)])
return []

return super().modify_tensors(data_torch, name, bid)


class MistralModel(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA
model_name = "Mistral"
@@ -9107,7 +9144,7 @@ def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -
def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
dtype = cls._dtype_str_map[st_slice.get_dtype()]
shape: tuple[int, ...] = tuple(st_slice.get_shape())
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[...] if len(s.get_shape()) == 0 else s[:])
Collaborator:
Why add this now, didn't you work around this already?
Probably belongs in a separate PR.

Collaborator (Author):
No, it turns out the workaround that I had works if you use --remote, but not if you convert from a local repo.

Collaborator (Author):
It's a bugfix and I think it's too tiny for a separate PR anyway.

Collaborator:
No such thing. :) It's better for visibility to have separate PRs, but since it's an issue specifically for this model it's probably ok to have it here.
return cast(torch.Tensor, lazy)

@classmethod
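A short sketch of the scalar case that motivated the s[...] fallback above (assuming the usual safetensors slice behavior; the xIELU alpha/beta/eps entries are 0-dimensional tensors):

# A 0-dimensional safetensors slice has an empty shape and cannot be read with
# [:], so the lazy loader falls back to [...] for scalars.
def materialize(st_slice):
    if len(st_slice.get_shape()) == 0:
        return st_slice[...]   # scalar tensor
    return st_slice[:]         # everything else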
12 changes: 12 additions & 0 deletions ggml/include/ggml.h
@@ -574,6 +574,7 @@ extern "C" {
GGML_UNARY_OP_HARDSIGMOID,
GGML_UNARY_OP_EXP,
GGML_UNARY_OP_GELU_ERF,
GGML_UNARY_OP_XIELU,

GGML_UNARY_OP_COUNT,
};
@@ -1148,6 +1149,17 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);

// xIELU activation function
// https://arxiv.org/abs/2411.13010
// x = (x > 0) ? a_p * x^2 + b * x : (expm1(min(x, eps)) - x) * a_n + b * x
GGML_API struct ggml_tensor * ggml_xielu(
struct ggml_context * ctx,
struct ggml_tensor * a,
float alpha_n,
float alpha_p,
float beta,
float eps);

// gated linear unit ops
// A: n columns, r rows,
// result is n / 2 columns, r rows,
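A minimal NumPy sketch of the xIELU formula from the comment above, for reference only (not the ggml kernel):

import numpy as np

def xielu(x, alpha_n, alpha_p, beta, eps):
    # x > 0:  alpha_p * x^2 + beta * x
    # x <= 0: alpha_n * (expm1(min(x, eps)) - x) + beta * x
    pos = alpha_p * x * x + beta * x
    neg = alpha_n * (np.expm1(np.minimum(x, eps)) - x) + beta * x
    return np.where(x > 0.0, pos, neg)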
1 change: 1 addition & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -2187,6 +2187,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_XIELU:
{
n_tasks = n_threads;
} break;
8 changes: 6 additions & 2 deletions ggml/src/ggml-cpu/ops.cpp
@@ -8637,7 +8637,7 @@ static void ggml_compute_forward_ssm_scan_f32(
// n_head
for (int h = ih0; h < ih1; ++h) {
// ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
const float dt_soft_plus = softplus(dt[h]);
const float dA = expf(dt_soft_plus * A[h]);
const int g = h / (nh / ng); // repeat_interleave

@@ -8734,7 +8734,7 @@ static void ggml_compute_forward_ssm_scan_f32(
// n_head
for (int h = ih0; h < ih1; ++h) {
// ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
const float dt_soft_plus = softplus(dt[h]);
const int g = h / (nh / ng); // repeat_interleave

// dim
@@ -8997,6 +8997,10 @@
{
ggml_compute_forward_exp(params, dst);
} break;
case GGML_UNARY_OP_XIELU:
{
ggml_compute_forward_xielu(params, dst);
} break;
default:
{
GGML_ABORT("fatal error");
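The softplus helper used above is a factorization of the inline expression it replaces; an equivalent Python sketch of the numerically-guarded form:

import math

def softplus(x: float) -> float:
    # log(1 + e^x), with a passthrough for large x to avoid overflow in exp()
    return math.log1p(math.exp(x)) if x <= 20.0 else x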