115 changes: 83 additions & 32 deletions actions.hpp
@@ -140,7 +140,7 @@ json dump_metadata(app_t &app)
continue;
if (res > buf.size())
{
buf.resize(res);
buf.resize(res + 1);
res = llama_model_meta_val_str_by_index(app.model, i, buf.data(), buf.size());
}
val = std::string(buf.data(), res);
@@ -149,7 +149,7 @@ json dump_metadata(app_t &app)
continue;
if (res > buf.size())
{
buf.resize(res);
buf.resize(res + 1);
res = llama_model_meta_key_by_index(app.model, i, buf.data(), buf.size());
}
key = std::string(buf.data(), res);
@@ -250,8 +250,10 @@ json action_load(app_t &app, json &body)
}
int n_vocab = llama_vocab_n_tokens(app.vocab);
llama_tokens list_tokens_eog;
for (int i = 0; i < n_vocab; i++) {
if (llama_vocab_is_eog(app.vocab, i)) {
for (int i = 0; i < n_vocab; i++)
{
if (llama_vocab_is_eog(app.vocab, i))
{
list_tokens_eog.push_back(i);
}
}
@@ -595,34 +597,6 @@ json action_embeddings(app_t &app, json &body)
};
}

// apply chat template
json action_chat_format(app_t &app, json &body)
{
std::string tmpl = body.contains("tmpl") ? body["tmpl"] : "";
bool add_ass = body.contains("add_ass") ? body.at("add_ass").get<bool>() : false;
if (!body.contains("messages"))
{
return json{{"error", "messages is required"}};
}
std::vector<common_chat_msg> chat;
for (auto &item : body["messages"])
{
chat.push_back({item["role"], item["content"]});
}
try
{
std::string formatted_chat = common_chat_apply_template(app.model, tmpl, chat, add_ass);
return json{
{"success", true},
{"formatted_chat", formatted_chat},
};
}
catch (const std::exception &e)
{
return json{{"error", e.what()}};
}
}

// remove tokens in kv, for context-shifting
json action_kv_remove(app_t &app, json &body)
{
@@ -709,3 +683,80 @@ json action_current_status(app_t &app, json &body)
{"tokens", app.tokens},
};
}

//////////////////////////////////////////

// because we can't support jinja for now, we temporarily use an old version of common_chat_apply_template
// TODO: support jinja
std::string common_chat_apply_template_old(const struct llama_model *model,
const std::string &tmpl,
const std::vector<common_chat_msg> &msgs,
bool add_ass)
{
int alloc_size = 0;
bool fallback = false; // indicates whether we must fall back to the default chatml template
std::vector<llama_chat_message> chat;
for (const auto &msg : msgs)
{
chat.push_back({msg.role.c_str(), msg.content.c_str()});
alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
}

const char *ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model, nullptr) : tmpl.c_str();
std::vector<char> buf(alloc_size);

// run the first time to get the total output length
int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());

// error: chat template is not supported
if (res < 0)
{
if (ptr_tmpl != nullptr)
{
throw std::runtime_error("this custom template is not supported");
}
// If the built-in template is not supported, we default to chatml
res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
fallback = true;
}

// if it turns out that our buffer is too small, we resize it
if ((size_t)res > buf.size())
{
buf.resize(res);
res = llama_chat_apply_template(
fallback ? "chatml" : ptr_tmpl,
chat.data(), chat.size(), add_ass, buf.data(), buf.size());
}

std::string formatted_chat(buf.data(), res);
return formatted_chat;
}

// apply chat template
json action_chat_format(app_t &app, json &body)
{
std::string tmpl = body.contains("tmpl") ? body["tmpl"] : "";
bool add_ass = body.contains("add_ass") ? body.at("add_ass").get<bool>() : false;
if (!body.contains("messages"))
{
return json{{"error", "messages is required"}};
}
std::vector<common_chat_msg> chat;
for (auto &item : body["messages"])
{
chat.push_back({item["role"], item["content"]});
}
try
{
std::string formatted_chat = common_chat_apply_template_old(app.model, tmpl, chat, add_ass);
return json{
{"success", true},
{"formatted_chat", formatted_chat},
};
}
catch (const std::exception &e)
{
return json{{"error", e.what()}};
}
}
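
For reference, here is a minimal TypeScript sketch of the JSON contract that the new action_chat_format handler expects and returns. The field names (tmpl, add_ass, messages[].role, messages[].content, success, formatted_chat, error) are taken from the C++ handler above; the interface names and the example object are hypothetical.

// Hypothetical client-side types mirroring the fields used by action_chat_format.
interface ChatFormatMessage {
  role: string; // e.g. 'system' | 'user' | 'assistant'
  content: string;
}
interface ChatFormatRequest {
  messages: ChatFormatMessage[]; // required; the handler returns an error if missing
  tmpl?: string;                 // optional custom template; empty/absent means use the model's built-in template
  add_ass?: boolean;             // whether to append the assistant prefix (defaults to false)
}
type ChatFormatResponse =
  | { success: true; formatted_chat: string }
  | { error: string };

// Example request body a caller might send to this action:
const body: ChatFormatRequest = {
  messages: [
    { role: 'system', content: 'You are a helpful assistant.' },
    { role: 'user', content: 'Hello!' },
  ],
  add_ass: true,
};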
14 changes: 10 additions & 4 deletions examples/main/src/components/ChatScreen.tsx
@@ -67,10 +67,16 @@ export default function ChatScreen() {
if (!loadedModel) {
throw new Error('loadedModel is null');
}
const formattedChat = await formatChat(getWllamaInstance(), [
...currHistory,
userMsg,
]);
let formattedChat: string;
try {
formattedChat = await formatChat(getWllamaInstance(), [
...currHistory,
userMsg,
]);
} catch (e) {
alert(`Error while formatting chat: ${(e as any)?.message ?? 'unknown'}`);
throw e;
}
console.log({ formattedChat });
await createCompletion(formattedChat, (newContent) => {
editMessageInConversation(convId, assistantMsg.id, newContent);
8 changes: 8 additions & 0 deletions examples/main/src/config.ts
@@ -18,10 +18,18 @@ export const LIST_MODELS = [
url: 'https://huggingface.co/ngxson/SmolLM2-360M-Instruct-Q8_0-GGUF/resolve/main/smollm2-360m-instruct-q8_0.gguf',
size: 386404992,
},
{
url: 'https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q8_0.gguf',
size: 675710816,
},
{
url: 'https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/resolve/main/llama-3.2-1b-instruct-q4_k_m.gguf',
size: 807690656,
},
{
url: 'https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf',
size: 924456032,
},
{
url: 'https://huggingface.co/ngxson/wllama-split-models/resolve/main/qwen2-1_5b-instruct-q4_k_m-00001-of-00004.gguf',
size: 986046272,
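
The size field in each entry is the size of the hosted GGUF file in bytes. As a quick sanity check, here is a small sketch (assuming the Hugging Face host answers HEAD requests with a Content-Length header; the helper name is hypothetical) of how these hard-coded numbers could be verified against the remote files:

// Sketch: compare a hard-coded size with the remote file's Content-Length.
async function checkModelSize(url: string, expectedSize: number): Promise<boolean> {
  const res = await fetch(url, { method: 'HEAD' });
  const len = Number(res.headers.get('content-length') ?? -1);
  return len === expectedSize;
}

// Usage: validate the DeepSeek-R1 distill entry added above.
checkModelSize(
  'https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf',
  924456032
).then((ok) => console.log('size matches:', ok));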
2 changes: 1 addition & 1 deletion examples/main/src/utils/types.ts
@@ -29,7 +29,7 @@ export interface InferenceParams {
export interface Message {
id: number;
content: string;
role: 'user' | 'assistant';
role: 'system' | 'user' | 'assistant';
}

export interface Conversation {
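
With 'system' added to the role union, a chat history can now carry a system prompt through the same Message type. A small sketch (the import path is an assumption; the type itself is the one defined above):

import { Message } from './utils/types'; // path relative to examples/main/src, as an assumption

const history: Message[] = [
  { id: 1, role: 'system', content: 'You are a concise assistant.' },
  { id: 2, role: 'user', content: 'Summarize what wllama does.' },
];
// This array now type-checks and can be passed to formatChat() just like
// the user/assistant-only histories before this change.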
23 changes: 20 additions & 3 deletions examples/main/src/utils/utils.ts
@@ -38,9 +38,26 @@ export const formatChat = async (
modelWllama: Wllama,
messages: Message[]
): Promise<string> => {
const template = new Template(
modelWllama.getChatTemplate() ?? DEFAULT_CHAT_TEMPLATE
);
const templateStr = modelWllama.getChatTemplate() ?? DEFAULT_CHAT_TEMPLATE;
// dirty patch for DeepSeek model (crash on @huggingface/jinja)
const isDeepSeekR1 =
templateStr.match(/<｜Assistant｜>/) &&
templateStr.match(/<｜User｜>/) &&
templateStr.match(/<\/think>/);
if (isDeepSeekR1) {
let result = '';
for (const message of messages) {
if (message.role === 'system') {
result += `${message.content}\n\n`;
} else if (message.role === 'user') {
result += `<｜User｜>${message.content}`;
} else {
result += `<｜Assistant｜>${message.content.split('</think>').pop()}<｜end▁of▁sentence｜>`;
}
}
return result + '<｜Assistant｜>';
}
const template = new Template(templateStr);
const bos_token: string = textDecoder.decode(
await modelWllama.detokenize([modelWllama.getBOS()])
);
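
To make the DeepSeek-R1 branch above concrete, here is a self-contained sketch of the same manual formatting applied to a short conversation, with the expected output shown in a comment. The special-token strings are copied from the patch; the standalone function name is hypothetical.

type Role = 'system' | 'user' | 'assistant';
interface Msg { role: Role; content: string }

// Same logic as the isDeepSeekR1 branch of formatChat, extracted for illustration.
function formatDeepSeekR1(messages: Msg[]): string {
  let result = '';
  for (const message of messages) {
    if (message.role === 'system') {
      result += `${message.content}\n\n`;
    } else if (message.role === 'user') {
      result += `<｜User｜>${message.content}`;
    } else {
      // keep only the text after the reasoning block
      result += `<｜Assistant｜>${message.content.split('</think>').pop()}<｜end▁of▁sentence｜>`;
    }
  }
  return result + '<｜Assistant｜>';
}

// formatDeepSeekR1([
//   { role: 'system', content: 'Be brief.' },
//   { role: 'user', content: 'Hi' },
// ])
// => 'Be brief.\n\n<｜User｜>Hi<｜Assistant｜>'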
2 changes: 1 addition & 1 deletion llama.cpp
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "@wllama/wllama",
"version": "2.1.2",
"version": "2.1.3",
"description": "WebAssembly binding for llama.cpp - Enabling on-browser LLM inference",
"main": "index.js",
"type": "module",
2 changes: 1 addition & 1 deletion scripts/docker-compose.yml
@@ -19,7 +19,7 @@ services:
mkdir -p wasm/single-thread
cd wasm/single-thread

export SHARED_EMCC_CFLAGS="--no-entry -O3 -msimd128 -fno-rtti -DNDEBUG -flto=full -fwasm-exceptions -sEXPORT_ALL=1 -sEXPORT_ES6=0 -sMODULARIZE=0 -sINITIAL_MEMORY=128MB -sMAXIMUM_MEMORY=4096MB -sALLOW_MEMORY_GROWTH=1 -sFORCE_FILESYSTEM=1 -sEXPORTED_FUNCTIONS=_main,_wllama_start,_wllama_action,_wllama_exit,_wllama_debug -sEXPORTED_RUNTIME_METHODS=ccall,cwrap -sNO_EXIT_RUNTIME=1"
export SHARED_EMCC_CFLAGS="--no-entry -O3 -msimd128 -fno-rtti -DNDEBUG -flto=full -frtti -fwasm-exceptions -sEXPORT_ALL=1 -sEXPORT_ES6=0 -sMODULARIZE=0 -sINITIAL_MEMORY=128MB -sMAXIMUM_MEMORY=4096MB -sALLOW_MEMORY_GROWTH=1 -sFORCE_FILESYSTEM=1 -sEXPORTED_FUNCTIONS=_main,_wllama_start,_wllama_action,_wllama_exit,_wllama_debug -sEXPORTED_RUNTIME_METHODS=ccall,cwrap -sNO_EXIT_RUNTIME=1"

# emcc --clear-cache

2 changes: 1 addition & 1 deletion src/multi-thread/wllama.js

Large diffs are not rendered by default.

Binary file modified src/multi-thread/wllama.wasm
Binary file not shown.
2 changes: 1 addition & 1 deletion src/single-thread/wllama.js

Large diffs are not rendered by default.

Binary file modified src/single-thread/wllama.wasm
Binary file not shown.
4 changes: 2 additions & 2 deletions src/wasm-from-cdn.ts
@@ -2,8 +2,8 @@
// Do not edit this file directly

const WasmFromCDN = {
'single-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.1.2/src/single-thread/wllama.wasm',
'multi-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.1.2/src/multi-thread/wllama.wasm',
'single-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.1.3/src/single-thread/wllama.wasm',
'multi-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.1.3/src/multi-thread/wllama.wasm',
};

export default WasmFromCDN;
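
A usage sketch for this map, following the pattern from the project README (the import paths are assumptions and may differ depending on how the package is bundled):

import { Wllama } from '@wllama/wllama';
import WasmFromCDN from '@wllama/wllama/esm/wasm-from-cdn.js';

// The constructor takes a map of wasm asset names to URLs; WasmFromCDN
// points both builds at the published 2.1.3 files on jsDelivr.
const wllama = new Wllama(WasmFromCDN);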
4 changes: 2 additions & 2 deletions src/workers-code/generated.ts

Large diffs are not rendered by default.
