ngxson · ngxson · Jun 6, 2025 · Jun 6, 2025
diff --git a/cpp/actions.hpp b/cpp/actions.hpp
@@ -649,13 +649,14 @@ glue_msg_get_kv_remove_res action_kv_remove(app_t &app, const char *req_raw)
   PARSE_REQ(glue_msg_get_kv_remove_req);
   const int n_keep = req.n_keep.value;
   const int n_discard = req.n_discard.value;
+  auto * mem = llama_get_memory(app.ctx);
 
   if (n_discard > 0)
   {
     // TODO: this code branch is kinda broken, to be fixed later
     const int n_past = app.tokens.size();
-    llama_kv_self_seq_rm(app.ctx, 0, n_keep, n_keep + n_discard);
-    llama_kv_self_seq_add(app.ctx, 0, n_keep + n_discard, n_past, -n_discard);
+    llama_memory_seq_rm(mem, 0, n_keep, n_keep + n_discard);
+    llama_memory_seq_add(mem, 0, n_keep + n_discard, n_past, -n_discard);
     app.tokens.erase(
         app.tokens.begin() + n_keep,
         app.tokens.begin() + n_keep + n_discard);
@@ -664,11 +665,11 @@ glue_msg_get_kv_remove_res action_kv_remove(app_t &app, const char *req_raw)
   {
     if (n_keep == 0)
     {
-      llama_kv_self_clear(app.ctx);
+      llama_memory_clear(mem, true);
     }
     else
     {
-      llama_kv_self_seq_rm(app.ctx, 0, n_keep, -1);
+      llama_memory_seq_rm(mem, 0, n_keep, -1);
       app.tokens.erase(
           app.tokens.begin() + n_keep,
           app.tokens.end());
@@ -685,7 +686,8 @@ glue_msg_get_kv_remove_res action_kv_remove(app_t &app, const char *req_raw)
 glue_msg_get_kv_clear_res action_kv_clear(app_t &app, const char *req_raw)
 {
   PARSE_REQ(glue_msg_get_kv_clear_req);
-  llama_kv_self_clear(app.ctx);
+  auto * mem = llama_get_memory(app.ctx);
+  llama_memory_clear(mem, true);
   app.tokens.clear();
 
   glue_msg_get_kv_clear_res res;
@@ -766,7 +768,7 @@ glue_msg_test_benchmark_res action_test_benchmark(app_t &app, const char *req_ra
   std::string type = req.type.value;   // "pp" (prompt proc) or "tg" (tok gen)
   int n_samples = req.n_samples.value; // n_batch in pp and n_predict in pg
 
-  llama_kv_self_clear(app.ctx);
+  llama_memory_clear(llama_get_memory(app.ctx), true);
   int n_vocab = llama_vocab_n_tokens(app.vocab);
   int64_t t_start = ggml_time_ms();
 
@@ -837,7 +839,7 @@ glue_msg_test_perplexity_res action_test_perplexity(app_t &app, const char *req_
   }
 
   // Clear existing context to start fresh
-  llama_kv_self_clear(app.ctx);
+  llama_memory_clear(llama_get_memory(app.ctx), true);
   app.tokens.clear();
 
   const int32_t n_vocab = llama_vocab_n_tokens(app.vocab);

diff --git a/llama.cpp b/llama.cpp
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@wllama/wllama",
-  "version": "2.3.1",
+  "version": "2.3.2",
   "description": "WebAssembly binding for llama.cpp - Enabling on-browser LLM inference",
   "main": "index.js",
   "type": "module",

diff --git a/src/multi-thread/wllama.js b/src/multi-thread/wllama.js
diff --git a/src/multi-thread/wllama.wasm b/src/multi-thread/wllama.wasm
diff --git a/src/single-thread/wllama.js b/src/single-thread/wllama.js
diff --git a/src/single-thread/wllama.wasm b/src/single-thread/wllama.wasm
diff --git a/src/wasm-from-cdn.ts b/src/wasm-from-cdn.ts
@@ -2,8 +2,8 @@
 // Do not edit this file directly
 
 const WasmFromCDN = {
-  'single-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/src/single-thread/wllama.wasm',
-  'multi-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.1/src/multi-thread/wllama.wasm',
+  'single-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.2/src/single-thread/wllama.wasm',
+  'multi-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/wllama@2.3.2/src/multi-thread/wllama.wasm',
 };
 
 export default WasmFromCDN;
diff --git a/src/workers-code/generated.ts b/src/workers-code/generated.ts