
Commit 03f8ba9

add benchmark function, used internally (#151)
* add benchmark function, used internally
* lint
* 2.1.4 (fix messed up publish command)
1 parent e05af9e commit 03f8ba9

File tree

14 files changed: 290 additions & 12 deletions


actions.hpp

Lines changed: 126 additions & 3 deletions
@@ -684,14 +684,137 @@ json action_current_status(app_t &app, json &body)
   };
 }
 
+//
+// benchmark & perplexity
+//
+
+json action_test_benchmark(app_t &app, json &body)
+{
+  std::string type = body.at("type");   // "pp" (prompt proc) or "tg" (tok gen)
+  int n_samples = body.at("n_samples"); // n_batch in pp and n_predict in pg
+
+  llama_kv_cache_clear(app.ctx);
+  int n_vocab = llama_vocab_n_tokens(app.vocab);
+  int64_t t_start = ggml_time_ms();
+
+  if (type == "pp")
+  {
+    llama_batch batch = llama_batch_init(n_samples, 0, 1);
+    for (int i = 0; i < n_samples; i++)
+    {
+      common_batch_add(batch, i % n_vocab, i, {0}, i == n_samples - 1);
+    }
+    int ret = llama_decode(app.ctx, batch);
+    llama_batch_free(batch);
+    if (ret != 0)
+    {
+      return json{{"error", "llama_decode failed with status = " + std::to_string(ret)}};
+    }
+  }
+  else if (type == "tg")
+  {
+    llama_batch batch = llama_batch_init(1, 0, 1);
+    for (int i = 0; i < n_samples; i++)
+    {
+      common_batch_clear(batch);
+      common_batch_add(batch, i % n_vocab, i, {0}, true);
+      int ret = llama_decode(app.ctx, batch);
+      if (ret != 0)
+      {
+        return json{{"error", "llama_decode failed with status = " + std::to_string(ret)}};
+      }
+    }
+    llama_batch_free(batch);
+  }
+  else
+  {
+    return json{{"error", "unknown type: " + type}};
+  }
+
+  int64_t t_end = ggml_time_ms();
+  return json{
+      {"success", true},
+      {"t_ms", t_end - t_start},
+  };
+}
+
+json action_test_perplexity(app_t &app, json &body)
+{
+  llama_tokens input = body["input"];
+  const size_t n = input.size();
+
+  int64_t t_start = ggml_time_ms();
+
+  if (n < 2)
+  {
+    return json{{"error", "Input must contain at least two tokens"}};
+  }
+
+  // Clear existing context to start fresh
+  llama_kv_cache_clear(app.ctx);
+  app.tokens.clear();
+
+  const int32_t n_vocab = llama_vocab_n_tokens(app.vocab);
+  double nll = 0.0;
+
+  static auto log_softmax = [](int n_vocab, const float *logits, int tok) -> double
+  {
+    float max_logit = logits[0];
+    for (int i = 1; i < n_vocab; ++i)
+    {
+      max_logit = std::max(max_logit, logits[i]);
+    }
+    double sum_exp = 0.0;
+    for (int i = 0; i < n_vocab; ++i)
+    {
+      sum_exp += expf(logits[i] - max_logit);
+    }
+    return logits[tok] - max_logit - log(sum_exp);
+  };
+
+  for (size_t i = 0; i < n - 1; ++i)
+  {
+    // Prepare batch with current token (input[i])
+    common_batch_clear(app.batch);
+    common_batch_add(app.batch, input[i], i, {0}, true); // Enable logits for this token
+
+    if (llama_decode(app.ctx, app.batch) != 0)
+    {
+      return json{{"error", "Decoding failed at position " + std::to_string(i)}};
+    }
+
+    float *logits = llama_get_logits_ith(app.ctx, 0);
+
+    // Get true next token (input[i+1])
+    const int32_t true_token = input[i + 1];
+
+    nll += -log_softmax(n_vocab, logits, true_token);
+  }
+
+  // Calculate final metrics
+  const double cross_entropy = nll / (n - 1);
+  const double ppl = std::exp(cross_entropy);
+
+  int64_t t_end = ggml_time_ms();
+
+  return json{
+      {"success", true},
+      {"ppl", ppl},
+      {"nll", nll},
+      {"cross_entropy", cross_entropy},
+      {"n_tokens", n - 1},
+      {"t_ms", t_end - t_start},
+  };
+}
+
 //////////////////////////////////////////
 
 // because we can't support jinja for now, we temporary use an old version of common_chat_apply_template
 // TODO: support jinja
 std::string common_chat_apply_template_old(const struct llama_model *model,
-    const std::string &tmpl,
-    const std::vector<common_chat_msg> &msgs,
-    bool add_ass)
+                                           const std::string &tmpl,
+                                           const std::vector<common_chat_msg> &msgs,
+                                           bool add_ass)
 {
   int alloc_size = 0;
   bool fallback = false; // indicate if we must fallback to default chatml
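
Editor's note on the perplexity math above (a restatement of the code, not part of the diff): log_softmax subtracts the max logit purely for numerical stability, the negative log-probability of each true next token is accumulated into nll, and the average is exponentiated. With z^{(i)} the logits produced after decoding token x_i and x_{i+1} the true next token:

    \log p(x_{i+1} \mid x_{\le i}) = z^{(i)}_{x_{i+1}} - \max_j z^{(i)}_j - \log \sum_j \exp\big(z^{(i)}_j - \max_k z^{(i)}_k\big)

    \mathrm{nll} = -\sum_{i=1}^{n-1} \log p(x_{i+1} \mid x_{\le i}), \qquad \mathrm{cross\_entropy} = \frac{\mathrm{nll}}{n-1}, \qquad \mathrm{ppl} = \exp(\mathrm{cross\_entropy})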

examples/main/src/App.tsx

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ import Sidebar from './components/Sidebar';
 import { MessagesProvider } from './utils/messages.context';
 import { Screen } from './utils/types';
 import { useWllama, WllamaProvider } from './utils/wllama.context';
+import './utils/benchmark';
 
 function App() {
   return (
examples/main/src/utils/benchmark.ts

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
+import { Wllama } from '@wllama/wllama';
+import { WLLAMA_CONFIG_PATHS } from '../config';
+import { delay } from './utils';
+
+// TODO: this is console-only for now, should we implement a GUI in the future?
+
+const WIKITEXT_URL =
+  'https://raw.githubusercontent.com/wangfin/QAsystem/refs/heads/master/QAManagement/language_model/data/wikitext-2/valid.txt';
+
+const BENCH_MODELS = [
+  'https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q8_0.gguf',
+  'https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf',
+  'https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf',
+  'https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q5_K_L.gguf',
+];
+
+const BENCH_N_REPEATED = 4;
+
+const BENCH_CONFIGS: { type: 'pp' | 'tg'; n_samples: number }[] = [
+  { type: 'pp', n_samples: 32 },
+  { type: 'pp', n_samples: 64 },
+  { type: 'pp', n_samples: 128 },
+  { type: 'pp', n_samples: 256 },
+  { type: 'tg', n_samples: 32 },
+  { type: 'tg', n_samples: 64 },
+  { type: 'tg', n_samples: 128 },
+  { type: 'tg', n_samples: 256 },
+];
+
+async function loadModel(modelUrl: string) {
+  const modelFile = modelUrl.split('/').pop();
+  const wllama = new Wllama(WLLAMA_CONFIG_PATHS);
+  await wllama.loadModelFromUrl(modelUrl, {
+    n_batch: 512,
+    n_ctx: 4096,
+    progressCallback: ({ total, loaded }) => {
+      console.log(`Model ${modelFile}: ${Math.round((100 * loaded) / total)}%`);
+    },
+  });
+  return { wllama, modelFile };
+}
+
+async function benchmark() {
+  const output: any[][] = [
+    ['model', 'threads', 'test', 't/s'],
+    ['---', '---', '---', '---'],
+  ];
+  for (const modelUrl of BENCH_MODELS) {
+    const [{ wllama, modelFile }] = await Promise.all([
+      loadModel(modelUrl),
+      delay(10000), // force delay for CPU to cool down
+    ]);
+    console.clear();
+    const nThreads = wllama.getNumThreads();
+    for (const config of BENCH_CONFIGS) {
+      const { type, n_samples } = config;
+      const results: number[] = [];
+      for (let i = 0; i < BENCH_N_REPEATED; i++) {
+        console.log('Running', modelFile, config);
+        const { t_ms } = await wllama._testBenchmark(type, n_samples);
+        const t_per_tok = n_samples / (t_ms / 1000);
+        results.push(t_per_tok);
+        console.log('Run ', i, 'pref:', t_per_tok, 't/s');
+      }
+      const t_avg = results.reduce((a, b) => a + b, 0) / results.length;
+      const t_plus_minus = Math.abs(
+        Math.max(...results) - Math.min(...results)
+      );
+      output.push([
+        modelFile,
+        nThreads,
+        `${type} ${n_samples}`,
+        `${t_avg.toFixed(2)} ± ${t_plus_minus.toFixed(2)}`,
+      ]);
+    }
+    wllama.exit();
+  }
+
+  console.table(output);
+  const markdown = output
+    .map((row) => '| ' + row.join(' | ') + ' |')
+    .join('\n');
+  console.log(markdown);
+}
+
+async function perplexity() {
+  const output: any[][] = [
+    ['model', 'PPL', 'n_tokens'],
+    ['---', '---', '---'],
+  ];
+  const LIMIT_TOKENS = 2048;
+  const wikitext = await fetch(WIKITEXT_URL).then((res) => res.text());
+  console.log('Loaded wikitext:', wikitext.substring(0, 100), '...');
+  for (const modelUrl of BENCH_MODELS) {
+    const { wllama, modelFile } = await loadModel(modelUrl);
+    console.clear();
+    let tokens = await wllama.tokenize(
+      wikitext.substring(0, LIMIT_TOKENS * 16)
+    );
+    tokens = tokens.slice(0, LIMIT_TOKENS);
+    console.log('Running', modelFile, 'n_tokens', tokens.length);
+    const { ppl } = await wllama._testPerplexity(tokens);
+    console.log('PPL:', ppl);
+    output.push([modelFile, ppl, tokens.length]);
+    wllama.exit();
+  }
+
+  console.table(output);
+  const markdown = output
+    .map((row) => '| ' + row.join(' | ') + ' |')
+    .join('\n');
+  console.log(markdown);
+}
+
+(window as any).__benchmark = benchmark;
+(window as any).__perplexity = perplexity;
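
Editor's note: since this helper is console-only, the intended entry point appears to be the browser DevTools console of the running example app, once the module above has been imported and the two globals are registered. A minimal sketch of invoking them (in the console the cast through any can be dropped and plain window.__benchmark() works):

// Kick off the throughput benchmark: loads each model in BENCH_MODELS,
// runs the pp/tg configs, and prints a markdown-ready table to the console.
await (window as any).__benchmark();

// Run the wikitext-2 perplexity pass over the same models.
await (window as any).__perplexity();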

llama.cpp (submodule)

package.json

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 {
   "name": "@wllama/wllama",
-  "version": "2.1.3",
+  "version": "2.1.4",
   "description": "WebAssembly binding for llama.cpp - Enabling on-browser LLM inference",
   "main": "index.js",
   "type": "module",

scripts/docker-compose.yml

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ services:
 mkdir -p wasm/single-thread
 cd wasm/single-thread
 
-export SHARED_EMCC_CFLAGS="--no-entry -O3 -msimd128 -fno-rtti -DNDEBUG -flto=full -frtti -fwasm-exceptions -sEXPORT_ALL=1 -sEXPORT_ES6=0 -sMODULARIZE=0 -sINITIAL_MEMORY=128MB -sMAXIMUM_MEMORY=4096MB -sALLOW_MEMORY_GROWTH=1 -sFORCE_FILESYSTEM=1 -sEXPORTED_FUNCTIONS=_main,_wllama_start,_wllama_action,_wllama_exit,_wllama_debug -sEXPORTED_RUNTIME_METHODS=ccall,cwrap -sNO_EXIT_RUNTIME=1"
+export SHARED_EMCC_CFLAGS="--no-entry -O3 -msimd128 -DNDEBUG -flto=full -frtti -fwasm-exceptions -sEXPORT_ALL=1 -sEXPORT_ES6=0 -sMODULARIZE=0 -sINITIAL_MEMORY=128MB -sMAXIMUM_MEMORY=4096MB -sALLOW_MEMORY_GROWTH=1 -sFORCE_FILESYSTEM=1 -sEXPORTED_FUNCTIONS=_main,_wllama_start,_wllama_action,_wllama_exit,_wllama_debug -sEXPORTED_RUNTIME_METHODS=ccall,cwrap -sNO_EXIT_RUNTIME=1"
 
 # emcc --clear-cache

src/multi-thread/wllama.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

src/multi-thread/wllama.wasm

10.4 KB
Binary file not shown.

src/single-thread/wllama.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

src/single-thread/wllama.wasm

11.1 KB
Binary file not shown.

0 commit comments