Commit 4c9f0bb

ngxson and khromov authored
Sync with latest upstream, fix useCache, add getLibllamaVersion() (#189)
* fix problem with useCache

  Co-authored-by: khromov <[email protected]>

* bump to latest upstream llama.cpp
* add api for getting libllama version number
* correct doc
* fix CI
* v2.3.5
* fix submodule

---------

Co-authored-by: khromov <[email protected]>
1 parent c267097 commit 4c9f0bb

File tree

10 files changed: +40 −10 lines

.github/workflows/verify-generated-code.yml

Lines changed: 3 additions & 0 deletions
@@ -12,6 +12,9 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          submodules: 'true'

       - name: Setup Node.js
         uses: actions/setup-node@v4
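Both options presumably exist to support the new version stamping below: build_worker.sh now runs git rev-list --count HEAD and git rev-parse --short=7 HEAD inside the llama.cpp submodule, which requires the submodule to be checked out (submodules: 'true') and the full, non-shallow history to be available (fetch-depth: 0) so the commit count is correct.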

cpp/actions.hpp

Lines changed: 2 additions & 2 deletions
@@ -202,7 +202,7 @@ glue_msg_load_res action_load(app_t &app, const char *req_raw)
   if (req.swa_full.not_null())
     cparams.swa_full = req.swa_full.value;
   if (req.flash_attn.not_null())
-    cparams.flash_attn = req.flash_attn.value;
+    cparams.flash_attn_type = req.flash_attn.value ? LLAMA_FLASH_ATTN_TYPE_AUTO : LLAMA_FLASH_ATTN_TYPE_DISABLED;

   // init threadpool
   ggml_threadpool_params_default(cparams.n_threads);
@@ -775,7 +775,7 @@ glue_msg_status_res action_current_status(app_t &app, const char *req_raw)
   PARSE_REQ(glue_msg_status_req);
   glue_msg_status_res res;
   res.success.value = true;
-  res.tokens.arr = std::move(app.tokens);
+  res.tokens.arr = app.tokens; // copy
   return res;
 }

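Two separate fixes land in this file. Upstream llama.cpp replaced the boolean cparams.flash_attn with a flash_attn_type enum, so wllama's boolean option is now mapped to LLAMA_FLASH_ATTN_TYPE_AUTO (true) or LLAMA_FLASH_ATTN_TYPE_DISABLED (false). And in action_current_status, std::move(app.tokens) left app.tokens empty after every status query, silently discarding the cached-token state; copying instead appears to be the useCache fix named in the commit message.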
llama.cpp (submodule updated to latest upstream)

package.json

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 {
   "name": "@wllama/wllama",
-  "version": "2.3.4",
+  "version": "2.3.5",
   "description": "WebAssembly binding for llama.cpp - Enabling on-browser LLM inference",
   "main": "index.js",
   "type": "module",

scripts/build_worker.sh

Lines changed: 11 additions & 2 deletions
@@ -3,13 +3,22 @@
 set -e

 CURRENT_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+
+# change to the llama.cpp directory
 cd $CURRENT_PATH
-cd ..
+cd ../llama.cpp
+BUILD_NUMBER="$(git rev-list --count HEAD)"
+SHORT_HASH="$(git rev-parse --short=7 HEAD)"

-# we're on the root of the project
+# change to the root of the project
+cd $CURRENT_PATH
+cd ..

 echo "// This file is auto-generated" > ./src/workers-code/generated.ts
 echo "// To re-generate it, run: npm run build:worker" >> ./src/workers-code/generated.ts
+echo "" >> ./src/workers-code/generated.ts
+echo "export const LIBLLAMA_VERSION = 'b${BUILD_NUMBER}-${SHORT_HASH}';" >> ./src/workers-code/generated.ts
+echo "" >> ./src/workers-code/generated.ts

 process_file() {
   local file="$1"
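The b${BUILD_NUMBER}-${SHORT_HASH} format mirrors llama.cpp's own release naming, where the build number is the commit count. With the submodule at this commit, the script emits the following header into src/workers-code/generated.ts (the same values appear in the generated.ts diff below):

    // This file is auto-generated
    // To re-generate it, run: npm run build:worker

    export const LIBLLAMA_VERSION = 'b6327-4d74393';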

src/multi-thread/wllama.wasm

71.5 KB
Binary file not shown.

src/single-thread/wllama.wasm

70.8 KB
Binary file not shown.

src/wasm-from-cdn.ts

Lines changed: 2 additions & 2 deletions
@@ -2,8 +2,8 @@
 // Do not edit this file directly

 const WasmFromCDN = {
-  'single-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/[email protected]/src/single-thread/wllama.wasm',
-  'multi-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/[email protected]/src/multi-thread/wllama.wasm',
+  'single-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/[email protected]/src/single-thread/wllama.wasm',
+  'multi-thread/wllama.wasm': 'https://cdn.jsdelivr.net/npm/@wllama/[email protected]/src/multi-thread/wllama.wasm',
 };

 export default WasmFromCDN;

src/wllama.ts

Lines changed: 17 additions & 2 deletions
@@ -31,6 +31,7 @@ import type {
   GlueMsgTestPerplexityRes,
   GlueMsgTokenizeRes,
 } from './glue/messages';
+import { LIBLLAMA_VERSION } from './workers-code/generated';

 const HF_MODEL_ID_REGEX = /^([a-zA-Z0-9_\-\.]+)\/([a-zA-Z0-9_\-\.]+)$/;
 const HF_MODEL_ID_REGEX_EXPLAIN =
@@ -115,7 +116,7 @@ export interface LoadModelConfig {
   // optimizations
   cache_type_k?: 'f32' | 'f16' | 'q8_0' | 'q5_1' | 'q5_0' | 'q4_1' | 'q4_0';
   cache_type_v?: 'f32' | 'f16' | 'q8_0' | 'q5_1' | 'q5_0' | 'q4_1' | 'q4_0';
-  flash_attn?: boolean;
+  flash_attn?: boolean; // true is auto, false is disabled
 }

 export interface SamplingConfig {
@@ -326,6 +327,15 @@ export class Wllama {
     }
   }

+  /**
+   * Get the libllama version string, e.g. "b6327-4d74393".
+   *
+   * @returns version string embedded at build time.
+   */
+  static getLibllamaVersion(): string {
+    return LIBLLAMA_VERSION;
+  }
+
   /**
    * Check if the model is loaded via `loadModel()`
    */
@@ -1195,7 +1205,12 @@ export class Wllama {
     if (!result.success) {
       throw new WllamaError('kvRemove unknown error');
     }
-    this.nCachedTokens -= nDiscard;
+    // When nDiscard is negative (-1), it means remove everything after nKeep
+    if (nDiscard < 0) {
+      this.nCachedTokens = nKeep;
+    } else {
+      this.nCachedTokens -= nDiscard;
+    }
   }

   /**
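Taken together, the user-facing pieces can be exercised as below; a minimal TypeScript sketch, assuming the WasmFromCDN import path and constructor usage from the wllama README, an illustrative model URL, and the kvRemove(nKeep, nDiscard) signature inferred from the hunk above:

    import { Wllama } from '@wllama/wllama';
    import WasmFromCDN from '@wllama/wllama/esm/wasm-from-cdn.js';

    // Static method, so it is readable before any model is loaded
    console.log(Wllama.getLibllamaVersion()); // e.g. "b6327-4d74393"

    const wllama = new Wllama(WasmFromCDN);
    await wllama.loadModelFromUrl('https://example.com/model.gguf', {
      flash_attn: true, // true -> LLAMA_FLASH_ATTN_TYPE_AUTO, false -> DISABLED
    });

    // With nDiscard = -1, everything after nKeep is dropped and
    // nCachedTokens is reset to nKeep (the fix above)
    await wllama.kvRemove(0, -1);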

src/workers-code/generated.ts

Lines changed: 3 additions & 0 deletions
@@ -1,5 +1,8 @@
 // This file is auto-generated
 // To re-generate it, run: npm run build:worker
+
+export const LIBLLAMA_VERSION = 'b6327-4d74393';
+
export const LLAMA_CPP_WORKER_CODE = "// Start the main llama.cpp\nlet wllamaMalloc;\nlet wllamaStart;\nlet wllamaAction;\nlet wllamaExit;\nlet wllamaDebug;\n\nlet Module = null;\n\n//////////////////////////////////////////////////////////////\n// UTILS\n//////////////////////////////////////////////////////////////\n\n// send message back to main thread\nconst msg = (data, transfer) => postMessage(data, transfer);\n\n// Convert CPP log into JS log\nconst cppLogToJSLog = (line) => {\n const matched = line.match(/@@(DEBUG|INFO|WARN|ERROR)@@(.*)/);\n return !!matched\n ? {\n level: (matched[1] === 'INFO' ? 'debug' : matched[1]).toLowerCase(),\n text: matched[2],\n }\n : { level: 'log', text: line };\n};\n\n// Get module config that forwards stdout/err to main thread\nconst getWModuleConfig = (_argMainScriptBlob) => {\n var pathConfig = RUN_OPTIONS.pathConfig;\n var pthreadPoolSize = RUN_OPTIONS.nbThread;\n var argMainScriptBlob = _argMainScriptBlob;\n\n if (!pathConfig['wllama.wasm']) {\n throw new Error('\"wllama.wasm\" is missing in pathConfig');\n }\n return {\n noInitialRun: true,\n print: function (text) {\n if (arguments.length > 1)\n text = Array.prototype.slice.call(arguments).join(' ');\n msg({ verb: 'console.log', args: [text] });\n },\n printErr: function (text) {\n if (arguments.length > 1)\n text = Array.prototype.slice.call(arguments).join(' ');\n const logLine = cppLogToJSLog(text);\n msg({ verb: 'console.' + logLine.level, args: [logLine.text] });\n },\n locateFile: function (filename, basePath) {\n const p = pathConfig[filename];\n const truncate = (str) =>\n str.length > 128 ? `${str.substr(0, 128)}...` : str;\n if (filename.match(/wllama\\.worker\\.js/)) {\n msg({\n verb: 'console.error',\n args: [\n '\"wllama.worker.js\" is removed from v2.2.1. Hint: make sure to clear browser\\'s cache.',\n ],\n });\n } else {\n msg({\n verb: 'console.debug',\n args: [`Loading \"${filename}\" from \"${truncate(p)}\"`],\n });\n return p;\n }\n },\n mainScriptUrlOrBlob: argMainScriptBlob,\n pthreadPoolSize,\n wasmMemory: pthreadPoolSize > 1 ? getWasmMemory() : null,\n onAbort: function (text) {\n msg({ verb: 'signal.abort', args: [text] });\n },\n };\n};\n\n// Get the memory to be used by wasm. (Only used in multi-thread mode)\n// Because we have a weird OOM issue on iOS, we need to try some values\n// See: https://github.com/emscripten-core/emscripten/issues/19144\n// https://github.com/godotengine/godot/issues/70621\nconst getWasmMemory = () => {\n let minBytes = 128 * 1024 * 1024;\n let maxBytes = 4096 * 1024 * 1024;\n let stepBytes = 128 * 1024 * 1024;\n while (maxBytes > minBytes) {\n try {\n const wasmMemory = new WebAssembly.Memory({\n initial: minBytes / 65536,\n maximum: maxBytes / 65536,\n shared: true,\n });\n return wasmMemory;\n } catch (e) {\n maxBytes -= stepBytes;\n continue; // retry\n }\n }\n throw new Error('Cannot allocate WebAssembly.Memory');\n};\n\n//////////////////////////////////////////////////////////////\n// MEMFS PATCH\n//////////////////////////////////////////////////////////////\n\n/**\n * By default, emscripten uses memfs. The way it works is by\n * allocating new Uint8Array in javascript heap. This is not good\n * because it requires files to be copied to wasm heap each time\n * a file is read.\n *\n * HeapFS is an alternative, which resolves this problem by\n * allocating space for file directly inside wasm heap. 
This\n * allows us to mmap without doing any copy.\n *\n * For llama.cpp, this is great because we use MAP_SHARED\n *\n * Ref: https://github.com/ngxson/wllama/pull/39\n * Ref: https://github.com/emscripten-core/emscripten/blob/main/src/library_memfs.js\n *\n * Note 29/05/2024 @ngxson\n * Due to ftell() being limited to MAX_LONG, we cannot load files bigger than 2^31 bytes (or 2GB)\n * Ref: https://github.com/emscripten-core/emscripten/blob/main/system/lib/libc/musl/src/stdio/ftell.c\n */\n\nconst fsNameToFile = {}; // map Name => File\nconst fsIdToFile = {}; // map ID => File\nlet currFileId = 0;\n\n// Patch and redirect memfs calls to wllama\nconst patchMEMFS = () => {\n const m = Module;\n // save functions\n m.MEMFS.stream_ops._read = m.MEMFS.stream_ops.read;\n m.MEMFS.stream_ops._write = m.MEMFS.stream_ops.write;\n m.MEMFS.stream_ops._llseek = m.MEMFS.stream_ops.llseek;\n m.MEMFS.stream_ops._allocate = m.MEMFS.stream_ops.allocate;\n m.MEMFS.stream_ops._mmap = m.MEMFS.stream_ops.mmap;\n m.MEMFS.stream_ops._msync = m.MEMFS.stream_ops.msync;\n\n const patchStream = (stream) => {\n const name = stream.node.name;\n if (fsNameToFile[name]) {\n const f = fsNameToFile[name];\n stream.node.contents = m.HEAPU8.subarray(f.ptr, f.ptr + f.size);\n stream.node.usedBytes = f.size;\n }\n };\n\n // replace \"read\" functions\n m.MEMFS.stream_ops.read = function (\n stream,\n buffer,\n offset,\n length,\n position\n ) {\n patchStream(stream);\n return m.MEMFS.stream_ops._read(stream, buffer, offset, length, position);\n };\n m.MEMFS.ops_table.file.stream.read = m.MEMFS.stream_ops.read;\n\n // replace \"llseek\" functions\n m.MEMFS.stream_ops.llseek = function (stream, offset, whence) {\n patchStream(stream);\n return m.MEMFS.stream_ops._llseek(stream, offset, whence);\n };\n m.MEMFS.ops_table.file.stream.llseek = m.MEMFS.stream_ops.llseek;\n\n // replace \"mmap\" functions\n m.MEMFS.stream_ops.mmap = function (stream, length, position, prot, flags) {\n patchStream(stream);\n const name = stream.node.name;\n if (fsNameToFile[name]) {\n const f = fsNameToFile[name];\n return {\n ptr: f.ptr + position,\n allocated: false,\n };\n } else {\n return m.MEMFS.stream_ops._mmap(stream, length, position, prot, flags);\n }\n };\n m.MEMFS.ops_table.file.stream.mmap = m.MEMFS.stream_ops.mmap;\n\n // mount FS\n m.FS.mkdir('/models');\n m.FS.mount(m.MEMFS, { root: '.' 
}, '/models');\n};\n\n// Allocate a new file in wllama heapfs, returns file ID\nconst heapfsAlloc = (name, size) => {\n if (size < 1) {\n throw new Error('File size must be bigger than 0');\n }\n const m = Module;\n const ptr = m.mmapAlloc(size);\n const file = {\n ptr: ptr,\n size: size,\n id: currFileId++,\n };\n fsIdToFile[file.id] = file;\n fsNameToFile[name] = file;\n return file.id;\n};\n\n// Add new file to wllama heapfs, return number of written bytes\nconst heapfsWrite = (id, buffer, offset) => {\n const m = Module;\n if (fsIdToFile[id]) {\n const { ptr, size } = fsIdToFile[id];\n const afterWriteByte = offset + buffer.byteLength;\n if (afterWriteByte > size) {\n throw new Error(\n `File ID ${id} write out of bound, afterWriteByte = ${afterWriteByte} while size = ${size}`\n );\n }\n m.HEAPU8.set(buffer, ptr + offset);\n return buffer.byteLength;\n } else {\n throw new Error(`File ID ${id} not found in heapfs`);\n }\n};\n\n//////////////////////////////////////////////////////////////\n// MAIN CODE\n//////////////////////////////////////////////////////////////\n\nconst callWrapper = (name, ret, args) => {\n const fn = Module.cwrap(name, ret, args);\n return async (action, req) => {\n let result;\n try {\n if (args.length === 2) {\n result = await fn(action, req);\n } else {\n result = fn();\n }\n } catch (ex) {\n console.error(ex);\n throw ex;\n }\n return result;\n };\n};\n\nonmessage = async (e) => {\n if (!e.data) return;\n const { verb, args, callbackId } = e.data;\n\n if (!callbackId) {\n msg({ verb: 'console.error', args: ['callbackId is required', e.data] });\n return;\n }\n\n if (verb === 'module.init') {\n const argMainScriptBlob = args[0];\n try {\n Module = getWModuleConfig(argMainScriptBlob);\n Module.onRuntimeInitialized = () => {\n // async call once module is ready\n // init FS\n patchMEMFS();\n // init cwrap\n const pointer = 'number';\n // TODO: note sure why emscripten cannot bind if there is only 1 argument\n wllamaMalloc = callWrapper('wllama_malloc', pointer, [\n 'number',\n pointer,\n ]);\n wllamaStart = callWrapper('wllama_start', 'string', []);\n wllamaAction = callWrapper('wllama_action', pointer, [\n 'string',\n pointer,\n ]);\n wllamaExit = callWrapper('wllama_exit', 'string', []);\n wllamaDebug = callWrapper('wllama_debug', 'string', []);\n msg({ callbackId, result: null });\n };\n wModuleInit();\n } catch (err) {\n msg({ callbackId, err });\n }\n return;\n }\n\n if (verb === 'fs.alloc') {\n const argFilename = args[0];\n const argSize = args[1];\n try {\n // create blank file\n const emptyBuffer = new ArrayBuffer(0);\n Module['FS_createDataFile'](\n '/models',\n argFilename,\n emptyBuffer,\n true,\n true,\n true\n );\n // alloc data on heap\n const fileId = heapfsAlloc(argFilename, argSize);\n msg({ callbackId, result: { fileId } });\n } catch (err) {\n msg({ callbackId, err });\n }\n return;\n }\n\n if (verb === 'fs.write') {\n const argFileId = args[0];\n const argBuffer = args[1];\n const argOffset = args[2];\n try {\n const writtenBytes = heapfsWrite(argFileId, argBuffer, argOffset);\n msg({ callbackId, result: { writtenBytes } });\n } catch (err) {\n msg({ callbackId, err });\n }\n return;\n }\n\n if (verb === 'wllama.start') {\n try {\n const result = await wllamaStart();\n msg({ callbackId, result });\n } catch (err) {\n msg({ callbackId, err });\n }\n return;\n }\n\n if (verb === 'wllama.action') {\n const argAction = args[0];\n const argEncodedMsg = args[1];\n try {\n const inputPtr = await wllamaMalloc(argEncodedMsg.byteLength, 0);\n // copy 
data to wasm heap\n const inputBuffer = new Uint8Array(\n Module.HEAPU8.buffer,\n inputPtr,\n argEncodedMsg.byteLength\n );\n inputBuffer.set(argEncodedMsg, 0);\n const outputPtr = await wllamaAction(argAction, inputPtr);\n // length of output buffer is written at the first 4 bytes of input buffer\n const outputLen = new Uint32Array(Module.HEAPU8.buffer, inputPtr, 1)[0];\n // copy the output buffer to JS heap\n const outputBuffer = new Uint8Array(outputLen);\n const outputSrcView = new Uint8Array(\n Module.HEAPU8.buffer,\n outputPtr,\n outputLen\n );\n outputBuffer.set(outputSrcView, 0); // copy it\n msg({ callbackId, result: outputBuffer }, [outputBuffer.buffer]);\n } catch (err) {\n msg({ callbackId, err });\n }\n return;\n }\n\n if (verb === 'wllama.exit') {\n try {\n const result = await wllamaExit();\n msg({ callbackId, result });\n } catch (err) {\n msg({ callbackId, err });\n }\n return;\n }\n\n if (verb === 'wllama.debug') {\n try {\n const result = await wllamaDebug();\n msg({ callbackId, result });\n } catch (err) {\n msg({ callbackId, err });\n }\n return;\n }\n};\n";

export const OPFS_UTILS_WORKER_CODE = "let accessHandle;\nlet abortController = new AbortController();\n\nasync function openFile(filename) {\n const opfsRoot = await navigator.storage.getDirectory();\n const cacheDir = await opfsRoot.getDirectoryHandle('cache', { create: true });\n const fileHandler = await cacheDir.getFileHandle(filename, { create: true });\n accessHandle = await fileHandler.createSyncAccessHandle();\n accessHandle.truncate(0); // clear file content\n}\n\nasync function writeFile(buf) {\n accessHandle.write(buf);\n}\n\nasync function closeFile() {\n accessHandle.flush();\n accessHandle.close();\n}\n\nasync function writeTextFile(filename, str) {\n await openFile(filename);\n await writeFile(new TextEncoder().encode(str));\n await closeFile();\n}\n\nconst throttled = (func, delay) => {\n let lastRun = 0;\n return (...args) => {\n const now = Date.now();\n if (now - lastRun > delay) {\n lastRun = now;\n func.apply(null, args);\n }\n };\n};\n\nconst assertNonNull = (val) => {\n if (val === null || val === undefined) {\n throw new Error('OPFS Worker: Assertion failed');\n }\n};\n\n// respond to main thread\nconst resOK = () => postMessage({ ok: true });\nconst resProgress = (loaded, total) =>\n postMessage({ progress: { loaded, total } });\nconst resErr = (err) => postMessage({ err });\n\nonmessage = async (e) => {\n try {\n if (!e.data) return;\n\n /**\n * @param {Object} e.data\n *\n * Fine-control FS actions:\n * - { action: 'open', filename: 'string' }\n * - { action: 'write', buf: ArrayBuffer }\n * - { action: 'close' }\n *\n * Simple write API:\n * - { action: 'write-simple', filename: 'string', buf: ArrayBuffer }\n *\n * Download API:\n * - { action: 'download', url: 'string', filename: 'string', options: Object, metadataFileName: 'string' }\n * - { action: 'download-abort' }\n */\n const { action, filename, buf, url, options, metadataFileName } = e.data;\n\n if (action === 'open') {\n assertNonNull(filename);\n await openFile(filename);\n return resOK();\n } else if (action === 'write') {\n assertNonNull(buf);\n await writeFile(buf);\n return resOK();\n } else if (action === 'close') {\n await closeFile();\n return resOK();\n } else if (action === 'write-simple') {\n assertNonNull(filename);\n assertNonNull(buf);\n await openFile(filename);\n await writeFile(buf);\n await closeFile();\n return resOK();\n } else if (action === 'download') {\n assertNonNull(url);\n assertNonNull(filename);\n assertNonNull(metadataFileName);\n assertNonNull(options);\n assertNonNull(options.aborted);\n abortController = new AbortController();\n if (options.aborted) abortController.abort();\n const response = await fetch(url, {\n ...options,\n signal: abortController.signal,\n });\n const contentLength = response.headers.get('content-length');\n const etag = (response.headers.get('etag') || '').replace(\n /[^A-Za-z0-9]/g,\n ''\n );\n const total = parseInt(contentLength, 10);\n const reader = response.body.getReader();\n await openFile(filename);\n let loaded = 0;\n const throttledProgress = throttled(resProgress, 100);\n while (true) {\n const { done, value } = await reader.read();\n if (done) break;\n loaded += value.byteLength;\n await writeFile(value);\n throttledProgress(loaded, total);\n }\n resProgress(total, total); // 100% done\n await closeFile();\n // make sure this is in-sync with CacheEntryMetadata\n await writeTextFile(\n metadataFileName,\n JSON.stringify({\n originalURL: url,\n originalSize: total,\n etag,\n })\n );\n return resOK();\n } else if (action === 
'download-abort') {\n if (abortController) {\n abortController.abort();\n }\n return;\n }\n\n throw new Error('OPFS Worker: Invalid action', e.data);\n } catch (err) {\n return resErr(err);\n }\n};\n";
