@@ -7,7 +7,8 @@ import fs from "fs-extra";
 import prettyMilliseconds from "pretty-ms";
 import {getLlama} from "../../bindings/getLlama.js";
 import {
-    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption
+    BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption,
+    parseNumaOption
 } from "../../bindings/types.js";
 import {LlamaCompletion} from "../../evaluator/LlamaCompletion.js";
 import withOra from "../../utils/withOra.js";
@@ -51,6 +52,7 @@ type InfillCommand = {
     tokenPredictionDraftModel?: string,
     tokenPredictionModelContextSize?: number,
     debug: boolean,
+    numa?: LlamaNuma,
     meter: boolean,
     timing: boolean,
     noMmap: boolean,
@@ -228,6 +230,20 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
                 default: false,
                 description: "Print llama.cpp info and debug logs"
             })
+            .option("numa", {
+                type: "string",
+
+                // yargs types don't support passing `false` as a choice, although it is supported by yargs
+                choices: llamaNumaOptions as any as Exclude<typeof llamaNumaOptions[number], false>[],
+                coerce: (value) => {
+                    if (value == null || value == "")
+                        return false;
+
+                    return parseNumaOption(value);
+                },
+                defaultDescription: "false",
+                description: "NUMA allocation policy. See the `numa` option on the `getLlama` method for more information"
+            })
             .option("meter", {
                 type: "boolean",
                 default: false,
@@ -255,14 +271,14 @@ export const InfillCommand: CommandModule<object, InfillCommand> = {
         flashAttention, swaFullCache, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
-        debug, meter, timing, noMmap, printTimings
+        debug, numa, meter, timing, noMmap, printTimings
     }) {
         try {
             await RunInfill({
                 modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
                 swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
                 repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
-                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, meter, timing, noMmap, printTimings
+                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -277,7 +293,7 @@ async function RunInfill({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention,
     swaFullCache, threads, temperature, minP, topK, topP, seed, gpuLayers,
     lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, meter, timing, noMmap, printTimings
+    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings
 }: InfillCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -292,11 +308,13 @@ async function RunInfill({
         : LlamaLogLevel.warn;
     const llama = gpu == null
         ? await getLlama("lastBuild", {
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
        })
        : await getLlama({
            gpu,
-            logLevel: llamaLogLevel
+            logLevel: llamaLogLevel,
+            numa
        });
    const logBatchSize = batchSize != null;
    const useMmap = !noMmap && llama.supportsMmap;
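Usage note: a minimal sketch of how the `numa` value added by this change can reach `getLlama` when using the library directly instead of the `infill` CLI command. The "distribute" value, the model path, and the exact infill inputs are assumed examples; the accepted NUMA values are whatever `llamaNumaOptions` defines in the installed version.

// Sketch: rough programmatic equivalent of `node-llama-cpp infill --numa distribute`
import {getLlama, LlamaCompletion} from "node-llama-cpp";

// Pass the NUMA allocation policy the same way the updated `infill` command does
const llama = await getLlama({
    numa: "distribute" // assumed example value; see `llamaNumaOptions` for the accepted values
});

const model = await llama.loadModel({modelPath: "path/to/model.gguf"}); // hypothetical model path
const context = await model.createContext();
const completion = new LlamaCompletion({contextSequence: context.getSequence()});

// Infill between a prefix and a suffix, which is what the CLI command drives interactively
const infillResult = await completion.generateInfillCompletion("function add(a, b) {\n    return ", ";\n}", {
    maxTokens: 64
});
console.log(infillResult);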