#define LLAMA_MAX_RNG_STATE (64*1024)

+ #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

#define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
- #define LLAMA_SESSION_VERSION 2
+ #define LLAMA_SESSION_VERSION 3

#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
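The `LLAMA_SESSION_VERSION` bump means session files written by older builds will no longer load. A minimal sketch of how a caller might handle that, assuming the loader keeps validating the magic/version internally and returning `false` on mismatch (the helper name is hypothetical):

```c
#include <stdio.h>
#include "llama.h"

// Hypothetical helper: resume from a saved prompt cache if it is compatible.
// An old session file (previous LLAMA_SESSION_VERSION) simply fails to load,
// so we fall back to starting without a cache.
static size_t try_load_session(struct llama_context * ctx, const char * path,
                               llama_token * tokens, size_t capacity) {
    size_t n_loaded = 0;
    if (!llama_load_session_file(ctx, path, tokens, capacity, &n_loaded)) {
        fprintf(stderr, "session '%s' missing or incompatible, starting fresh\n", path);
        return 0;
    }
    return n_loaded;
}
```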
@@ -126,7 +127,7 @@ extern "C" {
    bool sorted;
} llama_token_data_array;

- typedef void (*llama_progress_callback)(float progress, void *ctx);
+ typedef bool (*llama_progress_callback)(float progress, void *ctx);

// Input data for llama_decode
// A llama_batch object can contain input about one or many sequences
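Since `llama_progress_callback` now returns `bool`, a progress handler can signal cancellation. A minimal sketch; the cancellation flag and callback name are illustrative, not part of the API:

```c
#include <stdbool.h>
#include <stdio.h>

// Illustrative cancellation flag, e.g. set from a signal handler or UI thread.
static volatile bool g_cancel_load = false;

// Matches the new typedef: bool (*)(float progress, void * ctx).
// Returning false is expected to abort model loading (see the comment added to
// llama_model_params further down in this diff).
static bool my_progress_cb(float progress, void * user_data) {
    (void) user_data;
    fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0f);
    return !g_cancel_load;
}
```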
@@ -158,16 +159,38 @@ extern "C" {
    llama_seq_id all_seq_id; // used if seq_id == NULL
} llama_batch;

+ enum llama_model_kv_override_type {
+     LLAMA_KV_OVERRIDE_INT,
+     LLAMA_KV_OVERRIDE_FLOAT,
+     LLAMA_KV_OVERRIDE_BOOL,
+ };
+
+ struct llama_model_kv_override {
+     char key[128];
+     enum llama_model_kv_override_type tag;
+     union {
+         int64_t int_value;
+         double  float_value;
+         bool    bool_value;
+     };
+ };
+
struct llama_model_params {
    int32_t n_gpu_layers; // number of layers to store in VRAM
    int32_t main_gpu;     // the GPU that is used for scratch and small tensors
    const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

-     // called with a progress value between 0 and 1, pass NULL to disable
+     // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+     // If the provided progress_callback returns true, model loading continues.
+     // If it returns false, model loading is immediately aborted.
    llama_progress_callback progress_callback;
+
    // context pointer passed to the progress callback
    void * progress_callback_user_data;

+     // override key-value pairs of the model meta data
+     const struct llama_model_kv_override * kv_overrides;
+
    // Keep the booleans together to avoid misalignment during copy-by-value.
    bool vocab_only; // only load the vocabulary, no weights
    bool use_mmap;   // use mmap if possible
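A sketch of how the new `kv_overrides` field might be filled. The key name is purely illustrative, and terminating the array with a zeroed entry (empty key) mirrors how the repo's example code appears to delimit it; treat that as an assumption rather than a documented contract:

```c
#include <string.h>
#include "llama.h"

// Sketch: override one integer metadata value at model load time.
static struct llama_model * load_with_override(const char * model_path) {
    static struct llama_model_kv_override overrides[2];
    memset(overrides, 0, sizeof(overrides));

    // "some.metadata.key" is a placeholder, not a real GGUF key
    strncpy(overrides[0].key, "some.metadata.key", sizeof(overrides[0].key) - 1);
    overrides[0].tag       = LLAMA_KV_OVERRIDE_INT;
    overrides[0].int_value = 42;
    // overrides[1] stays zeroed: an empty key marks the end of the list (assumption)

    struct llama_model_params mparams = llama_model_default_params();
    mparams.kv_overrides = overrides;

    return llama_load_model_from_file(model_path, mparams);
}
```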
@@ -185,17 +208,20 @@ extern "C" {
    // ref: https://github.com/ggerganov/llama.cpp/pull/2054
    float rope_freq_base;   // RoPE base frequency, 0 = from model
    float rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
-     float yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+     float yarn_ext_factor;  // YaRN extrapolation mix factor, negative = from model
    float yarn_attn_factor; // YaRN magnitude scaling factor
    float yarn_beta_fast;   // YaRN low correction dim
    float yarn_beta_slow;   // YaRN high correction dim
    uint32_t yarn_orig_ctx; // YaRN original context size

+     enum ggml_type type_k; // data type for K cache
+     enum ggml_type type_v; // data type for V cache
+
    // Keep the booleans together to avoid misalignment during copy-by-value.
-     bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-     bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
-     bool logits_all; // the llama_eval() call computes all logits, not just the last one
-     bool embedding;  // embedding mode only
+     bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+     bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+     bool embedding;   // embedding mode only
+     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
};

// model quantization parameters
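With `f16_kv` removed from the context parameters above, the KV cache layout is now chosen via `type_k`/`type_v`. A minimal sketch, assuming `GGML_TYPE_F16` for both reproduces the old `f16_kv = true` behaviour:

```c
#include "ggml.h"
#include "llama.h"

// Sketch: create a context with an fp16 KV cache and the KQV ops offloaded.
static struct llama_context * make_context(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.type_k      = GGML_TYPE_F16; // K cache type (was: f16_kv = true)
    cparams.type_v      = GGML_TYPE_F16; // V cache type
    cparams.offload_kqv = true;          // keep KV cache and attention ops on the GPU
    return llama_new_context_with_model(model, cparams);
}
```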
@@ -290,7 +316,9 @@ extern "C" {
LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);

- LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
+ // TODO: become more consistent with returned int types across the API
+ LLAMA_API uint32_t llama_n_ctx   (const struct llama_context * ctx);
+ LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);

LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
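Both getters now return `uint32_t`, so callers print with `%u` and compare against unsigned sizes without casts. A trivial sketch:

```c
#include <stdio.h>
#include "llama.h"

// Sketch: query per-context limits after creation.
static void print_limits(const struct llama_context * ctx) {
    printf("n_ctx = %u, n_batch = %u\n", llama_n_ctx(ctx), llama_n_batch(ctx));
}
```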
@@ -301,6 +329,23 @@ extern "C" {
// Get the model's RoPE frequency scaling factor
LLAMA_API float llama_rope_freq_scale_train (const struct llama_model * model);

+ // Functions to access the model's GGUF metadata scalar values
+ // - The functions return the length of the string on success, or -1 on failure
+ // - The output string is always null-terminated and cleared on failure
+ // - GGUF array values are not supported by these functions
+
+ // Get metadata value as a string by key name
+ LLAMA_API int llama_model_meta_val_str (const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+ // Get the number of metadata key/value pairs
+ LLAMA_API int llama_model_meta_count (const struct llama_model * model);
+
+ // Get metadata key name by index
+ LLAMA_API int llama_model_meta_key_by_index (const struct llama_model * model, int i, char * buf, size_t buf_size);
+
+ // Get metadata value as a string by index
+ LLAMA_API int llama_model_meta_val_str_by_index (const struct llama_model * model, int i, char * buf, size_t buf_size);
+
// Get a string describing the model type
LLAMA_API int llama_model_desc (const struct llama_model * model, char * buf, size_t buf_size);
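A sketch of enumerating the new metadata accessors; the 256-byte buffers are an arbitrary choice, and entries whose value cannot be rendered (e.g. GGUF arrays) are simply skipped:

```c
#include <stdio.h>
#include "llama.h"

// Sketch: dump all GGUF metadata key/value pairs of a loaded model.
static void dump_metadata(const struct llama_model * model) {
    char key[256];
    char val[256];
    const int n = llama_model_meta_count(model);
    for (int i = 0; i < n; i++) {
        if (llama_model_meta_key_by_index    (model, i, key, sizeof(key)) < 0) continue;
        if (llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) < 0) continue;
        printf("%s = %s\n", key, val);
    }
}
```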
@@ -344,9 +389,60 @@ extern "C" {
// KV cache
//

- // Returns the number of tokens in the KV cache
- LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count (const struct llama_context * ctx),
-         "avoid using this, it will be removed in the future, instead - count the tokens in user code");
+ // Information associated with an individual cell in the KV cache view.
+ struct llama_kv_cache_view_cell {
+     // The position for this cell. Takes KV cache shifts into account.
+     // May be negative if the cell is not populated.
+     llama_pos pos;
+ };
+
+ // An updateable view of the KV cache.
+ struct llama_kv_cache_view {
+     // Number of KV cache cells. This will be the same as the context size.
+     int32_t n_cells;
+
+     // Maximum number of sequences that can exist in a cell. It's not an error
+     // if there are more sequences in a cell than this value, however they will
+     // not be visible in the view cells_sequences.
+     int32_t n_max_seq;
+
+     // Number of tokens in the cache. For example, if there are two populated
+     // cells, the first with 1 sequence id in it and the second with 2 sequence
+     // ids then you'll have 3 tokens.
+     int32_t token_count;
+
+     // Number of populated cache cells.
+     int32_t used_cells;
+
+     // Maximum contiguous empty slots in the cache.
+     int32_t max_contiguous;
+
+     // Index to the start of the max_contiguous slot range. Can be negative
+     // when cache is full.
+     int32_t max_contiguous_idx;
+
+     // Information for an individual cell.
+     struct llama_kv_cache_view_cell * cells;
+
+     // The sequences for each cell. There will be n_max_seq items per cell.
+     llama_seq_id * cells_sequences;
+ };
+
+ // Create an empty KV cache view. (use only for debugging purposes)
+ LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init (const struct llama_context * ctx, int32_t n_max_seq);
+
+ // Free a KV cache view. (use only for debugging purposes)
+ LLAMA_API void llama_kv_cache_view_free (struct llama_kv_cache_view * view);
+
+ // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+ LLAMA_API void llama_kv_cache_view_update (const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+ // Returns the number of tokens in the KV cache (slow, use only for debug)
+ // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+ LLAMA_API int llama_get_kv_cache_token_count (const struct llama_context * ctx);
+
+ // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+ LLAMA_API int llama_get_kv_cache_used_cells (const struct llama_context * ctx);

// Clear the KV cache
LLAMA_API void llama_kv_cache_clear (
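A debugging sketch built on the new KV cache view API above; the choice of tracking 4 sequence ids per cell is arbitrary:

```c
#include <stdio.h>
#include "llama.h"

// Sketch: inspect KV cache occupancy. The view must be refreshed with
// llama_kv_cache_view_update() before each read and freed when done.
static void debug_kv_cache(const struct llama_context * ctx) {
    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, 4);
    llama_kv_cache_view_update(ctx, &view);

    printf("cells=%d used=%d tokens=%d max_contig=%d\n",
           view.n_cells, view.used_cells, view.token_count, view.max_contiguous);

    for (int i = 0; i < view.n_cells; i++) {
        if (view.cells[i].pos >= 0) {
            // sequence ids for this cell start at view.cells_sequences[i * view.n_max_seq]
            printf("cell %d: pos=%d first_seq=%d\n",
                   i, view.cells[i].pos, view.cells_sequences[i * view.n_max_seq]);
        }
    }

    llama_kv_cache_view_free(&view);
}
```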
@@ -517,6 +613,12 @@ extern "C" {
LLAMA_API llama_token llama_token_eos (const struct llama_model * model); // end-of-sentence
LLAMA_API llama_token llama_token_nl  (const struct llama_model * model); // next-line

+ // Returns -1 if unknown, 1 for true or 0 for false.
+ LLAMA_API int llama_add_bos_token (const struct llama_model * model);
+
+ // Returns -1 if unknown, 1 for true or 0 for false.
+ LLAMA_API int llama_add_eos_token (const struct llama_model * model);
+
// codellama infill tokens
LLAMA_API llama_token llama_token_prefix (const struct llama_model * model); // Beginning of infill prefix
LLAMA_API llama_token llama_token_middle (const struct llama_model * model); // Beginning of infill middle
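A sketch of consuming the new tri-state getters when deciding whether to prepend BOS to a prompt; defaulting to true on -1 is a policy choice here, not something the header mandates:

```c
#include <stdbool.h>
#include "llama.h"

// Sketch: map the tri-state return of llama_add_bos_token to a plain bool.
static bool should_add_bos(const struct llama_model * model) {
    const int add_bos = llama_add_bos_token(model);
    return add_bos != 0; // 1 -> true, 0 -> false, -1 (unknown) -> true by default
}
```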