Merged

18 commits:
- 443678a  ggml : fix GGML_MAX_N_THREADS + improve formatting (ggml/969)  [ggerganov, Sep 24, 2024]
- 8fc9239  vulkan : argsort barriers must be under uniform control flow (ggml/951)  [smeso, Sep 26, 2024]
- 45c860f  vulkan : fix build for GGML_VULKAN_RUN_TESTS, add TFLOPS to log (ggml…  [jeffbolznv, Sep 27, 2024]
- ec8e919  vulkan : multithread pipeline creation (ggml/963)  [jeffbolznv, Sep 29, 2024]
- 44e2d39  CUDA: remove bad assert (ggml/972)  [JohannesGaessler, Sep 29, 2024]
- d2eac9f  cann: fix crash when llama-bench is running on multiple cann devices …  [bachelor-dou, Sep 25, 2024]
- b0978f0  ggml : remove assert for AArch64 GEMV and GEMM Q4 kernels (llama/9217)  [chaxu01, Sep 25, 2024]
- 0448551  mtgpu: enable VMM (llama/9597)  [yeahdongcn, Sep 26, 2024]
- fd5cb2b  Enable use to the rebar feature to upload buffers to the device. (lla…  [mtavenrath, Sep 28, 2024]
- a98b5fa  ggml : add run-time detection of neon, i8mm and sve (llama/9331)  [eddnjjn, Sep 28, 2024]
- 9d176ca  ggml : define missing HWCAP flags (llama/9684)  [ggerganov, Sep 29, 2024]
- 034ed81  ggml: fix gradient allocation logic (ggml/966)  [JohannesGaessler, Sep 29, 2024]
- ee8e29c  ggml : fix ggml_cast (ggml/973)  [iboB, Sep 30, 2024]
- 28a3391  vulkan : mul_mat: fix UB with small warps (ggml/952)  [smeso, Sep 30, 2024]
- 2eda43a  test: fix OPT_STEP_ADAMW for test-backend-ops (ggml/974)  [JohannesGaessler, Sep 30, 2024]
- fce227e  sync : ggml  [ggerganov, Oct 2, 2024]
- f083908  metal : reduce command encoding overhead (llama/9698)  [ggerganov, Oct 2, 2024]
- b4c9631  talk-llama : sync llama.cpp  [ggerganov, Oct 2, 2024]
295 changes: 189 additions & 106 deletions examples/talk-llama/llama-vocab.cpp

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions examples/talk-llama/llama-vocab.h
@@ -8,6 +8,8 @@
#include <map>
#include <set>

struct llm_tokenizer;

struct llama_vocab {
using id = llama_token;
using token = std::string;
@@ -65,7 +67,14 @@ struct llama_vocab {

std::vector<char> precompiled_charsmap;

llm_tokenizer * tokenizer = nullptr;

llama_vocab() = default;
~llama_vocab();

int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;

void init_tokenizer();
};

//
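The vocab now owns a forward-declared llm_tokenizer that is created once by init_tokenizer() and released in the destructor, instead of rebuilding tokenizer state on every call. A minimal, self-contained sketch of that ownership pattern (simplified, hypothetical types; the real construction logic lives in llama-vocab.cpp and is not shown in this diff):

```cpp
#include <string>

// Hypothetical stand-ins for the real tokenizer types in llama-vocab.cpp.
struct llm_tokenizer {                       // forward-declarable base class
    virtual ~llm_tokenizer() = default;
};
struct llm_tokenizer_spm : llm_tokenizer {}; // one concrete tokenizer flavor

struct vocab_like {
    std::string type = "spm";
    llm_tokenizer * tokenizer = nullptr;     // owned, created once

    void init_tokenizer() {
        // pick the implementation once, based on the vocab type
        if (tokenizer == nullptr && type == "spm") {
            tokenizer = new llm_tokenizer_spm();
        }
    }

    ~vocab_like() { delete tokenizer; }      // mirrors the new ~llama_vocab()
};

int main() {
    vocab_like v;
    v.init_tokenizer();  // later tokenize calls can reuse v.tokenizer concurrently
    return 0;
}
```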
399 changes: 381 additions & 18 deletions examples/talk-llama/llama.cpp

Large diffs are not rendered by default.

13 changes: 9 additions & 4 deletions examples/talk-llama/llama.h
@@ -102,6 +102,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
};

enum llama_rope_type {
@@ -192,6 +193,7 @@ extern "C" {
LLAMA_POOLING_TYPE_MEAN = 1,
LLAMA_POOLING_TYPE_CLS = 2,
LLAMA_POOLING_TYPE_LAST = 3,
LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph
};

enum llama_attention_type {
@@ -201,9 +203,9 @@ extern "C" {
};

enum llama_split_mode {
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
};

// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
@@ -871,7 +873,8 @@ extern "C" {

// Get the embeddings for a sequence id
// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
// shape: [n_embd] (1-dimensional)
// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
// otherwise: float[n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
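With the new RANK pooling type, the same accessor returns a single classification score per sequence; a hedged sketch (assumes ctx was created with pooling_type set to LLAMA_POOLING_TYPE_RANK and a batch for sequence 0 has already been decoded):

```cpp
#include <cstdio>
#include "llama.h"

// Sketch only: `ctx` must come from a context configured with
// LLAMA_POOLING_TYPE_RANK, and llama_decode() must already have run for seq 0.
static void print_rerank_score(struct llama_context * ctx) {
    const float * score = llama_get_embeddings_seq(ctx, 0);
    if (score != NULL) {
        // with RANK pooling the buffer holds a single float, not n_embd values
        printf("score for seq 0: %f\n", score[0]);
    }
}
```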

//
@@ -910,6 +913,8 @@ extern "C" {
//
// Tokenization
//
// The API is thread-safe.
//

/// @details Convert the provided text into tokens.
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
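Since the tokenizer is now initialized once per vocab, these entry points can be used from several threads at once. A sketch of the usual two-pass call pattern for llama_tokenize, assuming a loaded model and the llama_tokenize signature of this sync (the negative return value reports the required buffer size):

```cpp
#include <algorithm>
#include <string>
#include <vector>
#include "llama.h"

// Sketch: assumes `model` is a loaded llama_model.
static std::vector<llama_token> tokenize(const struct llama_model * model, const std::string & text) {
    // first pass with an empty buffer: a negative result is -(required token count)
    int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(), nullptr, 0,
                               /*add_special*/ true, /*parse_special*/ false);
    std::vector<llama_token> tokens(std::max<int32_t>(0, -n));
    // second pass fills the correctly sized buffer
    n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                       tokens.data(), (int32_t) tokens.size(), true, false);
    tokens.resize(std::max<int32_t>(0, n));
    return tokens;  // safe to call concurrently per the note above
}
```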
5 changes: 0 additions & 5 deletions ggml/include/ggml-metal.h
@@ -25,9 +25,6 @@
#include <stddef.h>
#include <stdbool.h>

// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 64

struct ggml_tensor;
struct ggml_cgraph;

@@ -48,8 +45,6 @@ GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);

GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);

GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);

GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
66 changes: 36 additions & 30 deletions ggml/include/ggml.h
@@ -229,14 +229,16 @@
#define GGML_MAX_PARAMS 2048
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 10
#ifndef GGML_MAX_NAME
#define GGML_MAX_NAME 64
#define GGML_MAX_N_THREADS 512
#define GGML_MAX_OP_PARAMS 64

#ifndef GGML_MAX_NAME
# define GGML_MAX_NAME 64
#endif
#define GGML_MAX_OP_PARAMS 64

#define GGML_DEFAULT_N_THREADS 4
#define GGML_DEFAULT_GRAPH_SIZE 2048
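Of these limits, only GGML_MAX_NAME is wrapped in #ifndef, so it can be raised at build time; a hypothetical override (note that the value affects the size of struct ggml_tensor, so the library and the application must be compiled with the same setting):

```cpp
// Hypothetical: allow longer tensor names by defining GGML_MAX_NAME before
// including ggml.h (or with -DGGML_MAX_NAME=128 on the compiler command line).
#define GGML_MAX_NAME 128
#include "ggml.h"
```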

#if UINTPTR_MAX == 0xFFFFFFFF
#define GGML_MEM_ALIGN 4
#else
@@ -259,21 +261,21 @@
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
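For reference, GGML_PAD rounds x up to the next multiple of n, where n must be a power of two; for example:

```cpp
// GGML_PAD(x, n) == ((x + n - 1) & ~(n - 1)), with n a power of two:
//   GGML_PAD(13, 8)  -> 16
//   GGML_PAD(16, 8)  -> 16
//   GGML_PAD( 1, 16) -> 16
```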

#ifndef NDEBUG
#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
# define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
#elif defined(__GNUC__)
#define GGML_UNREACHABLE() __builtin_unreachable()
# define GGML_UNREACHABLE() __builtin_unreachable()
#elif defined(_MSC_VER)
#define GGML_UNREACHABLE() __assume(0)
# define GGML_UNREACHABLE() __assume(0)
#else
#define GGML_UNREACHABLE() ((void) 0)
# define GGML_UNREACHABLE() ((void) 0)
#endif

#ifdef __cplusplus
#define GGML_NORETURN [[noreturn]]
# define GGML_NORETURN [[noreturn]]
#elif defined(_MSC_VER)
#define GGML_NORETURN __declspec(noreturn)
# define GGML_NORETURN __declspec(noreturn)
#else
#define GGML_NORETURN _Noreturn
# define GGML_NORETURN _Noreturn
#endif

#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
@@ -575,10 +577,10 @@ extern "C" {

// this tensor...
enum ggml_tensor_flag {
GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
};
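These flags are normally set through the corresponding helpers rather than written directly; a minimal sketch of a training-style graph that touches all four of them (it uses ggml_set_loss and ggml_build_backward_expand, which appear further down in this diff; buffer allocation and execution are omitted):

```cpp
#include "ggml.h"

// Sketch: a tiny one-parameter "model" that marks input, parameter, output and loss.
static void build_example_graph(struct ggml_context * ctx) {
    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);

    ggml_set_input(x);       // GGML_TENSOR_FLAG_INPUT
    ggml_set_param(ctx, w);  // GGML_TENSOR_FLAG_PARAM: gradients get allocated for w

    struct ggml_tensor * y    = ggml_mul(ctx, x, w);
    struct ggml_tensor * loss = ggml_sum(ctx, y);  // scalar

    ggml_set_output(y);      // GGML_TENSOR_FLAG_OUTPUT
    ggml_set_loss(loss);     // GGML_TENSOR_FLAG_LOSS

    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, /*grads*/ true);
    ggml_build_forward_expand(gf, loss);

    struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf);
    ggml_build_backward_expand(ctx, gf, gb, /*accumulate*/ false);
}
```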

// n-dimensional tensor
@@ -1408,14 +1410,14 @@ extern "C" {
// supports 3D: a->ne[2] == b->ne[1]
GGML_API struct ggml_tensor * ggml_get_rows(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
struct ggml_tensor * a, // data
struct ggml_tensor * b); // row indices

GGML_API struct ggml_tensor * ggml_get_rows_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c);
struct ggml_tensor * a, // gradients of ggml_get_rows result
struct ggml_tensor * b, // row indices
struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
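The added parameter comments spell out the contract: b holds int32 row indices into a, which is how embedding lookup is expressed in ggml. A small sketch:

```cpp
#include "ggml.h"

// Sketch: embedding-style lookup. embd is a [16 x 100] matrix (100 rows of 16 floats),
// ids holds 4 int32 row indices; the result has shape [16, 4].
static struct ggml_tensor * lookup_rows(struct ggml_context * ctx) {
    struct ggml_tensor * embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 16, 100);
    struct ggml_tensor * ids  = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
    return ggml_get_rows(ctx, embd, ids);
}
```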

GGML_API struct ggml_tensor * ggml_diag(
struct ggml_context * ctx,
@@ -1566,9 +1568,9 @@ extern "C" {
// a - dy
GGML_API struct ggml_tensor * ggml_rope_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
struct ggml_tensor * a, // gradients of ggml_rope result
struct ggml_tensor * b, // positions
struct ggml_tensor * c, // freq factors
int n_dims,
int mode,
int n_ctx_orig,
@@ -2034,22 +2036,23 @@ extern "C" {
// loss function

GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
struct ggml_context * ctx,
struct ggml_tensor * a, // logits
struct ggml_tensor * b); // labels

GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c);
struct ggml_context * ctx,
struct ggml_tensor * a, // logits
struct ggml_tensor * b, // labels
struct ggml_tensor * c); // gradients of cross_entropy_loss result
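For reference, the quantity being differentiated here is the usual softmax cross-entropy between a row of logits a and labels b (the exact reduction across rows is defined in the ggml source, not in this header):

$$ L(a, b) = -\sum_{i} b_i \, \log \frac{\exp(a_i)}{\sum_{j} \exp(a_j)} $$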

// AdamW optimizer step
// Paper: https://arxiv.org/pdf/1711.05101v3.pdf
// PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
GGML_API struct ggml_tensor * ggml_opt_step_adamw(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * grad,
float alpha,
float beta1,
float beta2,
@@ -2064,7 +2067,7 @@ extern "C" {
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);

GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate, bool keep);
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);

GGML_API void ggml_build_opt_adamw(
struct ggml_context * ctx,
@@ -2507,6 +2510,9 @@ extern "C" {
GGML_API int ggml_cpu_has_cann (void);
GGML_API int ggml_cpu_has_llamafile (void);

// get the sve vector length in bytes
GGML_API int ggml_cpu_get_sve_cnt(void);
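Combined with the existing ggml_cpu_has_* queries, this exposes the run-time Arm feature detection added in this sync; a small sketch (the zero-when-unavailable behaviour of the SVE count is an assumption, not stated by the header):

```cpp
#include <cstdio>
#include "ggml.h"

// Sketch: report run-time detected Arm features.
int main() {
    printf("neon:    %d\n", ggml_cpu_has_neon());
    printf("sve:     %d\n", ggml_cpu_has_sve());
    printf("sve_cnt: %d bytes\n", ggml_cpu_get_sve_cnt());  // SVE vector length in bytes
    return 0;
}
```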

//
// Internal types and functions exposed for tests and benchmarks
//