Commit e022a5e

backend: update to latest commit of llama.cpp Vulkan PR
1 parent 0a45dd3

File tree

9 files changed: 87 additions, 127 deletions


gpt4all-backend/bert.cpp

Lines changed: 3 additions & 8 deletions
@@ -381,10 +381,9 @@ void bert_eval(
 
     struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);
     // KQ = soft_max(KQ / sqrt(head width))
-    KQ = ggml_soft_max(ctx0,
-                       ggml_scale(ctx0,
-                                  KQ,
-                                  ggml_new_f32(ctx0, 1.0f / sqrt((float)d_head))));
+    KQ = ggml_soft_max(
+        ctx0, ggml_scale(ctx0, KQ, 1.0f / sqrt((float)d_head))
+    );
 
     V = ggml_cont(ctx0, ggml_transpose(ctx0, V));
     struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);

@@ -490,10 +489,6 @@ struct bert_ctx * bert_load_from_file(const char *fname)
 #endif
 
     bert_ctx * new_bert = new bert_ctx;
-#if defined(GGML_USE_KOMPUTE)
-    new_bert->buf_compute.force_cpu = true;
-    new_bert->work_buf.force_cpu = true;
-#endif
 
     bert_model & model = new_bert->model;
     bert_vocab & vocab = new_bert->vocab;
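
This hunk and the gptj.cpp hunk below both scale the raw K*Q attention scores by 1/sqrt(head width) before the softmax; the only change is that the factor is now handed to ggml_scale as a plain float instead of a one-element tensor built with ggml_new_f32. As a standalone illustration of the math rather than the ggml API, here is a minimal C++ sketch of scaled softmax over one row of scores; the row values and d_head are invented for the example:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Scaled softmax over one row of attention scores: softmax(kq / sqrt(d_head)).
// Mirrors the "KQ = soft_max(KQ / sqrt(head width))" step above in plain C++
// rather than as ggml graph ops.
static std::vector<float> scaled_softmax(std::vector<float> kq, int d_head) {
    const float scale = 1.0f / std::sqrt((float)d_head);
    float maxv = -INFINITY;
    for (float &x : kq) {
        x *= scale;
        maxv = std::max(maxv, x);
    }
    float sum = 0.0f;
    for (float &x : kq) {
        x = std::exp(x - maxv);   // subtract the max for numerical stability
        sum += x;
    }
    for (float &x : kq) x /= sum;
    return kq;
}

int main() {
    std::vector<float> row = {2.0f, 8.0f, -1.0f, 0.5f};   // example K*Q scores
    for (float p : scaled_softmax(row, /* d_head = */ 64))
        std::printf("%.4f ", p);
    std::printf("\n");
}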

gpt4all-backend/gptj.cpp

Lines changed: 1 addition & 5 deletions
@@ -414,11 +414,7 @@ bool gptj_eval(
     struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
     // KQ_scaled = KQ / sqrt(n_embd/n_head)
-    struct ggml_tensor * KQ_scaled =
-        ggml_scale(ctx0,
-                   KQ,
-                   ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                   );
+    struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));
 
     // KQ_masked = mask_past(KQ_scaled)
     struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);

gpt4all-backend/llama.cpp-mainline

Submodule llama.cpp-mainline updated 239 files

gpt4all-backend/llama.cpp.cmake

Lines changed: 1 addition & 2 deletions
@@ -175,6 +175,7 @@ if (LLAMA_KOMPUTE)
             DEPENDS ${LLAMA_DIR}/${source}
                 ${LLAMA_DIR}/kompute-shaders/common.comp
                 ${LLAMA_DIR}/kompute-shaders/op_getrows.comp
+                ${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
                 ${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n.comp
             COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/${source}
             COMMENT "Compiling ${source} to ${source}.spv"

@@ -231,7 +232,6 @@ if (LLAMA_KOMPUTE)
         kompute-shaders/op_add.comp
         kompute-shaders/op_addrow.comp
         kompute-shaders/op_mul.comp
-        kompute-shaders/op_mulrow.comp
         kompute-shaders/op_silu.comp
         kompute-shaders/op_relu.comp
         kompute-shaders/op_gelu.comp

@@ -264,7 +264,6 @@ if (LLAMA_KOMPUTE)
         shaderop_add.h
         shaderop_addrow.h
         shaderop_mul.h
-        shaderop_mulrow.h
         shaderop_silu.h
         shaderop_relu.h
         shaderop_gelu.h

gpt4all-backend/llamamodel.cpp

Lines changed: 53 additions & 53 deletions
@@ -96,6 +96,7 @@ static int llama_sample_top_p_top_k(
 struct LLamaPrivate {
     const std::string modelPath;
     bool modelLoaded;
+    int device = -1;
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
     llama_model_params model_params;

@@ -167,24 +168,17 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
     if (llama_verbose()) {
         std::cerr << "llama.cpp: using Metal" << std::endl;
     }
-    // metal always runs the whole model if n_gpu_layers is not 0, at least
-    // currently
-    d_ptr->model_params.n_gpu_layers = 1;
-#endif
-#ifdef GGML_USE_KOMPUTE
-    if (ggml_vk_has_device()) {
-        // vulkan always runs the whole model if n_gpu_layers is not 0, at least
-        // currently
-        d_ptr->model_params.n_gpu_layers = 1;
+    d_ptr->model_params.n_gpu_layers = 100;
+#elif defined(GGML_USE_KOMPUTE)
+    if (d_ptr->device != -1) {
+        d_ptr->model_params.main_gpu = d_ptr->device;
+        d_ptr->model_params.n_gpu_layers = 100;
     }
 #endif
 
     d_ptr->model = llama_load_model_from_file_gpt4all(modelPath.c_str(), &d_ptr->model_params);
     if (!d_ptr->model) {
-#ifdef GGML_USE_KOMPUTE
-        // Explicitly free the device so next load it doesn't use it
-        ggml_vk_free_device();
-#endif
+        d_ptr->device = -1;
         std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
         return false;
     }

@@ -214,18 +208,15 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
 
     d_ptr->ctx = llama_new_context_with_model(d_ptr->model, d_ptr->ctx_params);
     if (!d_ptr->ctx) {
-#ifdef GGML_USE_KOMPUTE
-        // Explicitly free the device so next load it doesn't use it
-        ggml_vk_free_device();
-#endif
+        d_ptr->device = -1;
         std::cerr << "LLAMA ERROR: failed to init context for model " << modelPath << std::endl;
         return false;
     }
 
     d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};
 
 #ifdef GGML_USE_KOMPUTE
-    if (ggml_vk_has_device()) {
+    if (usingGPUDevice() && ggml_vk_has_device()) {
         std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
     }
 #endif

@@ -339,70 +330,78 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
 std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired)
 {
 #if defined(GGML_USE_KOMPUTE)
-    std::vector<ggml_vk_device> vkDevices = ggml_vk_available_devices(memoryRequired);
-
-    std::vector<LLModel::GPUDevice> devices;
-    for(const auto& vkDevice : vkDevices) {
-        LLModel::GPUDevice device;
-        device.index = vkDevice.index;
-        device.type = vkDevice.type;
-        device.heapSize = vkDevice.heapSize;
-        device.name = vkDevice.name;
-        device.vendor = vkDevice.vendor;
-
-        devices.push_back(device);
-    }
+    size_t count = 0;
+    auto * vkDevices = ggml_vk_available_devices(memoryRequired, &count);
+
+    if (vkDevices) {
+        std::vector<LLModel::GPUDevice> devices;
+        devices.reserve(count);
+
+        for (size_t i = 0; i < count; ++i) {
+            auto & dev = vkDevices[i];
+            devices.emplace_back(
+                /* index    = */ dev.index,
+                /* type     = */ dev.type,
+                /* heapSize = */ dev.heapSize,
+                /* name     = */ dev.name,
+                /* vendor   = */ dev.vendor
+            );
+        }
 
-    return devices;
-#else
-    return std::vector<LLModel::GPUDevice>();
+        free(vkDevices);
+        return devices;
+    }
 #endif
+
+    return {};
 }
 
-bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string& device)
+bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name)
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_init_device(memoryRequired, device);
+    ggml_vk_device device;
+    bool ok = ggml_vk_get_device(&device, memoryRequired, name.c_str());
+    if (ok) {
+        d_ptr->device = device.index;
+        return true;
+    }
 #else
-    return false;
+    (void)memoryRequired;
+    (void)name;
 #endif
+    return false;
 }
 
 bool LLamaModel::initializeGPUDevice(const LLModel::GPUDevice &device, std::string *unavail_reason)
 {
-    bool result = false;
 #if defined(GGML_USE_KOMPUTE)
-    ggml_vk_device vkDevice;
-    vkDevice.index = device.index;
-    vkDevice.type = device.type;
-    vkDevice.heapSize = device.heapSize;
-    vkDevice.name = device.name;
-    vkDevice.vendor = device.vendor;
-    result = ggml_vk_init_device(vkDevice);
-    if (!result && unavail_reason) {
-        *unavail_reason = "failed to init GPU";
-    }
+    (void)unavail_reason;
+    d_ptr->device = device.index;
+    return true;
 #else
+    (void)device;
     if (unavail_reason) {
         *unavail_reason = "built without Kompute";
     }
+    return false;
 #endif
-    return result;
 }
 
 bool LLamaModel::initializeGPUDevice(int device)
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_init_device(device);
+    d_ptr->device = device;
+    return true;
 #else
+    (void)device;
     return false;
 #endif
 }
 
 bool LLamaModel::hasGPUDevice()
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_has_device();
+    return d_ptr->device != -1;
 #else
     return false;
 #endif

@@ -411,11 +410,12 @@ bool LLamaModel::hasGPUDevice()
 bool LLamaModel::usingGPUDevice()
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_using_vulkan();
+    return hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
 #elif defined(GGML_USE_METAL)
     return true;
-#endif
+#else
     return false;
+#endif
 }
 
 std::string get_arch_name(gguf_context *ctx_gguf) {
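
The availableGPUDevices() rewrite consumes the new ggml_vk_available_devices(memoryRequired, &count) interface: an array plus an out-parameter count, copied into a std::vector and then released with free(), so the caller owns the returned storage. A minimal self-contained sketch of that consumption pattern, with a hypothetical list_devices() producer standing in for the real Vulkan enumeration:

#include <cstdlib>
#include <string>
#include <vector>

// Hypothetical C-style device record, standing in for ggml_vk_device.
struct vk_device_info {
    int index;
    const char *name;
};

// Hypothetical producer: returns a malloc'd array and writes its length to *count,
// mimicking the ggml_vk_available_devices(memoryRequired, &count) contract above.
static vk_device_info *list_devices(size_t *count) {
    *count = 2;
    auto *arr = (vk_device_info *)malloc(*count * sizeof(vk_device_info));
    arr[0] = {0, "Integrated GPU"};
    arr[1] = {1, "Discrete GPU"};
    return arr;
}

struct Device {
    int index;
    std::string name;
};

int main() {
    size_t count = 0;
    vk_device_info *raw = list_devices(&count);

    std::vector<Device> devices;
    if (raw) {
        devices.reserve(count);             // length is known up front
        for (size_t i = 0; i < count; ++i)
            devices.push_back({raw[i].index, raw[i].name});
        free(raw);                          // caller frees, as in availableGPUDevices()
    }
    return (int)devices.size();
}

Note that free() (not delete[]) pairs with the C-side allocation, which is why the vector copy happens before the raw array is released.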

gpt4all-backend/llamamodel_impl.h

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ class LLamaModel : public LLModel {
     void setThreadCount(int32_t n_threads) override;
     int32_t threadCount() const override;
     std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) override;
-    bool initializeGPUDevice(size_t memoryRequired, const std::string& device) override;
+    bool initializeGPUDevice(size_t memoryRequired, const std::string& name) override;
     bool initializeGPUDevice(const GPUDevice &device, std::string *unavail_reason) override;
     bool initializeGPUDevice(int device) override;
     bool hasGPUDevice() override;

gpt4all-backend/llmodel.h

Lines changed: 20 additions & 6 deletions
@@ -17,11 +17,14 @@ class LLModel {
     using Token = int32_t;
 
     struct GPUDevice {
-        int index = 0;
-        int type = 0;
-        size_t heapSize = 0;
+        int index;
+        int type;
+        size_t heapSize;
         std::string name;
         std::string vendor;
+
+        GPUDevice(int index, int type, size_t heapSize, std::string name, std::string vendor):
+            index(index), type(type), heapSize(heapSize), name(std::move(name)), vendor(std::move(vendor)) {}
     };
 
     class Implementation {

@@ -98,14 +101,25 @@ class LLModel {
         return *m_implementation;
     }
 
-    virtual std::vector<GPUDevice> availableGPUDevices(size_t /*memoryRequired*/) { return std::vector<GPUDevice>(); }
-    virtual bool initializeGPUDevice(size_t /*memoryRequired*/, const std::string& /*device*/) { return false; }
-    virtual bool initializeGPUDevice(const GPUDevice &/*device*/, std::string *unavail_reason = nullptr) {
+    virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) {
+        (void)memoryRequired;
+        return {};
+    }
+
+    virtual bool initializeGPUDevice(size_t memoryRequired, const std::string& name) {
+        (void)memoryRequired;
+        (void)name;
+        return false;
+    }
+
+    virtual bool initializeGPUDevice(const GPUDevice & device, std::string *unavail_reason = nullptr) {
+        (void)device;
         if (unavail_reason) {
             *unavail_reason = "model has no GPU support";
         }
         return false;
     }
+
     virtual bool initializeGPUDevice(int /*device*/) { return false; }
     virtual bool hasGPUDevice() { return false; }
     virtual bool usingGPUDevice() { return false; }
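
With the explicit constructor, a GPUDevice can be built in place (for example via emplace_back, as the llamamodel.cpp hunk above now does) instead of being default-constructed and then assigned field by field. A minimal sketch of that usage against a local copy of the struct; the device values are invented for the example:

#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// Local copy of the GPUDevice shape from llmodel.h, for illustration only.
struct GPUDevice {
    int index;
    int type;
    size_t heapSize;
    std::string name;
    std::string vendor;

    GPUDevice(int index, int type, size_t heapSize, std::string name, std::string vendor):
        index(index), type(type), heapSize(heapSize),
        name(std::move(name)), vendor(std::move(vendor)) {}
};

int main() {
    std::vector<GPUDevice> devices;
    devices.reserve(2);

    // emplace_back forwards straight to the constructor -- no default
    // construction followed by field-by-field assignment.
    devices.emplace_back(/* index = */ 0, /* type = */ 2,
                         /* heapSize = */ size_t(8) << 30,
                         /* name = */ "Example GPU", /* vendor = */ "ExampleVendor");
    devices.emplace_back(1, 1, size_t(2) << 30, "Another GPU", "ExampleVendor");

    return (int)devices.size();
}

Declaring the constructor also suppresses the implicit default constructor, so callers can no longer create a half-initialized GPUDevice and patch its fields afterwards.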

gpt4all-backend/llmodel_c.cpp

Lines changed: 7 additions & 6 deletions
@@ -230,12 +230,13 @@ bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryReq
 
 bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device)
 {
-    LLModel::GPUDevice d;
-    d.index = device->index;
-    d.type = device->type;
-    d.heapSize = device->heapSize;
-    d.name = device->name;
-    d.vendor = device->vendor;
+    LLModel::GPUDevice d(
+        /* index    = */ device->index,
+        /* type     = */ device->type,
+        /* heapSize = */ device->heapSize,
+        /* name     = */ device->name,
+        /* vendor   = */ device->vendor
+    );
     LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
     return wrapper->llModel->initializeGPUDevice(d);
 }
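
llmodel_c.cpp is the C binding layer: llmodel_model is an opaque handle that gets reinterpret_cast back to the C++ LLModelWrapper before the call is forwarded. A minimal self-contained sketch of that opaque-handle pattern, using invented names (counter_*) rather than the real llmodel_* API:

#include <cstdio>

// C-visible opaque handle type; in a real binding this lives in an
// extern "C" header so C callers only ever see the pointer.
typedef void *counter_handle;

// The C++ object hidden behind the handle.
struct Counter {
    int value = 0;
    void add(int n) { value += n; }
};

// C-style wrapper functions: allocate, use, and destroy the C++ object through
// the opaque handle, mirroring the reinterpret_cast of llmodel_model above.
counter_handle counter_create() {
    return reinterpret_cast<counter_handle>(new Counter);
}

void counter_add(counter_handle h, int n) {
    reinterpret_cast<Counter *>(h)->add(n);
}

int counter_value(counter_handle h) {
    return reinterpret_cast<Counter *>(h)->value;
}

void counter_destroy(counter_handle h) {
    delete reinterpret_cast<Counter *>(h);
}

int main() {
    counter_handle h = counter_create();
    counter_add(h, 5);
    std::printf("%d\n", counter_value(h));
    counter_destroy(h);
}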

gpt4all-backend/llmodel_shared.h

Lines changed: 0 additions & 45 deletions
@@ -4,50 +4,6 @@
 #include <vector>
 #include <ggml.h>
 
-#if defined(GGML_USE_KOMPUTE)
-#include "ggml-kompute.h"
-struct llm_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-    ggml_vk_memory memory;
-    bool force_cpu = false;
-
-    llm_buffer() = default;
-
-    void resize(size_t size) {
-        free();
-
-        if (!ggml_vk_has_device() || force_cpu) {
-            this->addr = new uint8_t[size];
-            this->size = size;
-        } else {
-            this->memory = ggml_vk_allocate(size);
-            this->addr = (uint8_t*)memory.data;
-            this->size = size;
-        }
-    }
-
-    void free() {
-        if (!memory.primaryMemory) {
-            delete[] addr;
-        } else if (memory.data) {
-            ggml_vk_free_memory(memory);
-        }
-        this->addr = NULL;
-        this->size = 0;
-    }
-
-    ~llm_buffer() {
-        free();
-    }
-
-    // disable copy and move
-    llm_buffer(const llm_buffer&) = delete;
-    llm_buffer(llm_buffer&&) = delete;
-    llm_buffer& operator=(const llm_buffer&) = delete;
-    llm_buffer& operator=(llm_buffer&&) = delete;
-};
-#else
 struct llm_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;

@@ -62,7 +18,6 @@ struct llm_buffer {
         delete[] addr;
    }
 };
-#endif
 
 struct llm_kv_cache {
     struct ggml_tensor * k;