11 changes: 3 additions & 8 deletions gpt4all-backend/bert.cpp
@@ -381,10 +381,9 @@ void bert_eval(

struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);
// KQ = soft_max(KQ / sqrt(head width))
KQ = ggml_soft_max(ctx0,
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f / sqrt((float)d_head))));
KQ = ggml_soft_max(
ctx0, ggml_scale(ctx0, KQ, 1.0f / sqrt((float)d_head))
);

V = ggml_cont(ctx0, ggml_transpose(ctx0, V));
struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
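
For context, the ops in this hunk implement the scaled dot-product attention referenced by the comment above ("KQ = soft_max(KQ / sqrt(head width))"); up to ggml_mul_mat's operand-ordering and transposition conventions, this is the standard

    \mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_{\mathrm{head}}}}\right) V

where d_head is the per-head embedding width.
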
@@ -490,10 +489,6 @@ struct bert_ctx * bert_load_from_file(const char *fname)
#endif

bert_ctx * new_bert = new bert_ctx;
#if defined(GGML_USE_KOMPUTE)
new_bert->buf_compute.force_cpu = true;
new_bert->work_buf.force_cpu = true;
#endif

bert_model & model = new_bert->model;
bert_vocab & vocab = new_bert->vocab;
6 changes: 1 addition & 5 deletions gpt4all-backend/gptj.cpp
@@ -414,11 +414,7 @@ bool gptj_eval(
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

// KQ_scaled = KQ / sqrt(n_embd/n_head)
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
);
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));

// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
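
This hunk and the bert.cpp hunk above track the same upstream llama.cpp change: ggml_scale now takes the scale factor as a plain float rather than a one-element tensor built with ggml_new_f32. A minimal before/after sketch, assuming the post-change signature ggml_scale(ctx, tensor, float):

    #include <cmath>
    #include <ggml.h>

    // Scale raw attention scores by 1/sqrt(d_head).
    static ggml_tensor * scale_scores(ggml_context * ctx, ggml_tensor * KQ, int d_head) {
        // Old API (removed in this PR): the factor had to be wrapped in a 1-element tensor:
        //   KQ = ggml_scale(ctx, KQ, ggml_new_f32(ctx, 1.0f / sqrtf((float) d_head)));
        // New API: the factor is passed directly.
        return ggml_scale(ctx, KQ, 1.0f / sqrtf((float) d_head));
    }
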
2 changes: 1 addition & 1 deletion gpt4all-backend/llama.cpp-mainline
3 changes: 1 addition & 2 deletions gpt4all-backend/llama.cpp.cmake
@@ -175,6 +175,7 @@ if (LLAMA_KOMPUTE)
DEPENDS ${LLAMA_DIR}/${source}
${LLAMA_DIR}/kompute-shaders/common.comp
${LLAMA_DIR}/kompute-shaders/op_getrows.comp
${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n.comp
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/${source}
COMMENT "Compiling ${source} to ${source}.spv"
@@ -231,7 +232,6 @@ if (LLAMA_KOMPUTE)
kompute-shaders/op_add.comp
kompute-shaders/op_addrow.comp
kompute-shaders/op_mul.comp
kompute-shaders/op_mulrow.comp
kompute-shaders/op_silu.comp
kompute-shaders/op_relu.comp
kompute-shaders/op_gelu.comp
@@ -264,7 +264,6 @@ if (LLAMA_KOMPUTE)
shaderop_add.h
shaderop_addrow.h
shaderop_mul.h
shaderop_mulrow.h
shaderop_silu.h
shaderop_relu.h
shaderop_gelu.h
106 changes: 53 additions & 53 deletions gpt4all-backend/llamamodel.cpp
@@ -96,6 +96,7 @@ static int llama_sample_top_p_top_k(
struct LLamaPrivate {
const std::string modelPath;
bool modelLoaded;
int device = -1;
llama_model *model = nullptr;
llama_context *ctx = nullptr;
llama_model_params model_params;
@@ -167,24 +168,17 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
if (llama_verbose()) {
std::cerr << "llama.cpp: using Metal" << std::endl;
}
// metal always runs the whole model if n_gpu_layers is not 0, at least
// currently
d_ptr->model_params.n_gpu_layers = 1;
#endif
#ifdef GGML_USE_KOMPUTE
if (ggml_vk_has_device()) {
// vulkan always runs the whole model if n_gpu_layers is not 0, at least
// currently
d_ptr->model_params.n_gpu_layers = 1;
d_ptr->model_params.n_gpu_layers = 100;
#elif defined(GGML_USE_KOMPUTE)
if (d_ptr->device != -1) {
d_ptr->model_params.main_gpu = d_ptr->device;
d_ptr->model_params.n_gpu_layers = 100;
}
#endif

d_ptr->model = llama_load_model_from_file_gpt4all(modelPath.c_str(), &d_ptr->model_params);
if (!d_ptr->model) {
#ifdef GGML_USE_KOMPUTE
// Explicitly free the device so next load it doesn't use it
ggml_vk_free_device();
#endif
d_ptr->device = -1;
std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
return false;
}
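
Pieced together from the added lines in this hunk, the post-change device setup in loadModel reads roughly as follows (a sketch; unrelated context is elided):

    #ifdef GGML_USE_METAL
        if (llama_verbose()) {
            std::cerr << "llama.cpp: using Metal" << std::endl;
        }
        d_ptr->model_params.n_gpu_layers = 100;
    #elif defined(GGML_USE_KOMPUTE)
        if (d_ptr->device != -1) {
            d_ptr->model_params.main_gpu = d_ptr->device;
            d_ptr->model_params.n_gpu_layers = 100;
        }
    #endif

        d_ptr->model = llama_load_model_from_file_gpt4all(modelPath.c_str(), &d_ptr->model_params);
        if (!d_ptr->model) {
            d_ptr->device = -1;  // forget the recorded device so the next load starts clean
            std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
            return false;
        }
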
@@ -214,18 +208,15 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)

d_ptr->ctx = llama_new_context_with_model(d_ptr->model, d_ptr->ctx_params);
if (!d_ptr->ctx) {
#ifdef GGML_USE_KOMPUTE
// Explicitly free the device so next load it doesn't use it
ggml_vk_free_device();
#endif
d_ptr->device = -1;
std::cerr << "LLAMA ERROR: failed to init context for model " << modelPath << std::endl;
return false;
}

d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};

#ifdef GGML_USE_KOMPUTE
if (ggml_vk_has_device()) {
if (usingGPUDevice() && ggml_vk_has_device()) {
std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
}
#endif
@@ -339,70 +330,78 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired)
{
#if defined(GGML_USE_KOMPUTE)
std::vector<ggml_vk_device> vkDevices = ggml_vk_available_devices(memoryRequired);

std::vector<LLModel::GPUDevice> devices;
for(const auto& vkDevice : vkDevices) {
LLModel::GPUDevice device;
device.index = vkDevice.index;
device.type = vkDevice.type;
device.heapSize = vkDevice.heapSize;
device.name = vkDevice.name;
device.vendor = vkDevice.vendor;

devices.push_back(device);
}
size_t count = 0;
auto * vkDevices = ggml_vk_available_devices(memoryRequired, &count);

if (vkDevices) {
std::vector<LLModel::GPUDevice> devices;
devices.reserve(count);

for (size_t i = 0; i < count; ++i) {
auto & dev = vkDevices[i];
devices.emplace_back(
/* index = */ dev.index,
/* type = */ dev.type,
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ dev.vendor
);
}

return devices;
#else
return std::vector<LLModel::GPUDevice>();
free(vkDevices);
return devices;
}
#endif

return {};
}
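
A caller-side sketch of the reworked enumeration (the GPUDevice fields match the declaration in llmodel.h below; the helper, the model object, and the 4 GiB figure are illustrative assumptions):

    #include <cstddef>
    #include <iostream>
    #include "llamamodel_impl.h"

    // Hypothetical helper: print every device that could hold ~4 GiB of weights.
    void list_devices(LLamaModel &model) {
        const size_t required = 4ull * 1024 * 1024 * 1024;  // assumed memory requirement
        for (const LLModel::GPUDevice &dev : model.availableGPUDevices(required)) {
            std::cout << dev.index << ": " << dev.vendor << " " << dev.name
                      << " (" << dev.heapSize / (1024 * 1024) << " MiB heap)\n";
        }
    }
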

bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string& device)
bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name)
{
#if defined(GGML_USE_KOMPUTE)
return ggml_vk_init_device(memoryRequired, device);
ggml_vk_device device;
bool ok = ggml_vk_get_device(&device, memoryRequired, name.c_str());
if (ok) {
d_ptr->device = device.index;
return true;
}
#else
return false;
(void)memoryRequired;
(void)name;
#endif
return false;
}

bool LLamaModel::initializeGPUDevice(const LLModel::GPUDevice &device, std::string *unavail_reason)
{
bool result = false;
#if defined(GGML_USE_KOMPUTE)
ggml_vk_device vkDevice;
vkDevice.index = device.index;
vkDevice.type = device.type;
vkDevice.heapSize = device.heapSize;
vkDevice.name = device.name;
vkDevice.vendor = device.vendor;
result = ggml_vk_init_device(vkDevice);
if (!result && unavail_reason) {
*unavail_reason = "failed to init GPU";
}
(void)unavail_reason;
d_ptr->device = device.index;
return true;
#else
(void)device;
if (unavail_reason) {
*unavail_reason = "built without Kompute";
}
return false;
#endif
return result;
}

bool LLamaModel::initializeGPUDevice(int device)
{
#if defined(GGML_USE_KOMPUTE)
return ggml_vk_init_device(device);
d_ptr->device = device;
return true;
#else
(void)device;
return false;
#endif
}

bool LLamaModel::hasGPUDevice()
{
#if defined(GGML_USE_KOMPUTE)
return ggml_vk_has_device();
return d_ptr->device != -1;
#else
return false;
#endif
@@ -411,11 +410,12 @@ bool LLamaModel::hasGPUDevice()
bool LLamaModel::usingGPUDevice()
{
#if defined(GGML_USE_KOMPUTE)
return ggml_vk_using_vulkan();
return hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
#elif defined(GGML_USE_METAL)
return true;
#endif
#else
return false;
#endif
}
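
Taken together, device selection is now pure bookkeeping until loadModel() runs: initializeGPUDevice() only records an index in LLamaPrivate::device, and that index is turned into main_gpu/n_gpu_layers at load time. A usage sketch under that assumption (the model path, context size, and zero memory requirement are placeholders):

    #include <iostream>
    #include <string>
    #include "llamamodel_impl.h"

    // Hypothetical flow: prefer the first reported GPU, otherwise stay on CPU.
    bool load_preferring_gpu(LLamaModel &model, const std::string &path) {
        auto devices = model.availableGPUDevices(/*memoryRequired=*/0);   // 0 assumed to mean "no minimum"
        if (!devices.empty())
            model.initializeGPUDevice(devices.front().index);             // records the index; no GPU work yet
        if (!model.loadModel(path, /*n_ctx=*/2048))                       // offload (if any) happens here
            return false;
        if (!model.usingGPUDevice())
            std::cerr << "llama.cpp fell back to CPU" << std::endl;
        return true;
    }
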

std::string get_arch_name(gguf_context *ctx_gguf) {
2 changes: 1 addition & 1 deletion gpt4all-backend/llamamodel_impl.h
@@ -26,7 +26,7 @@ class LLamaModel : public LLModel {
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) override;
bool initializeGPUDevice(size_t memoryRequired, const std::string& device) override;
bool initializeGPUDevice(size_t memoryRequired, const std::string& name) override;
bool initializeGPUDevice(const GPUDevice &device, std::string *unavail_reason) override;
bool initializeGPUDevice(int device) override;
bool hasGPUDevice() override;
26 changes: 20 additions & 6 deletions gpt4all-backend/llmodel.h
@@ -17,11 +17,14 @@ class LLModel {
using Token = int32_t;

struct GPUDevice {
int index = 0;
int type = 0;
size_t heapSize = 0;
int index;
int type;
size_t heapSize;
std::string name;
std::string vendor;

GPUDevice(int index, int type, size_t heapSize, std::string name, std::string vendor):
index(index), type(type), heapSize(heapSize), name(std::move(name)), vendor(std::move(vendor)) {}
};
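
With the in-class defaults replaced by a constructor, a GPUDevice is now built in one expression instead of being default-constructed and filled field by field (see the llmodel_c.cpp hunk below). The values here are illustrative only:

    #include "llmodel.h"

    LLModel::GPUDevice dev(/*index*/ 0, /*type*/ 2, /*heapSize*/ 8ull << 30,
                           /*name*/ "Example GPU", /*vendor*/ "Example Vendor");
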

class Implementation {
@@ -98,14 +101,25 @@ class LLModel {
return *m_implementation;
}

virtual std::vector<GPUDevice> availableGPUDevices(size_t /*memoryRequired*/) { return std::vector<GPUDevice>(); }
virtual bool initializeGPUDevice(size_t /*memoryRequired*/, const std::string& /*device*/) { return false; }
virtual bool initializeGPUDevice(const GPUDevice &/*device*/, std::string *unavail_reason = nullptr) {
virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) {
(void)memoryRequired;
return {};
}

virtual bool initializeGPUDevice(size_t memoryRequired, const std::string& name) {
(void)memoryRequired;
(void)name;
return false;
}

virtual bool initializeGPUDevice(const GPUDevice & device, std::string *unavail_reason = nullptr) {
(void)device;
if (unavail_reason) {
*unavail_reason = "model has no GPU support";
}
return false;
}

virtual bool initializeGPUDevice(int /*device*/) { return false; }
virtual bool hasGPUDevice() { return false; }
virtual bool usingGPUDevice() { return false; }
13 changes: 7 additions & 6 deletions gpt4all-backend/llmodel_c.cpp
@@ -230,12 +230,13 @@ bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryReq

bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device)
{
LLModel::GPUDevice d;
d.index = device->index;
d.type = device->type;
d.heapSize = device->heapSize;
d.name = device->name;
d.vendor = device->vendor;
LLModel::GPUDevice d(
/* index = */ device->index,
/* type = */ device->type,
/* heapSize = */ device->heapSize,
/* name = */ device->name,
/* vendor = */ device->vendor
);
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
return wrapper->llModel->initializeGPUDevice(d);
}
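
A sketch of the corresponding C-API call, assuming the llmodel_model handle was created elsewhere and that llmodel_gpu_device's name/vendor fields are C strings (as the std::string conversion above suggests); all device values are illustrative:

    #include "llmodel_c.h"

    void pick_gpu(llmodel_model model) {
        llmodel_gpu_device dev;
        dev.index = 0;                   // illustrative
        dev.type = 2;                    // illustrative
        dev.heapSize = 8ull << 30;       // illustrative: 8 GiB
        dev.name = "Example GPU";        // illustrative
        dev.vendor = "Example Vendor";   // illustrative
        if (!llmodel_gpu_init_gpu_device_by_struct(model, &dev)) {
            // device not usable; fall back to CPU
        }
    }
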
45 changes: 0 additions & 45 deletions gpt4all-backend/llmodel_shared.h
@@ -4,50 +4,6 @@
#include <vector>
#include <ggml.h>

#if defined(GGML_USE_KOMPUTE)
#include "ggml-kompute.h"
struct llm_buffer {
uint8_t * addr = NULL;
size_t size = 0;
ggml_vk_memory memory;
bool force_cpu = false;

llm_buffer() = default;

void resize(size_t size) {
free();

if (!ggml_vk_has_device() || force_cpu) {
this->addr = new uint8_t[size];
this->size = size;
} else {
this->memory = ggml_vk_allocate(size);
this->addr = (uint8_t*)memory.data;
this->size = size;
}
}

void free() {
if (!memory.primaryMemory) {
delete[] addr;
} else if (memory.data) {
ggml_vk_free_memory(memory);
}
this->addr = NULL;
this->size = 0;
}

~llm_buffer() {
free();
}

// disable copy and move
llm_buffer(const llm_buffer&) = delete;
llm_buffer(llm_buffer&&) = delete;
llm_buffer& operator=(const llm_buffer&) = delete;
llm_buffer& operator=(llm_buffer&&) = delete;
};
#else
struct llm_buffer {
uint8_t * addr = NULL;
size_t size = 0;
@@ -62,7 +18,6 @@ struct llm_buffer {
delete[] addr;
}
};
#endif

struct llm_kv_cache {
struct ggml_tensor * k;