From 8db7d25efd85be16ad80e2efac4db62deeacf2d6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 16 Jun 2024 18:38:46 +0300 Subject: [PATCH 1/8] scripts : stop sync whisper example from ggml --- scripts/sync-ggml-am.sh | 19 ------------------- scripts/sync-ggml.sh | 7 ------- 2 files changed, 26 deletions(-) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index 8dcf067319e..4614b45919a 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -65,12 +65,6 @@ while read c; do examples/common.cpp \ examples/common-ggml.h \ examples/common-ggml.cpp \ - examples/whisper/grammar-parser.h \ - examples/whisper/grammar-parser.cpp \ - examples/whisper/whisper.h \ - examples/whisper/whisper.cpp \ - examples/whisper/main.cpp \ - examples/whisper/quantize.cpp \ LICENSE \ scripts/gen-authors.sh \ >> $SRC_WHISPER/ggml-src.patch @@ -128,13 +122,6 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then # examples/common.cpp -> examples/common.cpp # examples/common-ggml.h -> examples/common-ggml.h # examples/common-ggml.cpp -> examples/common-ggml.cpp - # examples/whisper/grammar-parser.h -> examples/grammar-parser.h - # examples/whisper/grammar-parser.cpp -> examples/grammar-parser.cpp - # - # examples/whisper/whisper.h -> whisper.h - # examples/whisper/whisper.cpp -> whisper.cpp - # examples/whisper/main.cpp -> examples/main/main.cpp - # examples/whisper/quantize.cpp -> examples/quantize/quantize.cpp # # LICENSE -> LICENSE # ggml/scripts/gen-authors.sh -> scripts/gen-authors.sh @@ -169,12 +156,6 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then -e 's/examples\/common\.cpp/examples\/common.cpp/g' \ -e 's/examples\/common-ggml\.h/examples\/common-ggml.h/g' \ -e 's/examples\/common-ggml\.cpp/examples\/common-ggml.cpp/g' \ - -e 's/examples\/whisper\/grammar-parser\.h/examples\/grammar-parser.h/g' \ - -e 's/examples\/whisper\/grammar-parser\.cpp/examples\/grammar-parser.cpp/g' \ - -e 's/examples\/whisper\/whisper\.h/whisper.h/g' \ - -e 's/examples\/whisper\/whisper\.cpp/whisper.cpp/g' \ - -e 's/examples\/whisper\/main\.cpp/examples\/main\/main.cpp/g' \ - -e 's/examples\/whisper\/quantize\.cpp/examples\/quantize\/quantize.cpp/g' \ -e 's/LICENSE/LICENSE/g' \ -e 's/ggml\/scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \ > ggml-src.patch.tmp diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index 18187b059df..c718a07bb61 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -32,13 +32,6 @@ cp -rpv ../ggml/examples/common.h ./examples/common.h cp -rpv ../ggml/examples/common.cpp ./examples/common.cpp cp -rpv ../ggml/examples/common-ggml.h ./examples/common-ggml.h cp -rpv ../ggml/examples/common-ggml.cpp ./examples/common-ggml.cpp -cp -rpv ../ggml/examples/whisper/grammar-parser.h ./examples/grammar-parser.h -cp -rpv ../ggml/examples/whisper/grammar-parser.cpp ./examples/grammar-parser.cpp - -cp -rpv ../ggml/examples/whisper/whisper.h ./whisper.h -cp -rpv ../ggml/examples/whisper/whisper.cpp ./whisper.cpp -cp -rpv ../ggml/examples/whisper/main.cpp ./examples/main/main.cpp -cp -rpv ../ggml/examples/whisper/quantize.cpp ./examples/quantize/quantize.cpp cp -rpv ../LICENSE ./LICENSE cp -rpv ../ggml/scripts/gen-authors.sh ./scripts/gen-authors.sh From f4b829af2fa1164a9866f8508012df0510f08ef1 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 16 Jun 2024 07:17:31 +0200 Subject: [PATCH 2/8] Vulkan Shader Refactor, Memory Debugging Option (llama/7947) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add 
memory debug output option

* Fix flake8

* Fix unnecessary high llama-3 VRAM use
---
 ggml-vulkan.cpp | 612 ++++++++++++++++++++----------------------------
 1 file changed, 259 insertions(+), 353 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index e2d17a3523a..f389934ead3 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -8,6 +8,7 @@
 #include <algorithm>
 #include <cmath>
+#include <iomanip>
 #include <iostream>
 #include <tuple>
 #include <vector>
@@ -57,6 +58,12 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
     } \
 } while (0)
 
+#ifdef GGML_VULKAN_DEBUG
+#define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
+#else
+#define VK_LOG_DEBUG(msg) ((void) 0)
+#endif // GGML_VULKAN_DEBUG
+
 struct ggml_backend_vk_context;
 
 struct vk_queue {
@@ -159,9 +166,7 @@ struct vk_device {
     std::vector<vk_pipeline> pipelines;
 
     ~vk_device() {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "destroy device " << name << std::endl;
-#endif
+        VK_LOG_DEBUG("destroy device " << name);
         device.destroyCommandPool(compute_queue.pool);
         if (!single_queue) {
             device.destroyCommandPool(transfer_queue.pool);
@@ -196,9 +201,7 @@ struct vk_buffer_struct {
         if (size == 0) {
             return;
         }
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "~vk_buffer_struct(" << buffer << ", " << size << ")" << std::endl;
-#endif
+        VK_LOG_DEBUG("~vk_buffer_struct(" << buffer << ", " << size << ")");
 
         device->device.freeMemory(device_memory);
         device->device.destroyBuffer(buffer);
@@ -355,6 +358,49 @@ struct ggml_vk_garbage_collector {
     std::vector<vk_context> contexts;
 };
 
+#if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG)
+#include <mutex>
+
+#define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl
+
+static std::string format_size(size_t size) {
+    const size_t kib = 1024;
+    const size_t mib = kib * 1024;
+    const size_t gib = mib * 1024;
+
+    std::ostringstream oss;
+    oss << std::fixed << std::setprecision(2);
+
+    if (size >= gib) {
+        oss << static_cast<double>(size) / gib << " GiB";
+    } else if (size >= mib) {
+        oss << static_cast<double>(size) / mib << " MiB";
+    } else if (size >= kib) {
+        oss << static_cast<double>(size) / kib << " KiB";
+    } else {
+        oss << size << " B";
+    }
+
+    return oss.str();
+}
+
+static std::mutex log_mutex;
+
+class vk_memory_logger {
+public:
+    vk_memory_logger(): total_device(0), total_host(0) {}
+    void log_allocation(vk_buffer_ref buf_ref, size_t size);
+    void log_deallocation(vk_buffer_ref buf_ref);
+
+private:
+    std::map<vk::Buffer, size_t> allocations; // Track allocations
+    size_t total_device;
+    size_t total_host;
+};
+#else
+#define VK_LOG_MEMORY(msg) ((void) 0)
+#endif // GGML_VULKAN_MEMORY_DEBUG
+
 struct ggml_backend_vk_context {
     std::string name;
@@ -379,8 +425,45 @@ struct ggml_backend_vk_context {
     bool initialized;
 
     size_t idx;
+
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+    vk_memory_logger memory_logger;
+#endif
 };
 
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
+    std::lock_guard<std::mutex> guard(log_mutex);
+    vk_buffer buf = buf_ref.lock();
+    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+    const std::string type = device ? "device" : "host";
+    allocations[buf->buffer] = size;
+    total_device += device ? size : 0;
+    total_host += device ? 0 : size;
+    VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": +" << format_size(size) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+}
+
+void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
+    if (buf_ref.expired() || buf_ref.lock()->size == 0) {
+        return;
+    }
+
+    std::lock_guard<std::mutex> guard(log_mutex);
+    vk_buffer buf = buf_ref.lock();
+    const bool device = bool(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eDeviceLocal);
+    std::string type = device ? "device" : "host";
+    auto it = allocations.find(buf->buffer);
+    if (it != allocations.end()) {
+        total_device -= device ? it->second : 0;
+        total_host -= device ? 0 : it->second;
+        VK_LOG_MEMORY("VULKAN" << buf->ctx->idx << ": -" << format_size(it->second) << " " << type << " at " << buf->buffer << ". Total device: " << format_size(total_device) << ", total host: " << format_size(total_host));
+        allocations.erase(it);
+    } else {
+        VK_LOG_MEMORY("ERROR VULKAN" << buf->ctx->idx << ": Attempted to deallocate unknown " << type << " memory at " << buf->buffer);
+    }
+}
+#endif // GGML_VULKAN_MEMORY_DEBUG
+
 struct vk_instance_t {
     vk::Instance instance;
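The bookkeeping introduced above reduces to a mutex-guarded map from buffer handle to allocation size plus two running totals, one per heap kind. A minimal standalone sketch of the same technique, with the Vulkan buffer handle mocked as a plain pointer; every name below is illustrative and none of it is part of the patch:

    #include <cstddef>
    #include <cstdio>
    #include <map>
    #include <mutex>

    // Hypothetical stand-in for vk_memory_logger: track live allocations
    // and running totals under one lock, as the patch does.
    class mem_tracker {
    public:
        void on_alloc(const void * handle, size_t size, bool device) {
            std::lock_guard<std::mutex> guard(mtx);
            allocations[handle] = size;
            (device ? total_device : total_host) += size;
            std::printf("+%zu B (%s), device total %zu, host total %zu\n",
                        size, device ? "device" : "host", total_device, total_host);
        }
        void on_free(const void * handle, bool device) {
            std::lock_guard<std::mutex> guard(mtx);
            auto it = allocations.find(handle);
            if (it == allocations.end()) {
                std::printf("ERROR: freeing unknown allocation\n");
                return;
            }
            (device ? total_device : total_host) -= it->second;
            allocations.erase(it);
        }
    private:
        std::mutex mtx;
        std::map<const void *, size_t> allocations;
        size_t total_device = 0;
        size_t total_host   = 0;
    };

    int main() {
        mem_tracker t;
        int dummy = 0;
        t.on_alloc(&dummy, 1 << 20, /*device=*/true);
        t.on_free(&dummy, true);
    }

Looking up the recorded size before adjusting the totals is what makes the unknown-buffer case safe to report instead of silently corrupting the counters.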
@@ -393,15 +476,11 @@ struct vk_instance_t {
 };
 
 static std::shared_ptr<vk_device> ggml_vk_get_device(size_t idx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_device(" << idx << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
     static std::weak_ptr<vk_device> devices[GGML_VK_MAX_DEVICES];
 
     if (devices[idx].expired()) {
-#ifdef GGML_VULKAN_DEBUG
-        std::cerr << "Initializing new vk_device" << std::endl;
-#endif
+        VK_LOG_DEBUG("Initializing new vk_device");
         std::shared_ptr<vk_device> device = std::make_shared<vk_device>();
         device->initialized = false;
         devices[idx] = device;
@@ -428,9 +507,7 @@ static vk_instance_t vk_instance;
 GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
 
 static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
     GGML_ASSERT(parameter_count > 0);
     GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
@@ -531,9 +608,7 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
 }
 
 static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_destroy_pipeline(" << pipeline->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
     for (auto& pool : pipeline->descriptor_pools) {
         device.destroyDescriptorPool(pool);
     }
@@ -551,9 +626,7 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
 }
 
 static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
     if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
         // Enough descriptors are available
         return;
@@ -583,16 +656,12 @@ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx
 }
 
 static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_cleanup(" << pipeline->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
     pipeline->descriptor_set_idx = 0;
 }
 
 static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_cmd_buffer()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
     if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
         // Reuse command buffer
         return q.cmd_buffers[q.cmd_buffer_idx++];
@@ -612,9 +681,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx
 }
 
 static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_submission()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_submission()");
     vk_submission s;
     s.buffer = ggml_vk_create_cmd_buffer(ctx, q);
     s.wait_semaphores = std::move(wait_semaphores);
@@ -623,9 +690,7 @@ static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk
 }
 
 static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")");
     if (ctx->seqs.empty()) {
         return;
     }
@@ -699,9 +764,7 @@ static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) {
 }
 
 static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_find_queue_family_index()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_find_queue_family_index()");
     const uint32_t qfsize = queue_family_props.size();
 
     // Try with avoid preferences first
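ggml_vk_find_queue_family_index above makes two passes over the queue families: the first pass honors the avoid flags, and only if nothing qualifies does it retry without them. A standalone sketch of that selection policy over mocked families; the flag values and names here are invented for illustration:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Mocked queue family: a capability bitmask and a queue count,
    // standing in for vk::QueueFamilyProperties in this sketch.
    struct family { uint32_t flags; uint32_t count; };

    static int find_family(const std::vector<family>& fams, uint32_t required, uint32_t avoid, uint32_t min_queues) {
        // Pass 1: require the wanted bits and reject the avoided ones.
        for (size_t i = 0; i < fams.size(); i++) {
            if ((fams[i].flags & required) == required && !(fams[i].flags & avoid) && fams[i].count >= min_queues) {
                return (int) i;
            }
        }
        // Pass 2: fall back to any family with the required bits.
        for (size_t i = 0; i < fams.size(); i++) {
            if ((fams[i].flags & required) == required && fams[i].count >= min_queues) {
                return (int) i;
            }
        }
        return -1; // no usable family
    }

    int main() {
        const uint32_t GRAPHICS = 1, COMPUTE = 2, TRANSFER = 4;
        std::vector<family> fams = { {GRAPHICS | COMPUTE | TRANSFER, 1}, {COMPUTE | TRANSFER, 4} };
        // Prefer a transfer-capable family that is not the graphics family:
        std::printf("transfer family: %d\n", find_family(fams, TRANSFER, GRAPHICS, 1)); // prints 1
    }

Separating the two passes keeps dedicated transfer or compute queues preferred without failing outright on hardware that only exposes one combined family.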
@@ -747,9 +810,7 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
 }
 
 static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_context()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_context()");
     ctx->gc.contexts.emplace_back();
     vk_context * result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1];
     memset((void *) result, 0, sizeof(vk_context));
@@ -775,9 +834,7 @@ static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_que
 }
 
 static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
     vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
     vk::SemaphoreCreateInfo ci{};
     ci.setPNext(&tci);
@@ -787,9 +844,7 @@ static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context *
 }
 
 static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
     if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) {
         vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
         vk::SemaphoreCreateInfo ci{};
@@ -808,9 +863,7 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
 }
 
 static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_queue_cleanup()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
     // Requires command buffers to be done
 
     ctx->device->device.resetCommandPool(q.pool);
@@ -830,9 +883,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
 }
 
 static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")");
     vk_buffer buf = std::make_shared<vk_buffer_struct>();
 
     if (size == 0) {
@@ -892,8 +943,8 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
 
     buf->device = ctx->device;
 
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "Created buffer " << buf->buffer << std::endl;
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+    ctx->memory_logger.log_allocation(buf, size);
 #endif
 
     return buf;
@@ -928,6 +979,14 @@ static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, siz
 }
 
 static void ggml_vk_destroy_buffer(vk_buffer& buf) {
+    if (buf == nullptr) {
+        return;
+    }
+
+#ifdef GGML_VULKAN_MEMORY_DEBUG
+    buf->ctx->memory_logger.log_deallocation(buf);
+#endif
+
     buf.reset();
 }
 
@@ -936,9 +995,7 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
 }
 
 static void ggml_vk_sync_buffers(vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_sync_buffers()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_sync_buffers()");
     const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };
 
     ctx->s->buffer.pipelineBarrier(
@@ -952,9 +1009,7 @@ static void ggml_vk_sync_buffers(vk_context * ctx) {
 }
 
 static void ggml_vk_wait_events(vk_context * ctx, std::vector<vk::Event>&& events) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_wait_events()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_wait_events()");
     if (events.empty()) {
         return;
     }
@@ -989,9 +1044,7 @@ static bool ggml_vk_build_shader(ggml_type type) {
 }
 
 static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_load_shaders(" << ctx->name << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_load_shaders(" << ctx->name << ")");
     const std::shared_ptr<vk_device> device = ctx->device;
@@ -1042,12 +1095,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
 
     if (device->fp16) {
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(ctx, 
ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_len, matmul_f32_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); @@ -1140,12 +1193,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) { ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), 
m_wg_denoms, warptile_m, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_len, matmul_id_f32_f32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_len, matmul_id_f32_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1); @@ -1231,12 +1284,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) { ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_mmq_m, m_align); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_mmq_s, s_align); } else { - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 
sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_f32_fp32_len, matmul_f32_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_f32_aligned_fp32_len, matmul_f32_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); @@ -1329,12 +1382,12 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) { ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 
sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_f32_fp32_len, matmul_id_f32_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_f32_aligned_fp32_len, matmul_id_f32_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), s_wg_denoms, warptile_s, s_align); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), l_wg_denoms, warptile_l, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_id_push_constants), m_wg_denoms, warptile_m, 1); @@ -1429,11 +1482,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) { ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], 
"mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32_f32", mul_mat_vec_q2_K_f32_f32_len, mul_mat_vec_q2_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32_f32", mul_mat_vec_q3_K_f32_f32_len, mul_mat_vec_q3_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32_f32", mul_mat_vec_q4_K_f32_f32_len, mul_mat_vec_q4_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32_f32", mul_mat_vec_q5_K_f32_f32_len, mul_mat_vec_q5_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32_f32", mul_mat_vec_q6_K_f32_f32_len, mul_mat_vec_q6_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(ctx, 
ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); @@ -1442,11 +1495,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) { ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f16_f32", mul_mat_vec_q2_K_f16_f32_len, mul_mat_vec_q2_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f16_f32", mul_mat_vec_q3_K_f16_f32_len, mul_mat_vec_q3_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f16_f32", mul_mat_vec_q4_K_f16_f32_len, mul_mat_vec_q4_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f16_f32", mul_mat_vec_q5_K_f16_f32_len, mul_mat_vec_q5_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f16_f32", mul_mat_vec_q6_K_f16_f32_len, mul_mat_vec_q6_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, 
sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); @@ -1455,11 +1508,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) { ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_K_f32", mul_mat_vec_id_q2_K_f32_len, mul_mat_vec_id_q2_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_K_f32", mul_mat_vec_id_q3_K_f32_len, mul_mat_vec_id_q3_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_K_f32", mul_mat_vec_id_q4_K_f32_len, mul_mat_vec_id_q4_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_K_f32", mul_mat_vec_id_q5_K_f32_len, mul_mat_vec_id_q5_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_K_f32", mul_mat_vec_id_q6_K_f32_len, mul_mat_vec_id_q6_K_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], 
"mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); // dequant shaders ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); @@ -1468,11 +1521,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) { ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_k", dequant_q2_k_len, dequant_q2_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_k", dequant_q3_k_len, dequant_q3_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_k", 
dequant_q4_k_len, dequant_q4_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_k", dequant_q5_k_len, dequant_q5_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_k", dequant_q6_k_len, dequant_q6_k_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
 
     // get_rows
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -1538,9 +1591,7 @@
 static void ggml_vk_print_gpu_info(size_t idx) {
     GGML_ASSERT(idx < vk_instance.device_indices.size());
     size_t dev_num = vk_instance.device_indices[idx];
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_print_gpu_info(" << dev_num << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_print_gpu_info(" << dev_num << ")");
     GGML_ASSERT(vk_instance.initialized);
 
     std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -1617,9 +1668,7 @@
 void ggml_vk_instance_init() {
     if (vk_instance_initialized) {
         return;
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_instance_init()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_instance_init()");
 
     vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
@@ -1710,9 +1759,7 @@ void ggml_vk_instance_init() {
             } else {
                 // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
                 // This can cause error when splitting layers aross the devices, need to keep only 1
-#ifdef GGML_VULKAN_DEBUG
-                std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl;
-#endif
+                VK_LOG_DEBUG("Device " << i << " and device " << *old_device << " have the same device id");
 
                 vk::PhysicalDeviceProperties2 old_prop;
                 vk::PhysicalDeviceDriverProperties old_driver;
@@ -1760,16 +1807,11 @@ void ggml_vk_instance_init() {
                     vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
                     vk_instance.device_indices.push_back(i);
 
-#ifdef GGML_VULKAN_DEBUG
-                    std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl;
-#endif
+                    VK_LOG_DEBUG("Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName);
                 }
-#ifdef GGML_VULKAN_DEBUG
                 else {
-                    std::cerr << "Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl;
-
+                    VK_LOG_DEBUG("Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName);
                 }
-#endif
             }
         }
     }
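These hunks depend on VK_LOG_DEBUG collapsing to ((void) 0) when GGML_VULKAN_DEBUG is not defined, so the streamed expressions vanish from the build entirely. A minimal illustration of the pattern; the macro is restated here only for demonstration:

    #include <iostream>

    // With the flag off, the expansion is a no-op statement and the
    // argument expression is never compiled into the program at all.
    #ifdef GGML_VULKAN_DEBUG
    #define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
    #else
    #define VK_LOG_DEBUG(msg) ((void) 0)
    #endif

    int main() {
        int dev_num = 0;
        VK_LOG_DEBUG("ggml_vk_print_gpu_info(" << dev_num << ")");
        return 0;
    }

Because the argument is swallowed by the preprocessor, callers can stream arbitrarily expensive expressions without paying for them in release builds.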
@@ -1792,9 +1834,7 @@ void ggml_vk_instance_init() {
 static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     GGML_ASSERT(idx < vk_instance.device_indices.size());
     size_t dev_num = vk_instance.device_indices[idx];
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_init(" << ctx->name << ", " << dev_num << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << dev_num << ")");
     ggml_vk_instance_init();
 
     std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
@@ -1967,9 +2007,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
 }
 
 static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_to_fp16()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_to_fp16()");
     switch (type) {
         case GGML_TYPE_F32:
         case GGML_TYPE_Q4_0:
@@ -1991,9 +2029,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
 }
 
 static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_mul_mat_mat_pipeline()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline()");
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_f32;
     }
@@ -2029,9 +2065,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
 }
 
 static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
     GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16);
 
     switch (a_type) {
@@ -2056,9 +2090,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
 }
 
 static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_mul_mat_mat_id_pipeline()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()");
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_id_f32;
     }
@@ -2091,9 +2123,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
 }
 
 static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_get_dequantize_mul_mat_vec()");
     GGML_ASSERT(b_type == GGML_TYPE_F32);
 
     switch (a_type) {
@@ -2118,9 +2148,9 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context
 }
 
 static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_pool_malloc(" << size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_pool_malloc(" << size << ")");
+    VK_LOG_MEMORY("ggml_vk_pool_malloc");
+
     int best_i = -1;
     size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
     int worst_i = -1;
@@ -2148,13 +2178,11 @@ static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size)
         ggml_vk_destroy_buffer(b);
     }
 
-    return ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
+    return ggml_vk_create_buffer_device(ctx, size);
 }
 
 static void ggml_vk_pool_free(ggml_backend_vk_context * ctx, vk_buffer& buffer) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_pool_free(" << buffer->size << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_pool_free(" << buffer->size << ")");
     for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
         vk_buffer& b = ctx->buffer_pool[i];
         if (b == nullptr) {
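ggml_vk_pool_malloc above applies a best-fit policy: reuse the smallest pooled buffer that is large enough, and when nothing fits, destroy the largest pooled buffer before allocating a fresh device buffer. A toy model of that policy with plain sizes in fixed slots; POOL_SLOTS and the return convention are stand-ins, not the real API:

    #include <cstdio>
    #include <limits>

    constexpr int POOL_SLOTS = 16; // stand-in for MAX_VK_BUFFERS

    // 0 marks an empty slot; otherwise the slot holds a free buffer of that size.
    static size_t pool[POOL_SLOTS] = {};

    // Returns the size of the buffer handed out, mirroring the
    // best-fit / worst-evict selection in ggml_vk_pool_malloc.
    size_t pool_malloc(size_t size) {
        int best_i = -1, worst_i = -1;
        size_t best_size = std::numeric_limits<size_t>::max();
        size_t worst_size = 0;
        for (int i = 0; i < POOL_SLOTS; i++) {
            if (pool[i] >= size && pool[i] < best_size) { best_i = i; best_size = pool[i]; }
            if (pool[i] > worst_size) { worst_i = i; worst_size = pool[i]; }
        }
        if (best_i != -1) { // smallest free buffer that fits
            size_t s = pool[best_i];
            pool[best_i] = 0;
            return s;
        }
        if (worst_i != -1) { // nothing fits: evict the largest entry
            pool[worst_i] = 0;
        }
        return size; // fresh device allocation of the requested size
    }

    int main() {
        pool[0] = 1024; pool[1] = 4096;
        std::printf("%zu\n", pool_malloc(2000)); // reuses the 4096-byte slot
    }

The eviction step keeps the pool from pinning a large stale buffer indefinitely once requests get smaller.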
@@ -2175,6 +2203,8 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
         }
     }
 
+    VK_LOG_MEMORY("ggml_vk_create_buffer_temp(" << size << ")");
+
     // Otherwise create new buffer
     vk_buffer buf = ggml_vk_pool_malloc(ctx, size);
     ctx->gc.temp_buffers.push_back(buf);
@@ -2183,9 +2213,7 @@ static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_
 }
 
 static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
     vk_buffer buf = ggml_vk_create_buffer(ctx, size,
         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
         vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
@@ -2207,9 +2235,7 @@ static void ggml_vk_host_free(ggml_backend_vk_context * ctx, void* ptr) {
     if (ptr == nullptr) {
         return;
     }
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_host_free(" << ptr << ")" << std::endl;
-#endif
+    VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")");
     vk_buffer buf;
     size_t index;
     for (size_t i = 0; i < ctx->pinned_memory.size(); i++) {
@@ -2261,13 +2287,11 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context
     const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
     const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
     const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
+    VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
     for (auto& buffer : buffers) {
         std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
     }
-    std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl;
-#endif
+    std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
     std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
     std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
     GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
@@ -2300,9 +2324,7 @@ static void ggml_vk_end_submission(vk_submission& s, std::vector<vk_semaphore> w
 }
 
 static void ggml_vk_ctx_end(vk_context * ctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
     if (ctx->s == nullptr) {
         return;
     }
@@ -2312,9 +2334,7 @@ static void ggml_vk_ctx_end(vk_context * ctx) {
 }
 
 static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_ctx_begin(" << ctx << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_ctx_begin(" << ctx << ")");
     if (subctx->s != nullptr) {
         ggml_vk_ctx_end(subctx);
     }
@@ -2324,9 +2344,7 @@ static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx
 }
 
 static size_t ggml_vk_align_size(size_t width, size_t align) {
-#ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_align_size(" << width << ", " << align << ")" << std::endl;
-#endif
+    VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")");
     return CEIL_DIV(width, align) * align;
 }
 
@@ -2340,6 +2358,7 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
 
 static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
     if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
+        VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
         ggml_vk_destroy_buffer(ctx->sync_staging);
         ctx->sync_staging = 
ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, @@ -2348,9 +2367,7 @@ static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, si } static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")"); GGML_ASSERT(!ggml_is_contiguous(tensor)); // Buffer is already mapped if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { @@ -2455,9 +2472,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont } static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")"); // Make sure ctx owns the buffer GGML_ASSERT(dst->ctx == ctx); @@ -2492,9 +2507,7 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_cont subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices); return; } -#ifdef GGML_VULKAN_DEBUG - std::cerr << "STAGING" << std::endl; -#endif + VK_LOG_DEBUG("STAGING"); // Staging buffer required vk_buffer staging = ctx->staging; @@ -2529,16 +2542,12 @@ static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_cont } static void ggml_vk_buffer_write_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")"); return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, size, size, 1, sync_staging); } static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_buffer_write_2d(" << width << ", " << height << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")"); // Buffer is already mapped if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); @@ -2563,16 +2572,12 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds } static void ggml_vk_buffer_write(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t size) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_buffer_write(" << size << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")"); ggml_vk_buffer_write_2d(ctx, dst, offset, src, 0, size, 1); } static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << 
"ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")"); GGML_ASSERT(width > 0); GGML_ASSERT(height > 0); GGML_ASSERT(src != nullptr); @@ -2606,9 +2611,7 @@ static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_conte return; } -#ifdef GGML_VULKAN_DEBUG - std::cerr << "STAGING" << std::endl; -#endif + VK_LOG_DEBUG("STAGING"); // Fall back to staging buffer vk_buffer staging = ctx->staging; @@ -2635,9 +2638,7 @@ static void ggml_vk_buffer_read_async(ggml_backend_vk_context * ctx, vk_context } static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, size_t offset, void * dst, size_t size) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_buffer_read(" << offset << ", " << size << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_buffer_read(" << offset << ", " << size << ")"); if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); @@ -2659,9 +2660,7 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s } static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_buffer_copy_async(" << size << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")"); // Make sure both buffers are on same ctx GGML_ASSERT(src->ctx == dst->ctx); @@ -2672,9 +2671,7 @@ static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t d static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { if (src->ctx == dst->ctx) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")"); // Copy within the device ggml_backend_vk_context * ctx = src->ctx; @@ -2686,9 +2683,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences"); ctx->device->device.resetFences({ ctx->fence }); } else { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")"); // Copy device to device ggml_backend_vk_context * src_ctx = src->ctx; ggml_backend_vk_context * dst_ctx = dst->ctx; @@ -2706,9 +2701,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr } static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")"); // Make sure ctx owns the buffer GGML_ASSERT(dst->ctx == ctx); @@ -2723,9 +2716,7 @@ static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst, } static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) { 
-#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")"); const uint64_t ne0 = src->ne[0]; const uint64_t ne1 = src->ne[1]; const uint64_t nb0 = src->nb[0]; @@ -2753,9 +2744,7 @@ static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su } static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, const ggml_tensor * dst) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_d2h_tensor_2d()" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_d2h_tensor_2d()"); const uint64_t ne0 = dst->ne[0]; const uint64_t ne1 = dst->ne[1]; const uint64_t ne2 = dst->ne[2]; @@ -2779,9 +2768,7 @@ static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * su } static uint32_t ggml_vk_guess_split_k(int m, int n, int k) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")"); // if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) { // return 4; // } @@ -2813,9 +2800,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * } static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")"); switch (ctx->device->vendor_id) { case VK_VENDOR_ID_AMD: return ggml_vk_guess_matmul_pipeline_amd(ctx, mmp, m, n, aligned); @@ -2837,9 +2822,7 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, } static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")"); return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align; } @@ -2849,9 +2832,7 @@ static void ggml_vk_matmul( uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? 
split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")"); ggml_vk_sync_buffers(subctx); if (split_k == 1) { const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3 }; @@ -2875,12 +2856,10 @@ static void ggml_vk_matmul_id( uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " << + VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " << "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " << "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " << - "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")" << std::endl; -#endif + "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")"); ggml_vk_sync_buffers(subctx); const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, nei0, nei1, nbi1, ne11 }; @@ -2910,10 +2889,8 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_ } static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, 
const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; - std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; + std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")"); const int tensor_type_size = ggml_type_size(tensor->type); const uint32_t ne = ggml_nelements(tensor); @@ -2930,11 +2907,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context } static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; + VK_LOG_DEBUG("ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; -#endif + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT @@ -3105,11 +3080,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su } static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << 
src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; + VK_LOG_DEBUG("ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; -#endif + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT @@ -3260,11 +3233,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context } static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; + VK_LOG_DEBUG("ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; -#endif + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" 
<< dst->nb[3] << "),)"); GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT @@ -3333,11 +3304,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c } static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; + VK_LOG_DEBUG("ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; -#endif + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); @@ -3410,9 +3379,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con } static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")"); if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && dst->ne[1] == 1) { ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst); } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && dst->ne[1] == 1) { @@ -3425,12 +3392,10 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, } static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] 
<< ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; + VK_LOG_DEBUG("ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; -#endif + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT GGML_ASSERT(ids->type == GGML_TYPE_I32); @@ -3616,12 +3581,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * } static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; + VK_LOG_DEBUG("ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", 
nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; -#endif + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)"); GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT GGML_ASSERT(ids->type == GGML_TYPE_I32); @@ -3784,9 +3747,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte } static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_mul_mat_id(" << src0 << ", " << src1 << ", " << src2 << ", " << dst << ")"); if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) { ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst); } else { @@ -4020,16 +3981,14 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { template static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; + VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; if (src1 != nullptr) { std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; } if (src2 != nullptr) { std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3]; } - std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl; -#endif + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << 
dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")"); GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT GGML_ASSERT(dst->extra != nullptr); @@ -4527,9 +4486,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0 template static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")"); const size_t x_ne = m * k * batch; const size_t y_ne = k * n * batch; const size_t d_ne = m * n * batch; @@ -4943,9 +4900,7 @@ static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_ } static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_test_transfer(" << ne << ")"); // Check transfers are correct vk_buffer buffer = ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal); @@ -5029,9 +4984,7 @@ static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml } static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_test_dequant(" << ne << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_test_dequant(" << ne << ")"); const size_t x_sz = sizeof(float) * ne; const size_t x_sz_f16 = sizeof(ggml_fp16_t) * ne; const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant); @@ -5108,9 +5061,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ } static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, size_t split_k, size_t shader_size, ggml_type quant) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_test_dequant_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << ggml_type_name(quant) << ")"); const size_t x_ne = m * k * batch; const size_t y_ne = k * n * batch; const size_t d_ne = m * n * batch; @@ -5294,9 +5245,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, #endif static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))"); ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu; extra->reset(); tensor->extra = extra; @@ -5304,9 +5253,7 @@ static ggml_tensor_extra_gpu * 
ggml_vk_tensor_create_extra(ggml_tensor * tensor) } static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){ -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_preallocate_buffers_graph(" << node << ")"); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra; if (extra == nullptr) { @@ -5341,7 +5288,7 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false; - const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig); + const bool qx_needs_dequant = use_src0 && (!mmp || x_non_contig); const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig); int split_k; @@ -5419,9 +5366,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm } static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl; -#endif #if defined(GGML_VULKAN_RUN_TESTS) ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached, @@ -5560,6 +5504,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { #endif if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) { + VK_LOG_MEMORY("ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << ")"); // Resize buffer if (ctx->prealloc_x != nullptr) { ggml_vk_destroy_buffer(ctx->prealloc_x); @@ -5567,6 +5512,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { ctx->prealloc_x = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_x); } if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) { + VK_LOG_MEMORY("ggml_vk_preallocate_buffers(y_size: " << ctx->prealloc_size_y << ")"); // Resize buffer if (ctx->prealloc_y != nullptr) { ggml_vk_destroy_buffer(ctx->prealloc_y); @@ -5574,6 +5520,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { ctx->prealloc_y = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_y); } if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) { + VK_LOG_MEMORY("ggml_vk_preallocate_buffers(split_k_size: " << ctx->prealloc_size_split_k << ")"); // Resize buffer if (ctx->prealloc_split_k != nullptr) { ggml_vk_destroy_buffer(ctx->prealloc_split_k); @@ -5581,6 +5528,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_split_k); } if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) { + VK_LOG_MEMORY("ggml_vk_preallocate_buffers(staging_size: " << ctx->staging_size << ")"); // Resize buffer if (ctx->staging != nullptr) { ggml_vk_destroy_buffer(ctx->staging); @@ -5598,9 +5546,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod return; } -#ifdef 
GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")"); ctx->semaphore_idx = 0; ctx->staging_offset = 0; @@ -5823,9 +5769,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_ return true; } -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")"); #ifdef GGML_VULKAN_CHECK_RESULTS ggml_vk_check_results_0(ctx, params, tensor); @@ -5860,9 +5804,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_ // Clean up after graph processing is done static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_graph_cleanup()" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_graph_cleanup()"); for (auto& buffer : ctx->gc.temp_buffers) { ggml_vk_pool_free(ctx, buffer); } @@ -5906,9 +5848,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { // Clean up on backend free static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_cleanup(" << ctx->idx << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->idx << ")"); ggml_vk_graph_cleanup(ctx); ggml_vk_destroy_buffer(ctx->prealloc_x); @@ -6003,9 +5943,7 @@ GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { } GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_buffer_free_buffer()" << std::endl; -#endif + VK_LOG_MEMORY("ggml_backend_vk_buffer_free_buffer()"); ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_destroy_buffer(ctx->dev_buffer); delete ctx; @@ -6018,9 +5956,7 @@ GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t bu } GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")"); ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; if (tensor->view_src != nullptr) { @@ -6036,9 +5972,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b } GGML_CALL static void 
ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; @@ -6049,9 +5983,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu } GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; @@ -6109,9 +6041,7 @@ GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buff } GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl; -#endif + VK_LOG_MEMORY("ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")"); ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; vk_buffer dev_buffer = nullptr; @@ -6154,9 +6084,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { ggml_vk_instance_init(); -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_backend_vk_buffer_type(" << dev_num << ")"); GGML_ASSERT(dev_num < vk_instance.device_indices.size()); @@ -6180,16 +6108,12 @@ GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buff } GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_host_buffer_free_buffer()" << std::endl; -#endif + VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()"); ggml_vk_host_free(&vk_instance.contexts[0], buffer->context); } GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl; -#endif + VK_LOG_MEMORY("ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")"); size += 32; // Behave like the CPU buffer type void * ptr = nullptr; try { @@ -6246,9 +6170,7 @@ GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) { GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) { ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_free(" << ctx->name << 
")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_backend_vk_free(" << ctx->name << ")"); size_t idx = ctx->idx; @@ -6272,9 +6194,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_t } GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); @@ -6292,9 +6212,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g } GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); @@ -6312,9 +6230,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c } GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl; -#endif + VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; if ((dst->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) { ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; @@ -6337,9 +6253,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c } GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_synchronize()" << std::endl; -#endif + VK_LOG_DEBUG("ggml_backend_vk_synchronize()"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; if(ctx->transfer_ctx == nullptr) { return; @@ -6367,9 +6281,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) { } GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl; -#endif + VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { @@ -6582,9 +6494,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) { if (vk_instance.initialized[dev_num]) { return vk_instance.backends[dev_num]; } -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")"); ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num]; ggml_vk_init(ctx, 
dev_num); @@ -6800,9 +6710,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ return; } -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")"); ggml_tensor * src0 = tensor->src[0]; ggml_tensor * src1 = tensor->src[1]; @@ -7108,9 +7016,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ return; } -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl; -#endif + VK_LOG_DEBUG("ggml_vk_check_results_1(" << tensor->name << ")"); ggml_tensor * src0 = tensor->src[0]; ggml_tensor * src1 = tensor->src[1]; From c63274d8313af7d3b1eaae441c6d0bdc20ae0398 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 16 Jun 2024 13:57:37 +0300 Subject: [PATCH 3/8] move BLAS to a separate backend (cont) (llama/6210) ggml-ci --- examples/common.h | 2 +- ggml-blas.cpp | 363 ++++++++++++++++++++++++++++++++++++++++++++++ ggml-blas.h | 23 +++ src/ggml-blas.cpp | 363 ++++++++++++++++++++++++++++++++++++++++++++++ src/ggml-blas.h | 23 +++ 5 files changed, 773 insertions(+), 1 deletion(-) create mode 100644 ggml-blas.cpp create mode 100644 ggml-blas.h create mode 100644 src/ggml-blas.cpp create mode 100644 src/ggml-blas.h diff --git a/examples/common.h b/examples/common.h index de895858ab0..89ab37457e2 100644 --- a/examples/common.h +++ b/examples/common.h @@ -21,7 +21,7 @@ struct gpt_params { int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); int32_t n_predict = 200; // new tokens to predict int32_t n_parallel = 1; // number of parallel streams - int32_t n_batch = 8; // batch size for prompt processing + int32_t n_batch = 32; // batch size for prompt processing int32_t n_ctx = 2048; // context size (this is the KV cache max size) int32_t n_gpu_layers = 0; // number of layers to offload to the GPU diff --git a/ggml-blas.cpp b/ggml-blas.cpp new file mode 100644 index 00000000000..d709a357bbf --- /dev/null +++ b/ggml-blas.cpp @@ -0,0 +1,363 @@ +#include "ggml-blas.h" +#include "ggml-backend-impl.h" + +#include <future> +#include <vector> + +#if defined(GGML_USE_ACCELERATE) +# include <Accelerate/Accelerate.h> +#elif defined(GGML_BLAS_USE_MKL) +# include <mkl.h> +#else +# include <cblas.h> +# ifdef BLIS_ENABLE_CBLAS +# include <blis.h> +# endif +#endif + +struct ggml_backend_blas_context { + int n_threads = GGML_DEFAULT_N_THREADS; + std::unique_ptr<char[]> work_data; + size_t work_size = 0; +#ifndef GGML_USE_OPENMP + std::vector<std::future<void>> tasks; +#endif +}; + +// helper function to determine if it is better to use BLAS or not +// for large matrices, BLAS is faster +static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if (ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + src1->type == GGML_TYPE_F32 && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { + + /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ + return true; + } + + return false; +} + +static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const enum ggml_type type = src0->type; + + GGML_ASSERT(ne0 == ne01); +
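// Reviewer note: GGML_TENSOR_BINARY_OP_LOCALS above expands to local copies of
// the operand shapes and strides -- ne0x/nb0x come from src0, ne1x/nb1x from
// src1, and bare ne/nb from dst. Read this way, these shape checks say that
// each 2D plane of dst is ne01 x ne11 and that the batch dimensions of dst
// match those of src1.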
GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + const int64_t ne_plane = ne01*ne00; + const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float); + + if (ctx->work_size < desired_wsize) { + ctx->work_data.reset(new char[desired_wsize]); + ctx->work_size = desired_wsize; + } + void * wdata = ctx->work_data.get(); + + // convert src0 to float + if (type != GGML_TYPE_F32) { + ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type); + ggml_to_float_t const to_float = type_traits.to_float; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; + + const int min_cols_per_thread = 4096; + const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); + const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1); + +#ifdef GGML_USE_OPENMP + #pragma omp parallel for num_threads(n_threads) + for (int64_t i01 = 0; i01 < ne01; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } +#else + for (int i = 1; i < n_threads; i++) { + const int64_t start = i*ne01/n_threads; + const int64_t end = (i + 1)*ne01/n_threads; + if (start < end) { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + })); + } + } + { + // reuse the current thread for the first task + const int64_t start = 0; + const int64_t end = ne01/n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + } +#endif + } + } + +#ifndef GGML_USE_OPENMP + // wait for all tasks to finish + for (auto & task : ctx->tasks) { + task.get(); + } + ctx->tasks.clear(); +#endif + } + +#if defined(OPENBLAS_VERSION) + openblas_set_num_threads(ctx->n_threads); +#endif + +#if defined(BLIS_ENABLE_CBLAS) + bli_thread_set_num_threads(ctx->n_threads); +#endif + + for (int64_t i13 = 0; i13 < ne13; i13++) { + for (int64_t i12 = 0; i12 < ne12; i12++) { + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); + const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + + if (type != GGML_TYPE_F32) { + x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; + } + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne1, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } +} + +static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne3 == ne13); + GGML_ASSERT(ne03 == ne13); + + // we don't 
support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) + // src0: (k,n) + // src1: (k,m) + // dst: (m,n) + // + // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) + // Also expressed as (major,minor) + // a: (m,k): so src1 transposed + // b: (k,n): so src0 + // c: (m,n) + // + // However, if ggml_is_transposed(src1) is true, then + // src1->data already contains a transposed version, so sgemm mustn't + // transpose it further. + + int n = src0->ne[0]; + int k = src0->ne[1]; + int m = src1->ne[0]; + + CBLAS_TRANSPOSE transposeA; + int lda; + + if (!ggml_is_transposed(src1)) { + transposeA = CblasTrans; + lda = m; + } else { + transposeA = CblasNoTrans; + lda = k; + } + + float * a = (float *) ((char *) src1->data); + float * b = (float *) ((char *) src0->data); + float * c = (float *) ((char *) dst->data); + + cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); + + GGML_UNUSED(ctx); +}
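// Reviewer note: a concrete reading of the (major,minor) mapping above, with
// hypothetical sizes that are not taken from this patch. For src0 of shape
// (k=4, n=3) and src1 of shape (k=4, m=2), with src1 not transposed, out_prod
// yields dst of shape (m=2, n=3) and the sgemm call above resolves to
//
//     cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
//                 /*m=*/2, /*n=*/3, /*k=*/4,
//                 1.0, a /*src1*/, /*lda=*/2,
//                      b /*src0*/, /*ldb=*/3,
//                 0.0, c /*dst*/,  /*ldc=*/3);
//
// i.e. dst = src1^T * src0 in row-major terms, which is exactly the a/b/c
// assignment in the function body.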
+ +// backend interface + +GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { + return "BLAS"; + + GGML_UNUSED(backend); +} + +GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + delete ctx; + delete backend; +} + +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_cpu_buffer_type(); + + GGML_UNUSED(backend); +} + +GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + switch (node->op) { + case GGML_OP_MUL_MAT: + ggml_backend_blas_mul_mat(ctx, node); + break; + + case GGML_OP_OUT_PROD: + ggml_backend_blas_out_prod(ctx, node); + break; + + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + + default: + fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node)); + GGML_ASSERT(false); + } + } + + return GGML_STATUS_SUCCESS; + + GGML_UNUSED(backend); +} + +GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + + return (op->op == GGML_OP_MUL_MAT && ggml_backend_blas_use_blas(op)) || + (op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + ggml_is_matrix(src0) && + ggml_is_matrix(src1) && + ggml_is_contiguous(src0) && + (ggml_is_contiguous(src1) || ggml_is_transposed(src1))); + + GGML_UNUSED(backend); +} + +GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + + GGML_UNUSED(backend); +} + +static struct ggml_backend_i blas_backend_i = { + /* .get_name = */ ggml_backend_blas_name, + /* .free = */ ggml_backend_blas_free, + /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_blas_graph_compute, + /* .supports_op = */ ggml_backend_blas_supports_op, + /* .supports_buft = */ ggml_backend_blas_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* .event_synchronize = */ NULL, +}; + +static ggml_guid_t ggml_backend_blas_guid(void) { + static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d }; + return &guid; +} + +ggml_backend_t ggml_backend_blas_init(void) { + ggml_backend_blas_context * ctx = new ggml_backend_blas_context; + + ggml_backend_t backend = new ggml_backend { + /* .guid = */ ggml_backend_blas_guid(), + /* .interface = */ blas_backend_i, + /* .context = */ ctx, + }; + +#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) + if (openblas_get_parallel() != OPENBLAS_OPENMP) { + fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__); + } +#endif + +#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP) + fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); +#endif + + return backend; +} + +GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) { + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid()); +} + +void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) { + GGML_ASSERT(ggml_backend_is_blas(backend_blas)); + + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context; + ctx->n_threads = n_threads; +} diff --git a/ggml-blas.h b/ggml-blas.h new file mode 100644 index 00000000000..f2e37de06f6 --- /dev/null +++ b/ggml-blas.h @@ -0,0 +1,23 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +// backend API +GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void); + +GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend); + +// number of threads used for conversion to float +// for openblas and blis, this will also set the number of threads used for blas operations +GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); + + +#ifdef __cplusplus +} +#endif diff --git a/src/ggml-blas.cpp b/src/ggml-blas.cpp new file mode 100644 index 00000000000..d709a357bbf --- /dev/null +++ b/src/ggml-blas.cpp @@ -0,0 +1,363 @@ +#include "ggml-blas.h" +#include "ggml-backend-impl.h" + +#include <future> +#include <vector> + +#if defined(GGML_USE_ACCELERATE) +# include <Accelerate/Accelerate.h> +#elif defined(GGML_BLAS_USE_MKL) +# include <mkl.h> +#else +# include <cblas.h> +# ifdef BLIS_ENABLE_CBLAS +# include <blis.h> +# endif +#endif + +struct ggml_backend_blas_context { + int n_threads = GGML_DEFAULT_N_THREADS; + std::unique_ptr<char[]> work_data; + size_t work_size = 0; +#ifndef GGML_USE_OPENMP + std::vector<std::future<void>> tasks; +#endif +}; + +// helper function to determine if it is better to use BLAS or not +// for large matrices, BLAS is faster +static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = 
dst->src[1]; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if (ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + src1->type == GGML_TYPE_F32 && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { + + /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ + return true; + } + + return false; +} + +static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const enum ggml_type type = src0->type; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + const int64_t ne_plane = ne01*ne00; + const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float); + + if (ctx->work_size < desired_wsize) { + ctx->work_data.reset(new char[desired_wsize]); + ctx->work_size = desired_wsize; + } + void * wdata = ctx->work_data.get(); + + // convert src0 to float + if (type != GGML_TYPE_F32) { + ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type); + ggml_to_float_t const to_float = type_traits.to_float; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; + + const int min_cols_per_thread = 4096; + const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); + const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1); + +#ifdef GGML_USE_OPENMP + #pragma omp parallel for num_threads(n_threads) + for (int64_t i01 = 0; i01 < ne01; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } +#else + for (int i = 1; i < n_threads; i++) { + const int64_t start = i*ne01/n_threads; + const int64_t end = (i + 1)*ne01/n_threads; + if (start < end) { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + })); + } + } + { + // reuse the current thread for the first task + const int64_t start = 0; + const int64_t end = ne01/n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + } +#endif + } + } + +#ifndef GGML_USE_OPENMP + // wait for all tasks to finish + for (auto & task : ctx->tasks) { + task.get(); + } + ctx->tasks.clear(); +#endif + } + +#if defined(OPENBLAS_VERSION) + openblas_set_num_threads(ctx->n_threads); +#endif + +#if defined(BLIS_ENABLE_CBLAS) + bli_thread_set_num_threads(ctx->n_threads); +#endif + + for (int64_t i13 = 0; i13 < ne13; i13++) { + for (int64_t i12 = 0; i12 < ne12; i12++) { + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); + const float * y = (float *) ((char *) src1->data + i12*nb12 + 
i13*nb13); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + + if (type != GGML_TYPE_F32) { + x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; + } + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne1, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } +} + +static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne3 == ne13); + GGML_ASSERT(ne03 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) + // src0: (k,n) + // src1: (k,m) + // dst: (m,n) + // + // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) + // Also expressed as (major,minor) + // a: (m,k): so src1 transposed + // b: (k,n): so src0 + // c: (m,n) + // + // However, if ggml_is_transposed(src1) is true, then + // src1->data already contains a transposed version, so sgemm mustn't + // transpose it further. + + int n = src0->ne[0]; + int k = src0->ne[1]; + int m = src1->ne[0]; + + CBLAS_TRANSPOSE transposeA; + int lda; + + if (!ggml_is_transposed(src1)) { + transposeA = CblasTrans; + lda = m; + } else { + transposeA = CblasNoTrans; + lda = k; + } + + float * a = (float *) ((char *) src1->data); + float * b = (float *) ((char *) src0->data); + float * c = (float *) ((char *) dst->data); + + cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); + + GGML_UNUSED(ctx); +} + +// backend interface + +GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { + return "BLAS"; + + GGML_UNUSED(backend); +} + +GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + delete ctx; + delete backend; +} + +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_cpu_buffer_type(); + + GGML_UNUSED(backend); +} + +GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + switch (node->op) { + case GGML_OP_MUL_MAT: + ggml_backend_blas_mul_mat(ctx, node); + break; + + case GGML_OP_OUT_PROD: + ggml_backend_blas_out_prod(ctx, node); + break; + + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + + default: + fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node)); + GGML_ASSERT(false); + } + } + + return GGML_STATUS_SUCCESS; + + GGML_UNUSED(backend); +} + +GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + + return (op->op == GGML_OP_MUL_MAT && 
ggml_backend_blas_use_blas(op)) || + (op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + ggml_is_matrix(src0) && + ggml_is_matrix(src1) && + ggml_is_contiguous(src0) && + (ggml_is_contiguous(src1) || ggml_is_transposed(src1))); + + GGML_UNUSED(backend); +} + +GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + + GGML_UNUSED(backend); +} + +static struct ggml_backend_i blas_backend_i = { + /* .get_name = */ ggml_backend_blas_name, + /* .free = */ ggml_backend_blas_free, + /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_blas_graph_compute, + /* .supports_op = */ ggml_backend_blas_supports_op, + /* .supports_buft = */ ggml_backend_blas_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* .event_synchronize = */ NULL, +}; + +static ggml_guid_t ggml_backend_blas_guid(void) { + static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d }; + return &guid; +} + +ggml_backend_t ggml_backend_blas_init(void) { + ggml_backend_blas_context * ctx = new ggml_backend_blas_context; + + ggml_backend_t backend = new ggml_backend { + /* .guid = */ ggml_backend_blas_guid(), + /* .interface = */ blas_backend_i, + /* .context = */ ctx, + }; + +#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) + if (openblas_get_parallel() != OPENBLAS_OPENMP) { + fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__); + } +#endif + +#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP) + fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); +#endif + + return backend; +} + +GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) { + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid()); +} + +void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) { + GGML_ASSERT(ggml_backend_is_blas(backend_blas)); + + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context; + ctx->n_threads = n_threads; +} diff --git a/src/ggml-blas.h b/src/ggml-blas.h new file mode 100644 index 00000000000..f2e37de06f6 --- /dev/null +++ b/src/ggml-blas.h @@ -0,0 +1,23 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +// backend API +GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void); + +GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend); + +// number of threads used for conversion to float +// for openblas and blis, this will also set the number of threads used for blas operations +GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); + + +#ifdef __cplusplus +} +#endif From b1cf99e0cbd73b5abb7acb2f11c0908a254852dd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 16 Jun 
2024 18:40:07 +0300 Subject: [PATCH 4/8] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 91e13e0a728..37fc64a561b 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -1ba593bdbef06041b8c94e327161e0b9ad4a348a +169738dc6658c28dccad876e9db5ff2180940186 From ac3a77393a21818eee9ca78bbf2e1855f9eea491 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 16 Jun 2024 19:10:20 +0300 Subject: [PATCH 5/8] build : update make / cmake --- .gitignore | 1 + CMakeLists.txt | 174 ++++++++++++++++++++++++------------------------- Makefile | 65 ++++++++++-------- 3 files changed, 126 insertions(+), 114 deletions(-) diff --git a/.gitignore b/.gitignore index 295cb74e625..e3319ad0329 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ /CMakeSettings.json build/ +build-blas/ build-coreml/ build-em/ build-debug/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c3429711d1..4055c2bbd87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -80,11 +80,10 @@ if (APPLE) option(WHISPER_COREML "whisper: enable Core ML framework" OFF) option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF) option(WHISPER_METAL_EMBED_LIBRARY "whisper: embed Metal library" OFF) + option(WHISPER_BLAS "whisper: use BLAS" ON) + set (WHISPER_BLAS_VENDOR "Apple" CACHE STRING + "whisper: BLAS library vendor") else() - option(WHISPER_BLAS "whisper: use BLAS libraries" OFF) - option(WHISPER_BLAS_VENDOR "whisper: BLAS library vendor" Generic) - option(WHISPER_OPENBLAS "whisper: prefer OpenBLAS" OFF) - option(WHISPER_OPENBLAS_INTERFACE64 "whisper: use OpenBLAS w/ 64-bit interface" OFF) option(WHISPER_CUDA "whisper: support for CUDA" OFF) option(WHISPER_CUDA_FA_ALL_QUANTS "whisper: compile all quants for FlashAttention" OFF) option(WHISPER_CUBLAS "whisper: support for CUDA (deprecated)" OFF) @@ -93,6 +92,9 @@ else() option(WHISPER_MKL "whisper: use Intel Math Kernel Library (MKL)" OFF) option(WHISPER_SYCL "whisper: use SYCL" OFF) option(WHISPER_SYCL_F16 "whisper: use 16 bit floats for sycl calculations" OFF) + option(WHISPER_BLAS "whisper: use BLAS" OFF) + set (WHISPER_BLAS_VENDOR "Generic" CACHE STRING + "whisper: BLAS library vendor") endif() option(WHISPER_PERF "whisper: enable perf timings" OFF) @@ -246,93 +248,90 @@ if (APPLE) endif() endif() -if (WHISPER_OPENBLAS) - set(WHISPER_BLAS_VENDOR "OpenBLAS") - set(WHISPER_BLAS ON) - # BLA_PKGCONFIG_BLAS is supported since CMake 3.25. - # FindBLAS.cmake pkg-config logic seems incomplete, because when - # BLA_SIZEOF_INTEGER is 8, then it should search for blas64 instead of blas. - # blas.pc/blas64.pc are not always provided, so let's be more specific - # and go with openblas.pc/openblas64.pc if WHISPER_OPENBLAS is on. - if (WHISPER_OPENBLAS_INTERFACE64) - set(WHISPER_BLAS_LIB "openblas64") - else () - set(WHISPER_BLAS_LIB "openblas") - endif () - set(BLA_PKGCONFIG_BLAS ${WHISPER_BLAS_LIB}) - # OpenBLAS prebuilt libraries for Windows do not have "64" suffix in filename. - # (But .pc file has "64" suffix in filename for USE_64BITINT=1 Windows build.) 
- if (MSVC) - set(WHISPER_BLAS_LIB "openblas") - endif () -endif() - if (WHISPER_BLAS) - if (NOT "$ENV{OPENBLAS_PATH}" STREQUAL "") - if (WHISPER_STATIC) - set(WHISPER_BLAS_LIB_PREFIX ${CMAKE_STATIC_LIBRARY_PREFIX}) - set(WHISPER_BLAS_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) - else () - if (CMAKE_IMPORT_LIBRARY_SUFFIX) - set(WHISPER_BLAS_LIB_PREFIX ${CMAKE_IMPORT_LIBRARY_PREFIX}) - set(WHISPER_BLAS_LIB_SUFFIX ${CMAKE_IMPORT_LIBRARY_SUFFIX}) - else () - set(WHISPER_BLAS_LIB_PREFIX ${CMAKE_SHARED_LIBRARY_PREFIX}) - set(WHISPER_BLAS_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) - endif () - endif () - # OpenBLAS prebuilt libraries hardcode "lib" prefix in filename even on Windows - if (WHISPER_OPENBLAS) - set(WHISPER_BLAS_LIB_PREFIX "lib") - endif () - message(STATUS "BLAS compatible library path provided") - set(BLAS_LIBRARIES "$ENV{OPENBLAS_PATH}/lib/${WHISPER_BLAS_LIB_PREFIX}${WHISPER_BLAS_LIB}${WHISPER_BLAS_LIB_SUFFIX}") - message(STATUS "Libraries ${BLAS_LIBRARIES}") - set(BLAS_INCLUDE_DIRS "$ENV{OPENBLAS_PATH}/include") - message(STATUS "Include dirs ${BLAS_INCLUDE_DIRS}") - if (NOT EXISTS "${BLAS_LIBRARIES}") - message(FATAL_ERROR "BLAS library was not found. Environment variable OPENBLAS_PATH misdefined.") - endif () - set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS) - include_directories(${BLAS_INCLUDE_DIRS}) - set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES}) - else () - if (WHISPER_STATIC) - # FindBLAS.cmake pkg-config logic seems incomplete, because when - # BLA_STATIC is on, then it should use pkg_check_modules_static - # instead of pkg_check_modules. - # Some manual variable overriding may be necessary if you don't - # achieve desired results. - set(BLA_STATIC 1) - endif () - set(BLA_VENDOR ${WHISPER_BLAS_VENDOR}) - if (WHISPER_OPENBLAS_INTERFACE64) - set(BLA_SIZEOF_INTEGER 8) - else () - set(BLA_SIZEOF_INTEGER 4) - endif() - set(BLA_PREFER_PKGCONFIG 1) - find_package(BLAS) - - if(BLAS_FOUND) - message(STATUS "BLAS compatible library found") - message(STATUS "Libraries ${BLAS_LIBRARIES}") - if (NOT DEFINED BLAS_INCLUDE_DIRS) - if (PKGC_BLAS_FOUND) - set(BLAS_INCLUDE_DIRS "${PKGC_BLAS_INCLUDE_DIRS}") - else () - find_path(BLAS_INCLUDE_DIRS cblas.h /usr/include/openblas) + if (WHISPER_STATIC) + set(BLA_STATIC ON) + endif() + #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) + # set(BLA_SIZEOF_INTEGER 8) + #endif() + + set(BLA_VENDOR ${WHISPER_BLAS_VENDOR}) + find_package(BLAS) + + if (BLAS_FOUND) + message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") + + if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${WHISPER_BLAS_VENDOR} MATCHES "Apple")) + # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. 
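+        # (FindBLAS itself only reports BLAS_LIBRARIES and BLAS_LINKER_FLAGS,
+        # so the include path has to be recovered separately, via pkg-config below);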
+ # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 + find_package(PkgConfig REQUIRED) + if (${WHISPER_BLAS_VENDOR} MATCHES "Generic") + pkg_check_modules(DepBLAS REQUIRED blas) + elseif (${WHISPER_BLAS_VENDOR} MATCHES "OpenBLAS") + # As of openblas v0.3.22, the 64-bit is named openblas64.pc + pkg_check_modules(DepBLAS openblas64) + if (NOT DepBLAS_FOUND) + pkg_check_modules(DepBLAS REQUIRED openblas) + endif() + elseif (${WHISPER_BLAS_VENDOR} MATCHES "FLAME") + pkg_check_modules(DepBLAS REQUIRED blis) + elseif (${WHISPER_BLAS_VENDOR} MATCHES "ATLAS") + pkg_check_modules(DepBLAS REQUIRED blas-atlas) + elseif (${WHISPER_BLAS_VENDOR} MATCHES "FlexiBLAS") + pkg_check_modules(DepBLAS REQUIRED flexiblas_api) + elseif (${WHISPER_BLAS_VENDOR} MATCHES "Intel") + # all Intel* libraries share the same include path + pkg_check_modules(DepBLAS REQUIRED mkl-sdl) + elseif (${WHISPER_BLAS_VENDOR} MATCHES "NVHPC") + # this doesn't provide pkg-config + # suggest to assign BLAS_INCLUDE_DIRS on your own + if ("${NVHPC_VERSION}" STREQUAL "") + message(WARNING "Better to set NVHPC_VERSION") + else() + set(DepBLAS_FOUND ON) + set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include") endif() endif() - message(STATUS "Include dirs ${BLAS_INCLUDE_DIRS}") - set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS) - include_directories(${BLAS_INCLUDE_DIRS}) - set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES}) - else() - message(FATAL_ERROR "BLAS library was not found") + if (DepBLAS_FOUND) + set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS}) + else() + message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically" + " detected by pkgconfig, trying to find cblas.h from possible paths...") + find_path(BLAS_INCLUDE_DIRS + NAMES cblas.h + HINTS + /usr/include + /usr/local/include + /usr/include/openblas + /opt/homebrew/opt/openblas/include + /usr/local/opt/openblas/include + /usr/include/x86_64-linux-gnu/openblas/include + ) + endif() endif() - endif () -endif () + + message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") + + add_compile_options(${BLAS_LINKER_FLAGS}) + + add_compile_definitions(GGML_USE_BLAS) + + if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${WHISPER_BLAS_VENDOR} MATCHES "Generic" OR ${WHISPER_BLAS_VENDOR} MATCHES "Intel")) + add_compile_definitions(GGML_BLAS_USE_MKL) + endif() + + set(GGML_HEADERS_BLAS ggml-blas.h) + set(GGML_SOURCES_BLAS ggml-blas.cpp) + + set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES}) + set(WHISPER_EXTRA_INCLUDES ${WHISPER_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) + else() + message(WARNING "BLAS not found, please refer to " + "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" + " to set correct WHISPER_BLAS_VENDOR") + endif() +endif() if (WHISPER_MKL) find_package(MKL CONFIG REQUIRED PATHS $ENV{MKLROOT}) @@ -712,6 +711,7 @@ add_library(${TARGET} ${GGML_SOURCES_CUDA} ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL} ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} + ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} whisper.h whisper.cpp ) diff --git a/Makefile b/Makefile index cc576018a47..adcbdbfe80d 100644 --- a/Makefile +++ b/Makefile @@ -35,6 +35,8 @@ CXXV := $(shell $(CXX) --version | head -n 1) # Mac OS + Arm can report x86_64 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 ifeq ($(UNAME_S),Darwin) + WHISPER_NO_OPENMP := 1 + ifneq ($(UNAME_P),arm) SYSCTL_M := $(shell sysctl -n hw.optional.arm64) ifeq 
($(SYSCTL_M),1) @@ -222,10 +224,14 @@ endif ifndef WHISPER_NO_ACCELERATE # Mac M1 - include Accelerate framework ifeq ($(UNAME_S),Darwin) - CFLAGS += -DGGML_USE_ACCELERATE - CFLAGS += -DACCELERATE_NEW_LAPACK - CFLAGS += -DACCELERATE_LAPACK_ILP64 - LDFLAGS += -framework Accelerate + CFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS + CFLAGS += -DACCELERATE_NEW_LAPACK + CFLAGS += -DACCELERATE_LAPACK_ILP64 + CXXFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS + CXXFLAGS += -DACCELERATE_NEW_LAPACK + CXXFLAGS += -DACCELERATE_LAPACK_ILP64 + LDFLAGS += -framework Accelerate + WHISPER_OBJ += ggml-blas.o endif endif @@ -248,29 +254,31 @@ ifndef WHISPER_NO_METAL endif endif -ifneq ($(filter-out 0,$(WHISPER_OPENBLAS)),) # OpenBLAS - WHISPER_OPENBLAS_INTERFACE64 ?= 0 # use 32-bit interface by default - ifneq ($(filter-out 0,$(WHISPER_OPENBLAS_INTERFACE64)),) - WHISPER_BLAS_LIB := openblas64 - else - WHISPER_BLAS_LIB := openblas - endif - ifneq ($(OPENBLAS_PATH),) - WHISPER_BLAS_CFLAGS := -I$(OPENBLAS_PATH)/include - WHISPER_BLAS_LDFLAGS := -L$(OPENBLAS_PATH)/lib -l$(WHISPER_BLAS_LIB) - else - WHISPER_BLAS_LIB_PC_EXISTS := $(shell pkg-config --exists $(WHISPER_BLAS_LIB) && echo 1) - ifneq ($(filter-out 0,$(WHISPER_BLAS_LIB_PC_EXISTS)),) - WHISPER_BLAS_CFLAGS := $(shell pkg-config --cflags $(WHISPER_BLAS_LIB)) - WHISPER_BLAS_LDFLAGS := $(shell pkg-config --libs $(WHISPER_BLAS_LIB)) - else - WHISPER_BLAS_CFLAGS := -I/usr/include/openblas - WHISPER_BLAS_LDFLAGS := -l$(WHISPER_BLAS_LIB) - endif - endif - CFLAGS += $(WHISPER_BLAS_CFLAGS) -DGGML_USE_OPENBLAS - LDFLAGS += $(WHISPER_BLAS_LDFLAGS) -endif +ifndef WHISPER_NO_OPENMP + CXXFLAGS += -DGGML_USE_OPENMP + CFLAGS += -fopenmp + CXXFLAGS += -fopenmp +endif # WHISPER_NO_OPENMP + +ifdef WHISPER_OPENBLAS + CXXFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) + CFLAGS += $(shell pkg-config --cflags-only-other openblas) + LDFLAGS += $(shell pkg-config --libs openblas) + WHISPER_OBJ += ggml-blas.o +endif # WHISPER_OPENBLAS + +ifdef WHISPER_OPENBLAS64 + CXXFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64) + CFLAGS += $(shell pkg-config --cflags-only-other openblas64) + LDFLAGS += $(shell pkg-config --libs openblas64) + WHISPER_OBJ += ggml-blas.o +endif # WHISPER_OPENBLAS64 + +ifdef WHISPER_BLIS + CXXFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis + LDFLAGS += -lblis -L/usr/local/lib + WHISPER_OBJ += ggml-blas.o +endif # WHISPER_BLIS ifdef WHISPER_CUBLAS # WHISPER_CUBLAS is deprecated and will be removed in the future @@ -402,6 +410,9 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h $(CC) $(CFLAGS) -c $< -o $@ +ggml-blas.o: ggml-blas.cpp ggml-blas.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + WHISPER_OBJ += ggml.o ggml-alloc.o ggml-backend.o ggml-quants.o whisper.o: whisper.cpp whisper.h whisper-mel.hpp ggml.h ggml-cuda.h From 8afc68f19a55ff7710c0de4edcb6e29e23dc18ea Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 16 Jun 2024 19:23:32 +0300 Subject: [PATCH 6/8] scripts : sync ggml-blas --- scripts/sync-ggml-am.sh | 4 ++++ scripts/sync-ggml.sh | 2 ++ 2 files changed, 6 insertions(+) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index 4614b45919a..397f6c8687d 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -96,6 +96,8 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then # src/ggml-alloc.c -> ggml-alloc.c # src/ggml-backend-impl.h -> ggml-backend-impl.h # src/ggml-backend.c -> ggml-backend.c + # src/ggml-blas.cpp -> 
ggml-blas.cpp + # src/ggml-blas.h -> ggml-blas.h # src/ggml-common.h -> ggml-common.h # src/ggml-cuda/* -> ggml-cuda/ # src/ggml-cuda.cu -> ggml-cuda.cu @@ -131,6 +133,8 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then -e 's/src\/ggml-alloc\.c/ggml-alloc.c/g' \ -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \ -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \ + -e 's/src\/ggml-blas\.cpp/ggml-blas.cpp/g' \ + -e 's/src\/ggml-blas\.h/ggml-blas.h/g' \ -e 's/src\/ggml-common\.h/ggml-common.h/g' \ -e 's/src\/ggml-cuda\//ggml-cuda\//g' \ -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \ diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index c718a07bb61..9c70a82db28 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -5,6 +5,8 @@ cp -rpv ../ggml/src/ggml-impl.h ./ggml-impl.h cp -rpv ../ggml/src/ggml-alloc.c ./ggml-alloc.c cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml-backend-impl.h cp -rpv ../ggml/src/ggml-backend.c ./ggml-backend.c +cp -rpv ../ggml/src/ggml-blas.cpp ./ggml-blas.cpp +cp -rpv ../ggml/src/ggml-blas.h ./ggml-blas.h cp -rpv ../ggml/src/ggml-common.h ./ggml-common.h cp -rpv ../ggml/src/ggml-cuda/* ./ggml-cuda/ cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu From 70dd7b64cd29cd2cdd2de68cd873281e88887dc7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 16 Jun 2024 19:23:55 +0300 Subject: [PATCH 7/8] fix : remove extra files --- src/ggml-blas.cpp | 363 ---------------------------------------------- src/ggml-blas.h | 23 --- 2 files changed, 386 deletions(-) delete mode 100644 src/ggml-blas.cpp delete mode 100644 src/ggml-blas.h diff --git a/src/ggml-blas.cpp b/src/ggml-blas.cpp deleted file mode 100644 index d709a357bbf..00000000000 --- a/src/ggml-blas.cpp +++ /dev/null @@ -1,363 +0,0 @@ -#include "ggml-blas.h" -#include "ggml-backend-impl.h" - -#include -#include - -#if defined(GGML_USE_ACCELERATE) -# include -#elif defined(GGML_BLAS_USE_MKL) -# include -#else -# include -# ifdef BLIS_ENABLE_CBLAS -# include -# endif -#endif - -struct ggml_backend_blas_context { - int n_threads = GGML_DEFAULT_N_THREADS; - std::unique_ptr work_data; - size_t work_size = 0; -#ifndef GGML_USE_OPENMP - std::vector> tasks; -#endif -}; - -// helper function to determine if it is better to use BLAS or not -// for large matrices, BLAS is faster -static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these - if (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - src1->type == GGML_TYPE_F32 && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { - - /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ - return true; - } - - return false; -} - -static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_TENSOR_BINARY_OP_LOCALS - - const enum ggml_type type = src0->type; - - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - - // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == ggml_type_size(type)); - GGML_ASSERT(nb10 == ggml_type_size(src1->type)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= 
nb3); - - // broadcast factors - const int64_t r2 = ne12/ne02; - const int64_t r3 = ne13/ne03; - - const int64_t ne_plane = ne01*ne00; - const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float); - - if (ctx->work_size < desired_wsize) { - ctx->work_data.reset(new char[desired_wsize]); - ctx->work_size = desired_wsize; - } - void * wdata = ctx->work_data.get(); - - // convert src0 to float - if (type != GGML_TYPE_F32) { - ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type); - ggml_to_float_t const to_float = type_traits.to_float; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; - - const int min_cols_per_thread = 4096; - const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); - const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1); - -#ifdef GGML_USE_OPENMP - #pragma omp parallel for num_threads(n_threads) - for (int64_t i01 = 0; i01 < ne01; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); - } -#else - for (int i = 1; i < n_threads; i++) { - const int64_t start = i*ne01/n_threads; - const int64_t end = (i + 1)*ne01/n_threads; - if (start < end) { - ctx->tasks.push_back(std::async(std::launch::async, [=]() { - for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); - } - })); - } - } - { - // reuse the current thread for the first task - const int64_t start = 0; - const int64_t end = ne01/n_threads; - for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); - } - } -#endif - } - } - -#ifndef GGML_USE_OPENMP - // wait for all tasks to finish - for (auto & task : ctx->tasks) { - task.get(); - } - ctx->tasks.clear(); -#endif - } - -#if defined(OPENBLAS_VERSION) - openblas_set_num_threads(ctx->n_threads); -#endif - -#if defined(BLIS_ENABLE_CBLAS) - bli_thread_set_num_threads(ctx->n_threads); -#endif - - for (int64_t i13 = 0; i13 < ne13; i13++) { - for (int64_t i12 = 0; i12 < ne12; i12++) { - const int64_t i03 = i13/r3; - const int64_t i02 = i12/r2; - - const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); - const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - - if (type != GGML_TYPE_F32) { - x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; - } - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne1, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } -} - -static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(ne0 == ne00); - GGML_ASSERT(ne1 == ne10); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne3 == ne13); - GGML_ASSERT(ne03 == ne13); - - // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == sizeof(float)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - // GGML_ASSERT(nb0 <= nb1); - // GGML_ASSERT(nb1 <= nb2); - // GGML_ASSERT(nb2 <= nb3); - - // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) - // src0: (k,n) - // src1: (k,m) - // dst: (m,n) - // - // 
Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) - // Also expressed as (major,minor) - // a: (m,k): so src1 transposed - // b: (k,n): so src0 - // c: (m,n) - // - // However, if ggml_is_transposed(src1) is true, then - // src1->data already contains a transposed version, so sgemm mustn't - // transpose it further. - - int n = src0->ne[0]; - int k = src0->ne[1]; - int m = src1->ne[0]; - - CBLAS_TRANSPOSE transposeA; - int lda; - - if (!ggml_is_transposed(src1)) { - transposeA = CblasTrans; - lda = m; - } else { - transposeA = CblasNoTrans; - lda = k; - } - - float * a = (float *) ((char *) src1->data); - float * b = (float *) ((char *) src0->data); - float * c = (float *) ((char *) dst->data); - - cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); - - GGML_UNUSED(ctx); -} - -// backend interface - -GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { - return "BLAS"; - - GGML_UNUSED(backend); -} - -GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { - ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; - delete ctx; - delete backend; -} - -GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) { - return ggml_backend_cpu_buffer_type(); - - GGML_UNUSED(backend); -} - -GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; - - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; - - switch (node->op) { - case GGML_OP_MUL_MAT: - ggml_backend_blas_mul_mat(ctx, node); - break; - - case GGML_OP_OUT_PROD: - ggml_backend_blas_out_prod(ctx, node); - break; - - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - break; - - default: - fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node)); - GGML_ASSERT(false); - } - } - - return GGML_STATUS_SUCCESS; - - GGML_UNUSED(backend); -} - -GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { - const struct ggml_tensor * src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - - return (op->op == GGML_OP_MUL_MAT && ggml_backend_blas_use_blas(op)) || - (op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 && - op->src[1]->type == GGML_TYPE_F32 && - ggml_is_matrix(src0) && - ggml_is_matrix(src1) && - ggml_is_contiguous(src0) && - (ggml_is_contiguous(src1) || ggml_is_transposed(src1))); - - GGML_UNUSED(backend); -} - -GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - return ggml_backend_buft_is_host(buft); - - GGML_UNUSED(backend); -} - -static struct ggml_backend_i blas_backend_i = { - /* .get_name = */ ggml_backend_blas_name, - /* .free = */ ggml_backend_blas_free, - /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_blas_graph_compute, - /* .supports_op = */ ggml_backend_blas_supports_op, - /* 
.supports_buft = */ ggml_backend_blas_supports_buft, - /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, - /* .event_synchronize = */ NULL, -}; - -static ggml_guid_t ggml_backend_blas_guid(void) { - static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d }; - return &guid; -} - -ggml_backend_t ggml_backend_blas_init(void) { - ggml_backend_blas_context * ctx = new ggml_backend_blas_context; - - ggml_backend_t backend = new ggml_backend { - /* .guid = */ ggml_backend_blas_guid(), - /* .interface = */ blas_backend_i, - /* .context = */ ctx, - }; - -#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) - if (openblas_get_parallel() != OPENBLAS_OPENMP) { - fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__); - } -#endif - -#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP) - fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); -#endif - - return backend; -} - -GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) { - return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid()); -} - -void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) { - GGML_ASSERT(ggml_backend_is_blas(backend_blas)); - - ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context; - ctx->n_threads = n_threads; -} diff --git a/src/ggml-blas.h b/src/ggml-blas.h deleted file mode 100644 index f2e37de06f6..00000000000 --- a/src/ggml-blas.h +++ /dev/null @@ -1,23 +0,0 @@ -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - - -#ifdef __cplusplus -extern "C" { -#endif - -// backend API -GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void); - -GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend); - -// number of threads used for conversion to float -// for openblas and blis, this will also set the number of threads used for blas operations -GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); - - -#ifdef __cplusplus -} -#endif From 2c76334d68c179c2cac1473abcdd1aaa15a0c63c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 18 Jun 2024 09:37:20 +0300 Subject: [PATCH 8/8] whisper : use ggml_backend_sched (#2239) * whisper : use ggml_backend_sched (wip) * use sched in whisper_allocr * whisper : single backend in whisper_context * whisper : remove whisper_state->backends_used * whisper : remove whisper_context->backend * whisper : reset scheduler after init * whisper : fix external encoder (e.g. 
CoreML) * whisper : cleanup * whisper : handle null GPU buffer types + fix sycl --------- Co-authored-by: slaren --- ggml-backend.c | 15 ++- ggml-backend.h | 3 + whisper.cpp | 281 ++++++++++++++++++++++++++++++------------------- 3 files changed, 189 insertions(+), 110 deletions(-) diff --git a/ggml-backend.c b/ggml-backend.c index 2bec7bea38a..17429794950 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1706,14 +1706,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { bool backend_ids_changed = false; for (int i = 0; i < sched->graph->n_nodes; i++) { - if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) { + if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] && + sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) { backend_ids_changed = true; break; } } if (!backend_ids_changed) { for (int i = 0; i < sched->graph->n_leafs; i++) { - if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) { + if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] && + sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) { backend_ids_changed = true; break; } @@ -1977,6 +1979,15 @@ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) { return sched->n_copies; } +int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) { + return sched->n_backends; +} + +ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) { + GGML_ASSERT(i >= 0 && i < sched->n_backends); + return sched->backends[i]; +} + size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) { int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); diff --git a/ggml-backend.h b/ggml-backend.h index 47fd8147517..4a38eeb5c23 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -182,6 +182,9 @@ extern "C" { // Initialize backend buffers from a measure graph GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched); + GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i); + // Get the number of splits of the last graph GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched); diff --git a/whisper.cpp b/whisper.cpp index 0a53a03d03d..4b96e8bcb66 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -17,6 +17,10 @@ #include "ggml-sycl.h" #endif +#ifdef GGML_USE_BLAS +#include "ggml-blas.h" +#endif + #ifdef WHISPER_USE_OPENVINO #include "openvino/whisper-openvino-encoder.h" #endif @@ -179,18 +183,30 @@ static bool ggml_graph_compute_helper( } static bool ggml_graph_compute_helper( - struct ggml_backend * backend, + ggml_backend_sched_t sched, struct ggml_cgraph * graph, int n_threads) { - if (ggml_backend_is_cpu(backend)) { - ggml_backend_cpu_set_n_threads(backend, n_threads); - } + + for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) { + ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i); + if (ggml_backend_is_cpu(backend)) { + ggml_backend_cpu_set_n_threads(backend, n_threads); + } +#ifdef GGML_USE_BLAS + if (ggml_backend_is_blas(backend)) { + ggml_backend_blas_set_n_threads(backend, 
n_threads); + } +#endif #ifdef GGML_USE_METAL - if (ggml_backend_is_metal(backend)) { - ggml_backend_metal_set_n_cb(backend, n_threads); - } + if (ggml_backend_is_metal(backend)) { + ggml_backend_metal_set_n_cb(backend, n_threads); + } #endif - return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS; + } + + bool t = ggml_backend_sched_graph_compute(sched, graph) == GGML_STATUS_SUCCESS; + ggml_backend_sched_reset(sched); + return t; } // faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad" @@ -490,33 +506,41 @@ struct whisper_pair { whisper_pair() : first(A()), second(B()) {} }; -// ggml_allocr wrapper for whisper usage -struct whisper_allocr { - ggml_gallocr_t alloc = nullptr; +// ggml_backend_sched wrapper for whisper usage +struct whisper_sched { + ggml_backend_sched_t sched = nullptr; std::vector meta; }; -static size_t whisper_allocr_size(struct whisper_allocr & allocr) { - return allocr.meta.size() + ggml_gallocr_get_buffer_size(allocr.alloc, 0); +static size_t whisper_sched_size(struct whisper_sched & allocr) { + size_t size = allocr.meta.size(); + for (int i = 0; i < ggml_backend_sched_get_n_backends(allocr.sched); ++i) { + ggml_backend_t backend = ggml_backend_sched_get_backend(allocr.sched, i); + size += ggml_backend_sched_get_buffer_size(allocr.sched, backend); + } + return size; } // measure the memory usage of a graph and prepare the allocr's internal data buffer -static bool whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function && get_graph) { - auto & alloc = allocr.alloc; +static bool whisper_sched_graph_init(struct whisper_sched & allocr, std::vector backends, std::function && get_graph) { + auto & sched = allocr.sched; auto & meta = allocr.meta; - alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + sched = ggml_backend_sched_new(backends.data(), nullptr, backends.size(), WHISPER_MAX_NODES, false); meta.resize(ggml_tensor_overhead()*WHISPER_MAX_NODES + ggml_graph_overhead()); // since there are dependencies between the different graphs, // we need to allocate them instead of only reserving to get the correct compute buffer size - if (!ggml_gallocr_alloc_graph(alloc, get_graph())) { + if (!ggml_backend_sched_alloc_graph(sched, get_graph())) { // failed to allocate the compute buffer WHISPER_LOG_ERROR("%s: failed to allocate the compute buffer\n", __func__); return false; } + + ggml_backend_sched_reset(sched); + return true; } @@ -808,15 +832,13 @@ struct whisper_state { whisper_decoder decoders[WHISPER_MAX_DECODERS]; - ggml_backend_t backend = nullptr; + std::vector backends; - // ggml-alloc: // - stores meta info about the intermediate tensors into the `meta` buffers - // - stores the actual tensor data into the `data` buffers - whisper_allocr alloc_conv; - whisper_allocr alloc_encode; - whisper_allocr alloc_cross; - whisper_allocr alloc_decode; + whisper_sched sched_conv; + whisper_sched sched_encode; + whisper_sched sched_cross; + whisper_sched sched_decode; // result of the encoder struct ggml_tensor * embd_conv = nullptr; @@ -874,8 +896,6 @@ struct whisper_context { whisper_state * state = nullptr; - ggml_backend_t backend = nullptr; - std::string path_model; // populated by whisper_init_from_file_with_params() }; @@ -1061,20 +1081,16 @@ static void whisper_kv_cache_seq_cp( } static uint32_t whisper_kv_cache_get_padding(const struct whisper_context & wctx) { - if (!wctx.params.flash_attn) { + if (!wctx.params.flash_attn || !wctx.params.use_gpu) { 
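+        // neither flash attention nor a GPU backend in use: the KV cache
+        // needs no padding; the flash attention kernels below require the
+        // cache size to be a multiple of 32 (Metal) or 256 (CUDA)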
return 1u; } #ifdef GGML_USE_METAL - if (ggml_backend_is_metal(wctx.backend)) { - return 32u; - } + return 32u; #endif #ifdef GGML_USE_CUDA - if (ggml_backend_is_cuda(wctx.backend)) { - return 256u; - } + return 256u; #endif return 1u; @@ -1211,15 +1227,14 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) { return size; } -static ggml_backend_t whisper_backend_init(const whisper_context_params & params) { - ggml_backend_t backend_gpu = NULL; +static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) { + ggml_backend_t result = NULL; - // initialize the backends #ifdef GGML_USE_CUDA if (params.use_gpu) { WHISPER_LOG_INFO("%s: using CUDA backend\n", __func__); - backend_gpu = ggml_backend_cuda_init(params.gpu_device); - if (!backend_gpu) { + result = ggml_backend_cuda_init(params.gpu_device); + if (!result) { WHISPER_LOG_ERROR("%s: ggml_backend_cuda_init() failed\n", __func__); } } @@ -1229,13 +1244,13 @@ static ggml_backend_t whisper_backend_init(const whisper_context_params & params if (params.use_gpu) { WHISPER_LOG_INFO("%s: using Metal backend\n", __func__); ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); - backend_gpu = ggml_backend_metal_init(); - if (!backend_gpu) { + result = ggml_backend_metal_init(); + if (!result) { WHISPER_LOG_ERROR("%s: ggml_backend_metal_init() failed\n", __func__); - } else if (!ggml_backend_metal_supports_family(backend_gpu, 7)) { + } else if (!ggml_backend_metal_supports_family(result, 7)) { WHISPER_LOG_ERROR("%s: Metal GPU does not support family 7 - falling back to CPU\n", __func__); - ggml_backend_free(backend_gpu); - backend_gpu = NULL; + ggml_backend_free(result); + result = NULL; } } #endif @@ -1243,20 +1258,64 @@ static ggml_backend_t whisper_backend_init(const whisper_context_params & params #ifdef GGML_USE_SYCL if (params.use_gpu) { WHISPER_LOG_INFO("%s: using SYCL backend\n", __func__); - backend_gpu = ggml_backend_sycl_init(params.gpu_device); - if (!backend_gpu) { + result = ggml_backend_sycl_init(params.gpu_device); + if (!result) { WHISPER_LOG_ERROR("%s: ggml_backend_sycl_init() failed\n", __func__); } } #endif - GGML_UNUSED(params); + return result; +} + +static std::vector whisper_backend_init(const whisper_context_params & params) { + std::vector result; + + ggml_backend_t backend_gpu = whisper_backend_init_gpu(params); if (backend_gpu) { - return backend_gpu; + result.push_back(backend_gpu); + } + +#ifdef GGML_USE_BLAS + { + WHISPER_LOG_INFO("%s: using BLAS backend\n", __func__); + ggml_backend_t backend_blas = ggml_backend_blas_init(); + if (!backend_blas) { + WHISPER_LOG_ERROR("%s: ggml_backend_blas_init() failed\n", __func__); + } else { + result.push_back(backend_blas); + } } +#endif + + GGML_UNUSED(params); + + result.push_back(ggml_backend_cpu_init()); + + return result; +} + +static ggml_backend_buffer_type_t whisper_default_buffer_type(const whisper_context_params & params) { + ggml_backend_buffer_type_t result = nullptr; - return ggml_backend_cpu_init(); + params.use_gpu || (result = ggml_backend_cpu_buffer_type()); + +#ifdef GGML_USE_CUDA + result || (result = ggml_backend_cuda_buffer_type(params.gpu_device)); +#endif + +#ifdef GGML_USE_METAL + result || (result = ggml_backend_metal_buffer_type()); +#endif + +#ifdef GGML_USE_SYCL + result || (result = ggml_backend_sycl_buffer_type(params.gpu_device)); +#endif + + result || (result = ggml_backend_cpu_buffer_type()); + + return result; } // load the model from a ggml file @@ 
-1683,21 +1742,15 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con } } - wctx.backend = whisper_backend_init(wctx.params); - if (!wctx.backend) { - WHISPER_LOG_ERROR("%s: failed to initialize the backend\n", __func__); - return false; - } - // allocate tensors in the backend buffers - model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, wctx.backend); + model.buffer = ggml_backend_alloc_ctx_tensors_from_buft(model.ctx, whisper_default_buffer_type(wctx.params)); if (!model.buffer) { WHISPER_LOG_ERROR("%s: failed to allocate memory for the model\n", __func__); return false; } size_t size_main = ggml_backend_buffer_get_size(model.buffer); - WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6); + WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB\n", __func__, ggml_backend_buffer_name(model.buffer), size_main / 1e6); // load weights { @@ -1792,6 +1845,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con } } + ggml_backend_buffer_set_usage(model.buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + wctx.t_load_us = ggml_time_us() - t_start_us; return true; @@ -1828,8 +1883,8 @@ static struct ggml_cgraph * whisper_build_graph_conv( const int n_mels = hparams.n_mels; struct ggml_init_params params = { - /*.mem_size =*/ wstate.alloc_conv.meta.size(), - /*.mem_buffer =*/ wstate.alloc_conv.meta.data(), + /*.mem_size =*/ wstate.sched_conv.meta.size(), + /*.mem_buffer =*/ wstate.sched_conv.meta.data(), /*.no_alloc =*/ true, }; @@ -1837,9 +1892,13 @@ static struct ggml_cgraph * whisper_build_graph_conv( ggml_cgraph * gf = ggml_new_graph(ctx0); + GGML_ASSERT(wstate.mel.tensor); + ggml_tensor * mel_inp = wstate.mel.tensor; + ggml_set_input(mel_inp); + ggml_tensor * mel; - if (mel_inp) { + { const int n_len = int(mel_inp->ne[0]); const int out_s = 2 * n_ctx; const int i0 = std::min(mel_offset, n_len); @@ -1853,16 +1912,12 @@ static struct ggml_cgraph * whisper_build_graph_conv( if (mel_s < out_s) { mel = ggml_pad(ctx0, cur, out_s - mel_s, 0, 0, 0); - } - else { + } else { mel = ggml_cont(ctx0, cur); } } - else { - // just create some tensor so that the graph/buffer size estimation is correct - mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2 * n_ctx, n_mels); - } - ggml_set_name(mel, "mel"); // used with external encoding + + ggml_set_name(mel, "mel"); struct ggml_tensor * cur = nullptr; @@ -1886,6 +1941,7 @@ static struct ggml_cgraph * whisper_build_graph_conv( ggml_build_forward_expand(gf, mel); cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx); + ggml_set_input(cur); // the external encoder will write into this tensor ggml_set_name(cur, "embd_enc"); wstate.embd_enc = cur; @@ -1920,8 +1976,8 @@ static struct ggml_cgraph * whisper_build_graph_encoder( const int n_ctx_pad = GGML_PAD(n_ctx, 256); struct ggml_init_params params = { - /*.mem_size =*/ wstate.alloc_encode.meta.size(), - /*.mem_buffer =*/ wstate.alloc_encode.meta.data(), + /*.mem_size =*/ wstate.sched_encode.meta.size(), + /*.mem_buffer =*/ wstate.sched_encode.meta.data(), /*.no_alloc =*/ true, }; @@ -2160,8 +2216,8 @@ static struct ggml_cgraph * whisper_build_graph_cross( const int n_ctx_pad = GGML_PAD(n_ctx, 256); struct ggml_init_params params = { - /*.mem_size =*/ wstate.alloc_cross.meta.size(), - /*.mem_buffer =*/ wstate.alloc_cross.meta.data(), + /*.mem_size =*/ wstate.sched_cross.meta.size(), + /*.mem_buffer =*/ wstate.sched_cross.meta.data(), /*.no_alloc =*/ true, }; @@ -2242,16 +2298,16 @@ static bool 
whisper_encode_internal( // conv { - auto & alloc = wstate.alloc_conv.alloc; + auto & sched = wstate.sched_conv.sched; ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate, mel_offset); - if (!ggml_gallocr_alloc_graph(alloc, gf)) { + if (!ggml_backend_sched_alloc_graph(sched, gf)) { // should never happen as we pre-allocate the memory return false; } - if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) { + if (!ggml_graph_compute_helper(sched, gf, n_threads)) { return false; } @@ -2269,32 +2325,32 @@ static bool whisper_encode_internal( // encoder if (!whisper_encode_external(wstate)) { - auto & alloc = wstate.alloc_encode.alloc; + auto & sched = wstate.sched_encode.sched; ggml_cgraph * gf = whisper_build_graph_encoder(wctx, wstate); - if (!ggml_gallocr_alloc_graph(alloc, gf)) { + if (!ggml_backend_sched_alloc_graph(sched, gf)) { // should never happen as we pre-allocate the memory return false; } - if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) { + if (!ggml_graph_compute_helper(sched, gf, n_threads)) { return false; } } // cross { - auto & alloc = wstate.alloc_cross.alloc; + auto & sched = wstate.sched_cross.sched; ggml_cgraph * gf = whisper_build_graph_cross(wctx, wstate); - if (!ggml_gallocr_alloc_graph(alloc, gf)) { + if (!ggml_backend_sched_alloc_graph(sched, gf)) { // should never happen as we pre-allocate the memory return false; } - if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) { + if (!ggml_graph_compute_helper(sched, gf, n_threads)) { return false; } } @@ -2336,8 +2392,8 @@ static struct ggml_cgraph * whisper_build_graph_decoder( //WHISPER_LOG_DEBUG("%s: n_past = %d, n_tokens = %d, n_audio_ctx = %d, n_ctx = %d\n", __func__, n_past, n_tokens, n_audio_ctx, n_ctx); struct ggml_init_params params = { - /*.mem_size =*/ wstate.alloc_decode.meta.size(), - /*.mem_buffer =*/ wstate.alloc_decode.meta.data(), + /*.mem_size =*/ wstate.sched_decode.meta.size(), + /*.mem_buffer =*/ wstate.sched_decode.meta.data(), /*.no_alloc =*/ true, }; @@ -2736,11 +2792,11 @@ static bool whisper_decode_internal( // decoder { - auto & alloc = wstate.alloc_decode.alloc; + auto & sched = wstate.sched_decode.sched; ggml_cgraph * gf = whisper_build_graph_decoder(wctx, wstate, batch, save_alignment_heads_QKs, false); - if (!ggml_gallocr_alloc_graph(alloc, gf)) { + if (!ggml_backend_sched_alloc_graph(sched, gf)) { // should never happen as we pre-allocate the memory return false; } @@ -2795,7 +2851,7 @@ static bool whisper_decode_internal( logits = gf->nodes[gf->n_nodes - 1]; - if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) { + if (!ggml_graph_compute_helper(sched, gf, n_threads)) { return false; } } @@ -3299,20 +3355,29 @@ static std::string whisper_openvino_get_path_cache(std::string path_bin) { struct whisper_state * whisper_init_state(whisper_context * ctx) { whisper_state * state = new whisper_state; - state->backend = whisper_backend_init(ctx->params); - if (!state->backend) { + state->backends = whisper_backend_init(ctx->params); + if (state->backends.empty()) { WHISPER_LOG_ERROR("%s: whisper_backend_init() failed\n", __func__); whisper_free_state(state); return nullptr; } - state->mel_calc = whisper_mel_calc_create(state->backend, ctx->model.filters); + state->mel_calc = whisper_mel_calc_create(state->backends[0], ctx->model.filters); + + // init 60s of random mel data + { + const int n_len = 2*100*WHISPER_CHUNK_SIZE; + const int n_mel = ctx->model.filters.n_mel; + + whisper_mel_free(state->mel); + whisper_mel_init(state->mel, 
state->backends[0], n_len, n_len, n_mel); + } // at this point, we don't know yet how many decoders will be used, so we overallocate 3x ctx // in theory, there can be a case where this is not enough, but in practice it should always be enough const int factor = 3; - if (!whisper_kv_cache_init(state->kv_self, state->backend, ctx->itype, + if (!whisper_kv_cache_init(state->kv_self, state->backends[0], ctx->itype, ctx->model.hparams.n_text_state, ctx->model.hparams.n_text_layer, GGML_PAD(ctx->model.hparams.n_text_ctx, 256)*factor)) { @@ -3326,7 +3391,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { WHISPER_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1e6); } - if (!whisper_kv_cache_init(state->kv_cross, state->backend, ctx->itype, + if (!whisper_kv_cache_init(state->kv_cross, state->backends[0], ctx->itype, ctx->model.hparams.n_text_state, ctx->model.hparams.n_text_layer, GGML_PAD(ctx->model.hparams.n_audio_ctx, 256))) { @@ -3340,7 +3405,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { WHISPER_LOG_INFO("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1e6); } - if (!whisper_kv_cache_init(state->kv_pad, state->backend, ctx->itype, + if (!whisper_kv_cache_init(state->kv_pad, state->backends[0], ctx->itype, ctx->model.hparams.n_audio_state, 1, GGML_PAD(ctx->model.hparams.n_audio_ctx, 256))) { @@ -3356,7 +3421,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { // [EXPERIMENTAL] Token-level timestamps with DTW if (ctx->params.dtw_token_timestamps) { - if (!aheads_masks_init(ctx->params, ctx->model.hparams, state->aheads_masks, state->backend)) { + if (!aheads_masks_init(ctx->params, ctx->model.hparams, state->aheads_masks, state->backends[0])) { WHISPER_LOG_ERROR("%s: aheads_masks_init() failed for alignment heads masks\n", __func__); whisper_free_state(state); return nullptr; @@ -3399,7 +3464,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { // conv allocator { - bool ok = whisper_allocr_graph_init(state->alloc_conv, state->backend, + bool ok = whisper_sched_graph_init(state->sched_conv, state->backends, [&]() { return whisper_build_graph_conv(*ctx, *state, 0); }); @@ -3410,12 +3475,12 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { return nullptr; } - WHISPER_LOG_INFO("%s: compute buffer (conv) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_conv) / 1e6); + WHISPER_LOG_INFO("%s: compute buffer (conv) = %7.2f MB\n", __func__, whisper_sched_size(state->sched_conv) / 1e6); } // encoder allocator if (!whisper_encode_external(*state)) { - bool ok = whisper_allocr_graph_init(state->alloc_encode, state->backend, + bool ok = whisper_sched_graph_init(state->sched_encode, state->backends, [&]() { return whisper_build_graph_encoder(*ctx, *state); }); @@ -3426,12 +3491,12 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { return nullptr; } - WHISPER_LOG_INFO("%s: compute buffer (encode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_encode) / 1e6); + WHISPER_LOG_INFO("%s: compute buffer (encode) = %7.2f MB\n", __func__, whisper_sched_size(state->sched_encode) / 1e6); } // cross allocator { - bool ok = whisper_allocr_graph_init(state->alloc_cross, state->backend, + bool ok = whisper_sched_graph_init(state->sched_cross, state->backends, [&]() { return whisper_build_graph_cross(*ctx, *state); }); @@ -3442,12 +3507,12 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { return nullptr; } - WHISPER_LOG_INFO("%s: 
compute buffer (cross) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_cross) / 1e6); + WHISPER_LOG_INFO("%s: compute buffer (cross) = %7.2f MB\n", __func__, whisper_sched_size(state->sched_cross) / 1e6); } // decoder allocator { - bool ok = whisper_allocr_graph_init(state->alloc_decode, state->backend, + bool ok = whisper_sched_graph_init(state->sched_decode, state->backends, [&]() { const auto & hparams = ctx->model.hparams; @@ -3466,7 +3531,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { return nullptr; } - WHISPER_LOG_INFO("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_decode) / 1e6); + WHISPER_LOG_INFO("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_sched_size(state->sched_decode) / 1e6); } return state; @@ -3746,12 +3811,14 @@ void whisper_free_state(struct whisper_state * state) { whisper_batch_free(state->batch); - ggml_gallocr_free(state->alloc_conv.alloc); - ggml_gallocr_free(state->alloc_encode.alloc); - ggml_gallocr_free(state->alloc_cross.alloc); - ggml_gallocr_free(state->alloc_decode.alloc); + ggml_backend_sched_free(state->sched_conv.sched); + ggml_backend_sched_free(state->sched_encode.sched); + ggml_backend_sched_free(state->sched_cross.sched); + ggml_backend_sched_free(state->sched_decode.sched); - ggml_backend_free(state->backend); + for (auto & backend : state->backends) { + ggml_backend_free(backend); + } // [EXPERIMENTAL] Token-level timestamps with DTW aheads_masks_free(state->aheads_masks); @@ -3768,8 +3835,6 @@ void whisper_free(struct whisper_context * ctx) { whisper_free_state(ctx->state); - ggml_backend_free(ctx->backend); - delete ctx; } } @@ -3800,7 +3865,7 @@ int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_s // 2. the time to transcribe audios this long will be dominated by the decoding time, so the mel calculation // taking longer is not a major concern if (!state->mel_calc_fallback) { - state->mel_calc_fallback = new mel_calc_cpu(state->backend, ctx->model.filters); + state->mel_calc_fallback = new mel_calc_cpu(state->backends[0], ctx->model.filters); } state->mel = state->mel_calc_fallback->calculate({samples, n_samples}, n_threads); } @@ -3837,7 +3902,7 @@ int whisper_set_mel_with_state( } whisper_mel_free(state->mel); - whisper_mel_init(state->mel, ctx->backend, n_len, n_len, n_mel); + whisper_mel_init(state->mel, state->backends[0], n_len, n_len, n_mel); ggml_backend_tensor_set(state->mel.tensor, data, 0, ggml_nbytes(state->mel.tensor));
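
For reference, a minimal sketch (not part of the patch series) of how the pieces introduced above compose: the BLAS backend from ggml-blas.h registered alongside the CPU backend in a ggml_backend_sched, mirroring the flow in whisper.cpp. It uses only APIs that appear in these diffs (ggml_backend_blas_init, ggml_backend_blas_set_n_threads, ggml_backend_cpu_set_n_threads, ggml_backend_sched_new, ggml_backend_sched_alloc_graph, ggml_backend_sched_graph_compute, ggml_backend_sched_reset); the graph_size value and the error handling are illustrative assumptions.

    #include "ggml.h"
    #include "ggml-backend.h"
    #include "ggml-blas.h"

    // build a BLAS + CPU backend list and run one graph through a scheduler;
    // large mul_mat nodes are offloaded to BLAS, everything else falls back
    // to the CPU backend (the last backend in the list)
    static bool compute_with_sched(struct ggml_cgraph * graph, int n_threads) {
        ggml_backend_t backends[2];
        int n_backends = 0;

        ggml_backend_t blas = ggml_backend_blas_init();
        if (blas) {
            ggml_backend_blas_set_n_threads(blas, n_threads);
            backends[n_backends++] = blas;
        }

        ggml_backend_t cpu = ggml_backend_cpu_init();
        ggml_backend_cpu_set_n_threads(cpu, n_threads);
        backends[n_backends++] = cpu;

        // nullptr buffer types = use each backend's default buffer type;
        // graph_size (assumed 1024 here) bounds the graph the sched can hold
        ggml_backend_sched_t sched = ggml_backend_sched_new(
                backends, nullptr, n_backends, /*graph_size*/ 1024, /*parallel*/ false);

        bool ok = ggml_backend_sched_alloc_graph(sched, graph) &&
                  ggml_backend_sched_graph_compute(sched, graph) == GGML_STATUS_SUCCESS;

        ggml_backend_sched_reset(sched); // allow a different graph on the next call

        ggml_backend_sched_free(sched);
        for (int i = 0; i < n_backends; ++i) {
            ggml_backend_free(backends[i]);
        }
        return ok;
    }

As in the whisper.cpp changes above, resetting the scheduler after each compute is what allows graphs with different topologies to reuse the same scheduler and compute buffers.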