Skip to content

Commit 3689d49

Browse files
committed
whisper : reduce ggml_context usage
1 parent 55e4221 commit 3689d49

File tree

2 files changed

+16
-14
lines changed

2 files changed

+16
-14
lines changed

ggml/src/ggml.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3853,7 +3853,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 3853 3853        }
 3854 3854
 3855 3855        if (ctx == NULL) {
 3856      -          GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
      3856 +          GGML_LOG_ERROR("%s: ran out of contexts (max = %d)\n", __func__, GGML_MAX_CONTEXTS);
 3857 3857
 3858 3858        ggml_critical_section_end();
 3859 3859

src/whisper.cpp

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -699,9 +699,9 @@ struct whisper_kv_cache {
  699  699      struct ggml_tensor * k;
  700  700      struct ggml_tensor * v;
  701  701
  702       -   struct ggml_context * ctx = nullptr;
  703       -
  704  702      ggml_backend_buffer_t buffer = nullptr;
       703 +
       704 +   std::vector<uint8_t> ctx_buf;
  705  705  };
  706  706
  707  707  struct whisper_model {
@@ -941,9 +941,11 @@ static bool whisper_kv_cache_init(
  941  941      const int64_t n_mem = n_text_layer*n_ctx;
  942  942      const int64_t n_elements = n_text_state*n_mem;
  943  943
       944 +    cache.ctx_buf.resize(2*ggml_tensor_overhead());
       945 +
  944  946      struct ggml_init_params params = {
  945       -       /*.mem_size   =*/ 2*ggml_tensor_overhead(),
  946       -       /*.mem_buffer =*/ nullptr,
       947 +       /*.mem_size   =*/ cache.ctx_buf.size(),
       948 +       /*.mem_buffer =*/ cache.ctx_buf.data(),
  947  949          /*.no_alloc   =*/ true,
  948  950      };
  949  951

@@ -953,31 +955,31 @@ static bool whisper_kv_cache_init(
  953  955      cache.cells.clear();
  954  956      cache.cells.resize(n_ctx);
  955  957
  956       -   cache.ctx = ggml_init(params);
       958 +   struct ggml_context * ctx = ggml_init(params);
  957  959
  958       -   if (!cache.ctx) {
       960 +   if (!ctx) {
  959  961          WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache context\n", __func__);
  960  962          return false;
  961  963      }
  962  964
  963       -   cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
  964       -   cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
       965 +   cache.k = ggml_new_tensor_1d(ctx, wtype, n_elements);
       966 +   cache.v = ggml_new_tensor_1d(ctx, wtype, n_elements);
  965  967
  966       -   cache.buffer = ggml_backend_alloc_ctx_tensors(cache.ctx, backend);
       968 +   cache.buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
  967  969      if (!cache.buffer) {
  968  970          WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache\n", __func__);
  969  971          return false;
  970  972      }
  971  973
  972  974      ggml_backend_buffer_clear(cache.buffer, 0);
  973  975
       976 +   ggml_free(ctx);
       977 +
  974  978      return true;
  975  979  }
  976  980
  977  981  static void whisper_kv_cache_free(struct whisper_kv_cache & cache) {
  978       -   ggml_free(cache.ctx);
  979  982      ggml_backend_buffer_free(cache.buffer);
  980       -   cache.ctx = nullptr;
  981  983  }
  982  984
  983  985  static bool whisper_kv_cache_find_slot(
@@ -2002,7 +2004,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
 2002 2004
 2003 2005      auto & kv_pad = wstate.kv_pad;
 2004 2006
 2005      -    WHISPER_ASSERT(!!kv_pad.ctx);
      2007 +    WHISPER_ASSERT(!!kv_pad.buffer);
 2006 2008
 2007 2009      const int n_ctx_pad = GGML_PAD(n_ctx, 256);
 2008 2010
@@ -2416,7 +2418,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
 2416 2418
 2417 2419      auto & kv_self = wstate.kv_self;
 2418 2420
 2419      -    WHISPER_ASSERT(!!kv_self.ctx);
      2421 +    WHISPER_ASSERT(!!kv_self.buffer);
 2420 2422
 2421 2423      const int n_ctx = kv_self.size;
 2422 2424      const int n_state = hparams.n_text_state;

0 commit comments

Comments (0)