ggml : allocate contexts on the heap

ggerganov · ggerganov · commit 2d9c313b49b9 · 2024-10-31T12:46:20.000+02:00
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
@@ -217,7 +217,6 @@
 
 #define GGML_MAX_DIMS           4
 #define GGML_MAX_PARAMS         2048
-#define GGML_MAX_CONTEXTS       64
 #define GGML_MAX_SRC            10
 #define GGML_MAX_N_THREADS      512
 #define GGML_MAX_OP_PARAMS      64
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
@@ -3129,7 +3129,7 @@ static enum ggml_status ggml_metal_graph_compute(
 
 // default buffer
 static id<MTLDevice> g_backend_device = nil;
-static int g_backend_device_ref_count = 0;
+static int g_backend_device_ref_count = 0; // TODO: make thread-safe
 
 static id<MTLDevice> ggml_backend_metal_get_device(void) {
     if (g_backend_device == nil) {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
@@ -308,6 +308,8 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
 }
 
 #define GGML_DEBUG 0
+#define GGML_MAX_CONTEXTS 64 // pre-allocated contexts in static memory
+
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
 
@@ -1985,7 +1987,7 @@ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
 
 struct ggml_context {
     size_t mem_size;
-    void* mem_buffer;
+    void * mem_buffer;
     bool   mem_buffer_owned;
     bool   no_alloc;
     bool   no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
@@ -3839,7 +3841,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         is_first_call = false;
     }
 
-    // find non-used context in g_state
+    // find an unused static context in g_state
     struct ggml_context * ctx = NULL;
 
     for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
@@ -3852,12 +3854,12 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         }
     }
 
-    if (ctx == NULL) {
-        GGML_LOG_ERROR("%s: ran out of contexts (max = %d)\n", __func__, GGML_MAX_CONTEXTS);
+    ggml_critical_section_end();
 
-        ggml_critical_section_end();
+    if (ctx == NULL) {
+        GGML_PRINT_DEBUG("%s: no static contexts available, allocating on the heap\n", __func__);
 
-        return NULL;
+        ctx = GGML_ALIGNED_MALLOC(sizeof(struct ggml_context));
     }
 
     // allow to call ggml_init with 0 size
@@ -3886,8 +3888,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
-    ggml_critical_section_end();
-
     return ctx;
 }
 
@@ -3896,6 +3896,10 @@ void ggml_free(struct ggml_context * ctx) {
         return;
     }
 
+    if (ctx->mem_buffer_owned) {
+        GGML_ALIGNED_FREE(ctx->mem_buffer);
+    }
+
     // make this function thread safe
     ggml_critical_section_start();
 
@@ -3905,23 +3909,19 @@ void ggml_free(struct ggml_context * ctx) {
         if (&g_state.contexts[i].context == ctx) {
             g_state.contexts[i].used = false;
 
-            GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
-                    __func__, i, ggml_used_mem(ctx));
-
-            if (ctx->mem_buffer_owned) {
-                GGML_ALIGNED_FREE(ctx->mem_buffer);
-            }
+            GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n", __func__, i, ggml_used_mem(ctx));
 
             found = true;
             break;
         }
     }
 
+    ggml_critical_section_end();
+
     if (!found) {
-        GGML_PRINT_DEBUG("%s: context not found\n", __func__);
+        // not one of the static contexts in g_state, so ggml_init allocated it on the heap
+        GGML_ALIGNED_FREE(ctx);
     }
-
-    ggml_critical_section_end();
 }
 
 size_t ggml_used_mem(const struct ggml_context * ctx) {