Add option not to abort on cuda OOM

WilliamTambellini · WilliamTambellini · commit 5c4d9b66a292 · 2025-02-13T10:11:15.000-08:00
Warning: Not ready for merge.
Add option not to abort on cuda OOM but throw/return a ggml_status.
The goal of this ticket is NOT to be able to continue inference after an
OOM, but just to allow a clean, controlled exit at a higher level.
No change to default behavior (abort).
Retouch ggml_tallocr_alloc to return a ggml_status.
Add a new unit test to check the no-abort flow (skipped if the environment
variable GGML_CUDA_NO_ABORT is not set).
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,6 @@
 build/
+release/
+debug/
 build-*/
 out/
 tmp/
diff --git a/include/ggml-alloc.h b/include/ggml-alloc.h
@@ -53,8 +53,8 @@ GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);
 // call with a worst-case graph to avoid buffer reallocations
 // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
 // returns false if the buffer allocation failed
-GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-GGML_API bool ggml_gallocr_reserve_n(
+GGML_API enum ggml_status ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API enum ggml_status ggml_gallocr_reserve_n(
     ggml_gallocr_t galloc,
     struct ggml_cgraph * graph,
     const int * node_buffer_ids,
diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c
@@ -94,7 +94,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
     size = GGML_PAD(size, talloc->alignment);
 
     if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
-        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate tensor '%s' (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
         GGML_ABORT("not enough space in the buffer");
     }
@@ -378,6 +378,7 @@ struct ggml_gallocr {
 };
 
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
+    //GGML_LOG_TRACE("%s: nbufs=%d\n", __func__, n_bufs);
     ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
     GGML_ASSERT(galloc != NULL);
 
@@ -670,7 +671,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     }
 }
 
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+enum ggml_status ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+    //GGML_LOG_DEBUG("%s:  \n", __func__);
     size_t min_hash_size = graph->n_nodes + graph->n_leafs;
     // add 25% margin to avoid hash collisions
     min_hash_size += min_hash_size / 4;
@@ -771,16 +773,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
                 GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
-                return false;
+                return GGML_STATUS_ALLOC_FAILED;
             }
             ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
         }
     }
 
-    return true;
+    return GGML_STATUS_SUCCESS;
 }
 
-bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
+enum ggml_status ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
@@ -865,13 +867,16 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
     return false;
 }
 
+// Check with reviewers: any cons to return a ggml_status here?
 bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
     if (ggml_gallocr_needs_realloc(galloc, graph)) {
         if (galloc->n_buffers == 1) {
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
 #endif
-            if (!ggml_gallocr_reserve(galloc, graph)) {
+            enum ggml_status s = ggml_gallocr_reserve(galloc, graph);
+            if (s != GGML_STATUS_SUCCESS) {
+                GGML_LOG_INFO("%s: ggml_gallocr_reserve failed to reserve. status=%d \n", __func__, s);
                 return false;
             }
         } else {
diff --git a/src/ggml-backend.cpp b/src/ggml-backend.cpp
@@ -39,8 +39,14 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
         // return a dummy buffer for zero-sized allocations
         return ggml_backend_buffer_init(buft, {}, NULL, 0);
     }
-
-    return buft->iface.alloc_buffer(buft, size);
+    ggml_backend_buffer_t b = NULL;
+    try {
+      b = buft->iface.alloc_buffer(buft, size);
+    }  catch (const std::exception &e) {
+        GGML_LOG_ERROR("%s: iface.alloc_buffer failed: %s \n", __func__, e.what());
+        return NULL;
+    }
+    return b;
 }
 
 size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -172,6 +178,7 @@ enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer
 }
 
 ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
+    assert(buffer);
     return buffer->buft;
 }
 
@@ -329,7 +336,16 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
 }
 
 enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    return backend->iface.graph_compute(backend, cgraph);
+    ggml_status s;
+    try {
+        s = backend->iface.graph_compute(backend, cgraph);
+    } catch(std::bad_alloc &e) {
+        return GGML_STATUS_ALLOC_FAILED;
+    }  catch (std::exception &e) {
+        GGML_LOG_INFO("%s: graph_compute threw: %s", __func__, e.what());
+        return  GGML_STATUS_FAILED;
+    }
+    return s;
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
diff --git a/src/ggml-cpu/amx/amx.cpp b/src/ggml-cpu/amx/amx.cpp
@@ -54,6 +54,7 @@ static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, st
     tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
 
     GGML_UNUSED(buffer);
+    return GGML_STATUS_SUCCESS;
 }
 
 static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
diff --git a/src/ggml-cuda/common.cuh b/src/ggml-cuda/common.cuh
@@ -120,8 +120,8 @@ static int ggml_cuda_highest_compiled_arch(const int arch) {
 
 #define GGML_CUDA_MAX_STREAMS 8
 
-[[noreturn]]
-void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
+// Print the error. Will also either abort or throw an exception.
+[[noreturn]] void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
 
 #define CUDA_CHECK_GEN(err, success, error_fn)                                      \
      do {                                                                           \
@@ -162,6 +162,7 @@ static const char * cu_get_error_str(CUresult err) {
     cuGetErrorString(err, &err_str);
     return err_str;
 }
+// Will print error and abort/throw
 #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
 #endif
 
diff --git a/src/ggml-cuda/ggml-cuda.cu b/src/ggml-cuda/ggml-cuda.cu
@@ -70,7 +70,13 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
     GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
     GGML_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ABORT to get a stack trace
-    GGML_ABORT(GGML_CUDA_NAME " error");
+    static const char* GGML_CUDA_NO_ABORT = getenv("GGML_CUDA_NO_ABORT");
+    if (!GGML_CUDA_NO_ABORT) {
+        GGML_ABORT(GGML_CUDA_NAME " error");
+    }
+#ifndef __CUDA_ARCH__
+    throw std::runtime_error(msg);
+#endif
 }
 
 // this is faster on Windows
@@ -92,6 +98,7 @@ int ggml_cuda_get_device() {
     return id;
 }
 
+// Note: Does not abort/throw because does not use CUDA_CHECK
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
     ggml_cuda_set_device(device);
 #if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
@@ -536,7 +543,8 @@ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
 
 static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-
+    GGML_ASSERT(tensor);
+    GGML_LOG_DEBUG("%s: t=%p %s\n", __func__, tensor, tensor->name);
     if (tensor->view_src != NULL) {
         assert(tensor->view_src->buffer->buft == buffer->buft);
         return;
@@ -945,8 +953,14 @@ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg
     // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
     // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
     ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
-
-    return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
+    ggml_backend_buffer_t b = NULL;
+    try {
+        b = ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
+    }  catch (std::exception &e) {
+        GGML_LOG_ERROR("%s: ggml_backend_buffer_init threw: %s \n", __func__, e.what());
+        return NULL;
+    }
+    return b;
 }
 
 static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
diff --git a/src/ggml.c b/src/ggml.c
@@ -1681,6 +1681,7 @@ void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes) {
 }
 
 struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
+    assert(src);
     return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
 }
 
@@ -2328,6 +2329,8 @@ struct ggml_tensor * ggml_concat(
     struct ggml_tensor  * b,
     int                   dim) {
     GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
+    assert(a);
+    assert(b);
 
     int64_t ne[GGML_MAX_DIMS];
     for (int d = 0; d < GGML_MAX_DIMS; ++d) {
@@ -2695,6 +2698,8 @@ struct ggml_tensor * ggml_mul_mat(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b) {
+    assert(a);
+    assert(b);
     GGML_ASSERT(ggml_can_mul_mat(a, b));
     GGML_ASSERT(!ggml_is_transposed(a));
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -412,3 +412,9 @@ add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml)
 add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
 set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
+
+set(TEST_TARGET test-oom)
+add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml)
+add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
diff --git a/tests/test-arange.cpp b/tests/test-arange.cpp
@@ -76,7 +76,7 @@ int main(int /*argc*/, const char** /*argv*/) {
             ggml_backend_cpu_set_n_threads(backend, n_threads);
         }
 
-        ggml_backend_graph_compute(backend, graph);
+        GGML_ASSERT(ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS);
 
         float * output = new float[ggml_nelements(t)];
         ggml_backend_tensor_get(t, output, 0, ggml_nbytes(t));
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -633,7 +633,9 @@ struct test_case {
         ggml_build_forward_expand(gf, out);
 
         // warmup run
-        ggml_backend_graph_compute(backend, gf);
+        ggml_status status = ggml_backend_graph_compute(backend, gf);
+        if (status != GGML_STATUS_SUCCESS)
+            printf("Warning: ggml_backend_graph_compute warmup failed: ggml status=%d \n", status);
 
         // determine number of runs
         int n_runs;
diff --git a/tests/test-mul-mat.cpp b/tests/test-mul-mat.cpp
@@ -151,8 +151,9 @@ struct ggml_tensor* compute(const test_model & model, ggml_gallocr_t allocr) {
         ggml_backend_cpu_set_n_threads(model.backend, n_threads);
     }
 
-
-    ggml_backend_graph_compute(model.backend, gf);
+    ggml_status status = ggml_backend_graph_compute(model.backend, gf);
+    if (status != GGML_STATUS_SUCCESS)
+        return nullptr;
 
     //ggml_graph_print(gf);
 
@@ -313,6 +314,10 @@ int main(void)
     }
 
     struct ggml_tensor * result = compute(model, allocr);
+    if (!result) {
+        printf("ggml_mul_mat: failed to compute graph");
+        return EXIT_FAILURE;
+    }
 
     std::vector<float> out_data(ggml_nelements(result));
 
diff --git a/tests/test-oom.cpp b/tests/test-oom.cpp
diff --git a/tests/test-timestep_embedding.cpp b/tests/test-timestep_embedding.cpp

Original file line number	Diff line number	Diff line change
@@ -1,4 +1,6 @@
 build/
 +release/
 +debug/
 build-*/
 out/
 tmp/
Original file line number	Diff line number	Diff line change
`@@ -94,7 +94,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso`
`94`	`94`	`size = GGML_PAD(size, talloc->alignment);`
`95`	`95`
`96`	`96`	`if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {`
`97`		`- GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",`
	`97`	`+ GGML_LOG_ERROR("%s: not enough space in the buffer to allocate tensor '%s' (needed %zu, available %zu)\n",`
`98`	`98`	`__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);`
`99`	`99`	`GGML_ABORT("not enough space in the buffer");`
`100`	`100`	`}`
`@@ -378,6 +378,7 @@ struct ggml_gallocr {`
`378`	`378`	`};`
`379`	`379`
`380`	`380`	`ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {`
	`381`	`+ //GGML_LOG_TRACE("%s: nbufs=%d\n", __func__, n_bufs);`
`381`	`382`	`ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));`
`382`	`383`	`GGML_ASSERT(galloc != NULL);`
`383`	`384`
`@@ -670,7 +671,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr`
`670`	`671`	`}`
`671`	`672`	`}`
`672`	`673`
`673`		`-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {`
	`674`	`+enum ggml_status ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {`
	`675`	`+ //GGML_LOG_DEBUG("%s: \n", __func__);`
`674`	`676`	`size_t min_hash_size = graph->n_nodes + graph->n_leafs;`
`675`	`677`	`// add 25% margin to avoid hash collisions`
`676`	`678`	`min_hash_size += min_hash_size / 4;`
`@@ -771,16 +773,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c`
`771`	`773`	`galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);`
`772`	`774`	`if (galloc->buffers[i] == NULL) {`
`773`	`775`	`GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);`
`774`		`- return false;`
	`776`	`+ return GGML_STATUS_ALLOC_FAILED;`
`775`	`777`	`}`
`776`	`778`	`ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);`
`777`	`779`	`}`
`778`	`780`	`}`
`779`	`781`
`780`		`- return true;`
	`782`	`+ return GGML_STATUS_SUCCESS;`
`781`	`783`	`}`
`782`	`784`
`783`		`-bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {`
	`785`	`+enum ggml_status ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {`
`784`	`786`	`return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);`
`785`	`787`	`}`
`786`	`788`
`@@ -865,13 +867,16 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph`
`865`	`867`	`return false;`
`866`	`868`	`}`
`867`	`869`
	`870`	`+// Check with reviewers: any cons to return a ggml_status here?`
`868`	`871`	`bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {`
`869`	`872`	`if (ggml_gallocr_needs_realloc(galloc, graph)) {`
`870`	`873`	`if (galloc->n_buffers == 1) {`
`871`	`874`	`#ifndef NDEBUG`
`872`	`875`	`GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);`
`873`	`876`	`#endif`
`874`		`- if (!ggml_gallocr_reserve(galloc, graph)) {`
	`877`	`+ enum ggml_status s = ggml_gallocr_reserve(galloc, graph);`
	`878`	`+ if (s != GGML_STATUS_SUCCESS) {`
	`879`	`+ GGML_LOG_INFO("%s: ggml_gallocr_reserve failed to reserve. status=%d \n", __func__, s);`
`875`	`880`	`return false;`
`876`	`881`	`}`
`877`	`882`	`} else {`
Original file line number	Diff line number	Diff line change
`@@ -54,6 +54,7 @@ static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, st`
`54`	`54`	`tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);`
`55`	`55`
`56`	`56`	`GGML_UNUSED(buffer);`
	`57`	`+ return GGML_STATUS_SUCCESS;`
`57`	`58`	`}`
`58`	`59`
`59`	`60`	`static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,`
Original file line number	Diff line number	Diff line change
`@@ -76,7 +76,7 @@ int main(int /*argc*/, const char** /*argv*/) {`
`76`	`76`	`ggml_backend_cpu_set_n_threads(backend, n_threads);`
`77`	`77`	`}`
`78`	`78`
`79`		`- ggml_backend_graph_compute(backend, graph);`
	`79`	`+ GGML_ASSERT(ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS);`
`80`	`80`
`81`	`81`	`float * output = new float[ggml_nelements(t)];`
`82`	`82`	`ggml_backend_tensor_get(t, output, 0, ggml_nbytes(t));`