Skip to content

Commit 54dbc37

Browse files
authored
metal : report OOM errors (#16274)
1 parent b995a10 commit 54dbc37

File tree

2 files changed

+30
-1
lines changed

2 files changed

+30
-1
lines changed

ggml/src/ggml-metal/ggml-metal-context.m

Lines changed: 26 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -222,7 +222,28 @@ void ggml_metal_synchronize(ggml_metal_t ctx) {
222222
ctx->cmd_buf_last = nil;
223223
}
224224

225-
// release any completed command buffers
225+
// check status of all command buffers
226+
{
227+
const int n_cb = ctx->n_cb;
228+
229+
for (int cb_idx = 0; cb_idx <= n_cb; ++cb_idx) {
230+
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs[cb_idx].obj;
231+
if (!cmd_buf) {
232+
continue;
233+
}
234+
235+
MTLCommandBufferStatus status = [cmd_buf status];
236+
if (status != MTLCommandBufferStatusCompleted) {
237+
GGML_LOG_ERROR("%s: error: command buffer %d failed with status %d\n", __func__, cb_idx, (int) status);
238+
if (status == MTLCommandBufferStatusError) {
239+
GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
240+
}
241+
GGML_ABORT("fatal error");
242+
}
243+
}
244+
}
245+
246+
// release any completed extra command buffers
226247
if (ctx->cmd_bufs_ext.count > 0) {
227248
for (size_t i = 0; i < ctx->cmd_bufs_ext.count; ++i) {
228249
id<MTLCommandBuffer> cmd_buf = ctx->cmd_bufs_ext[i];
@@ -260,6 +281,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
260281
length:size
261282
options:MTLResourceStorageModeShared];
262283

284+
GGML_ASSERT(buf_src);
285+
263286
struct ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(tensor);
264287
if (bid_dst.metal == nil) {
265288
GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
@@ -299,6 +322,8 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
299322
options:MTLResourceStorageModeShared
300323
deallocator:nil];
301324

325+
GGML_ASSERT(buf_dst);
326+
302327
struct ggml_metal_buffer_id bid_src = ggml_metal_get_buffer_id(tensor);
303328
if (bid_src.metal == nil) {
304329
GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);

ggml/src/ggml-metal/ggml-metal-device.m

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1176,6 +1176,8 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
11761176
options:MTLResourceStorageModeShared
11771177
deallocator:nil];
11781178

1179+
GGML_ASSERT(buf_src);
1180+
11791181
// dst
11801182
struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
11811183
bid_dst.offs += offset;
@@ -1232,6 +1234,8 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
12321234
options:MTLResourceStorageModeShared
12331235
deallocator:nil];
12341236

1237+
GGML_ASSERT(buf_dst);
1238+
12351239
id<MTLCommandQueue> queue = buf->queue;
12361240
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
12371241

0 commit comments

Comments
 (0)