@@ -96,6 +96,7 @@ static int llama_sample_top_p_top_k(
 struct LLamaPrivate {
     const std::string modelPath;
     bool modelLoaded;
+    int device = -1;
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
     llama_model_params model_params;
@@ -167,24 +168,17 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
     if (llama_verbose()) {
         std::cerr << "llama.cpp: using Metal" << std::endl;
     }
-    // metal always runs the whole model if n_gpu_layers is not 0, at least
-    // currently
-    d_ptr->model_params.n_gpu_layers = 1;
-#endif
-#ifdef GGML_USE_KOMPUTE
-    if (ggml_vk_has_device()) {
-        // vulkan always runs the whole model if n_gpu_layers is not 0, at least
-        // currently
-        d_ptr->model_params.n_gpu_layers = 1;
+    d_ptr->model_params.n_gpu_layers = 100;
+#elif defined(GGML_USE_KOMPUTE)
+    if (d_ptr->device != -1) {
+        d_ptr->model_params.main_gpu = d_ptr->device;
+        d_ptr->model_params.n_gpu_layers = 100;
     }
 #endif

     d_ptr->model = llama_load_model_from_file_gpt4all(modelPath.c_str(), &d_ptr->model_params);
     if (!d_ptr->model) {
-#ifdef GGML_USE_KOMPUTE
-        // Explicitly free the device so next load it doesn't use it
-        ggml_vk_free_device();
-#endif
+        d_ptr->device = -1;
         std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
         return false;
     }
@@ -214,18 +208,15 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)

     d_ptr->ctx = llama_new_context_with_model(d_ptr->model, d_ptr->ctx_params);
     if (!d_ptr->ctx) {
-#ifdef GGML_USE_KOMPUTE
-        // Explicitly free the device so next load it doesn't use it
-        ggml_vk_free_device();
-#endif
+        d_ptr->device = -1;
         std::cerr << "LLAMA ERROR: failed to init context for model " << modelPath << std::endl;
         return false;
     }

     d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};

 #ifdef GGML_USE_KOMPUTE
-    if (ggml_vk_has_device()) {
+    if (usingGPUDevice() && ggml_vk_has_device()) {
         std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
     }
 #endif
@@ -339,70 +330,78 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
 std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired)
 {
 #if defined(GGML_USE_KOMPUTE)
-    std::vector<ggml_vk_device> vkDevices = ggml_vk_available_devices(memoryRequired);
-
-    std::vector<LLModel::GPUDevice> devices;
-    for (const auto & vkDevice : vkDevices) {
-        LLModel::GPUDevice device;
-        device.index = vkDevice.index;
-        device.type = vkDevice.type;
-        device.heapSize = vkDevice.heapSize;
-        device.name = vkDevice.name;
-        device.vendor = vkDevice.vendor;
-
-        devices.push_back(device);
-    }
+    size_t count = 0;
+    auto * vkDevices = ggml_vk_available_devices(memoryRequired, &count);
+
+    if (vkDevices) {
+        std::vector<LLModel::GPUDevice> devices;
+        devices.reserve(count);
+
+        for (size_t i = 0; i < count; ++i) {
+            auto & dev = vkDevices[i];
+            devices.emplace_back(
+                /* index    = */ dev.index,
+                /* type     = */ dev.type,
+                /* heapSize = */ dev.heapSize,
+                /* name     = */ dev.name,
+                /* vendor   = */ dev.vendor
+            );
+        }

-    return devices;
-#else
-    return std::vector<LLModel::GPUDevice>();
+        free(vkDevices);
+        return devices;
+    }
 #endif
+
+    return {};
 }

-bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string& device)
+bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name)
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_init_device(memoryRequired, device);
+    ggml_vk_device device;
+    bool ok = ggml_vk_get_device(&device, memoryRequired, name.c_str());
+    if (ok) {
+        d_ptr->device = device.index;
+        return true;
+    }
 #else
-    return false;
+    (void)memoryRequired;
+    (void)name;
 #endif
+    return false;
 }

 bool LLamaModel::initializeGPUDevice(const LLModel::GPUDevice &device, std::string *unavail_reason)
 {
-    bool result = false;
 #if defined(GGML_USE_KOMPUTE)
-    ggml_vk_device vkDevice;
-    vkDevice.index = device.index;
-    vkDevice.type = device.type;
-    vkDevice.heapSize = device.heapSize;
-    vkDevice.name = device.name;
-    vkDevice.vendor = device.vendor;
-    result = ggml_vk_init_device(vkDevice);
-    if (!result && unavail_reason) {
-        *unavail_reason = "failed to init GPU";
-    }
+    (void)unavail_reason;
+    d_ptr->device = device.index;
+    return true;
 #else
+    (void)device;
     if (unavail_reason) {
         *unavail_reason = "built without Kompute";
     }
+    return false;
 #endif
-    return result;
 }

 bool LLamaModel::initializeGPUDevice(int device)
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_init_device(device);
+    d_ptr->device = device;
+    return true;
 #else
+    (void)device;
     return false;
 #endif
 }

 bool LLamaModel::hasGPUDevice()
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_has_device();
+    return d_ptr->device != -1;
 #else
     return false;
 #endif
@@ -411,11 +410,12 @@ bool LLamaModel::hasGPUDevice()
 bool LLamaModel::usingGPUDevice()
 {
 #if defined(GGML_USE_KOMPUTE)
-    return ggml_vk_using_vulkan();
+    return hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
 #elif defined(GGML_USE_METAL)
     return true;
-#endif
+#else
     return false;
+#endif
 }

 std::string get_arch_name(gguf_context *ctx_gguf) {
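For context, a minimal caller-side sketch (not part of this commit) of how the reworked flow might be used: a device is selected before loadModel(), which now applies main_gpu and n_gpu_layers from the recorded index. The helper name loadOnPreferredGpu and the 2048 context size are illustrative assumptions; only availableGPUDevices(), initializeGPUDevice(), loadModel(), and usingGPUDevice() come from the code above.

// Hypothetical usage sketch, assuming a Kompute-enabled build of LLamaModel.
#include <iostream>
#include <string>

void loadOnPreferredGpu(LLamaModel &model, const std::string &modelPath, size_t memoryRequired)
{
    // Enumerate Vulkan devices that report enough free heap for the model.
    auto devices = model.availableGPUDevices(memoryRequired);
    if (!devices.empty()) {
        std::string why;
        // Selecting a device now only records its index in LLamaPrivate;
        // main_gpu / n_gpu_layers are applied later inside loadModel().
        if (!model.initializeGPUDevice(devices.front(), &why))
            std::cerr << "GPU unavailable (" << why << "), falling back to CPU" << std::endl;
    }

    if (!model.loadModel(modelPath, 2048)) {
        std::cerr << "failed to load " << modelPath << std::endl;
        return;
    }
    std::cerr << "offloading to GPU: " << std::boolalpha << model.usingGPUDevice() << std::endl;
}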