From a0101225bc60908572ed8b4dfd2ee1a61c69b689 Mon Sep 17 00:00:00 2001
From: fl0rianr
Date: Wed, 6 May 2026 17:03:45 +0200
Subject: [PATCH] common: do not fit to unknown device memory (#22614)

* common: do not fit to unknown device memory

Signed-off-by: Florian Reinle

* common: preserve host fallback for non-GPU fit devices

Signed-off-by: Florian Reinle

* common: keep unknown GPU fit memory at zero

Signed-off-by: Florian Reinle

---------

Signed-off-by: Florian Reinle
---
 common/fit.cpp | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/common/fit.cpp b/common/fit.cpp
index aca3f4d407..e66982df5b 100644
--- a/common/fit.cpp
+++ b/common/fit.cpp
@@ -109,16 +109,24 @@ static std::vector common_get_device_memory_data(
         ret.back().total = total;
     }
     for (size_t i = 0; i < nd; i++) {
+        ggml_backend_dev_t dev = llama_model_get_device(model, i);
+
         size_t free;
         size_t total;
-        ggml_backend_dev_memory(llama_model_get_device(model, i), &free, &total);
+        ggml_backend_dev_memory(dev, &free, &total);
 
-        // devices can return 0 bytes for free and total memory if they do not
-        // have any to report. in this case, we will use the host memory as a fallback
-        // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
+        // Some non-GPU accelerator backends, such as BLAS, report 0/0 and rely on
+        // the host-memory fallback. For GPU-like backends, keep 0/0 so --fit does
+        // not assign anything to a device with an unknown memory budget.
         if (free == 0 && total == 0) {
-            free = ret.back().free;
-            total = ret.back().total;
+            const enum ggml_backend_dev_type type = ggml_backend_dev_type(dev);
+            if (type == GGML_BACKEND_DEVICE_TYPE_GPU || type == GGML_BACKEND_DEVICE_TYPE_IGPU) {
+                LOG_WRN("%s: device %s did not report memory; --fit will not use it\n",
+                        __func__, ggml_backend_dev_name(dev));
+            } else {
+                free = ret.back().free;
+                total = ret.back().total;
+            }
         }
         ret[i].free = free;
         ret[i].total = total;