Compare commits

...

14 Commits

Author | SHA1 | Message | Date
Ruben Ortlam | da1f16886f | load directly from downloaded state | 2026-05-01 15:39:00 +02:00
Ruben Ortlam | 884901f04d | handle models that need to be downloaded before estimation | 2026-05-01 15:39:00 +02:00
Georgi Gerganov | 01dd39342d | cont : clean-up | 2026-05-01 15:39:00 +02:00
Ruben Ortlam | 972813c253 | also strip models memory margin from child processes | 2026-05-01 15:39:00 +02:00
Ruben Ortlam | b440ee05b8 | improve variable naming, fix style | 2026-05-01 15:39:00 +02:00
Ruben Ortlam | f24011f2cf | improve memory_per_device map naming | 2026-05-01 15:38:59 +02:00
Ruben Ortlam | f4a384b46c | fix model count exceeded check | 2026-05-01 15:38:59 +02:00
Ruben Ortlam | f750bae2d3 | move llama_context_device_memory function to llama-ext.h | 2026-05-01 15:38:59 +02:00
Ruben Ortlam | 527c91ac87 | add server memory debug logging | 2026-05-01 15:38:59 +02:00
Ruben Ortlam | 3c815b369e | use memory margin instead of total size limit, apply to each device separately | 2026-05-01 15:38:59 +02:00
Ruben Ortlam | 18163c4143 | only set model memory_mb if not previously calculated | 2026-05-01 15:38:59 +02:00
Ruben Ortlam | af28cd24dc | use no_alloc to get memory requirements for model load | 2026-05-01 15:38:59 +02:00
Ruben Ortlam | e6468c1715 | estimate with to-be-loaded model size included | 2026-05-01 15:38:59 +02:00
Ruben Ortlam | 0a019ed812 | server: add --models-memory-max parameter to allow dynamically unloading models when they exceed a memory size threshold | 2026-05-01 15:38:59 +02:00
7 changed files with 394 additions and 60 deletions

View File

@@ -3052,6 +3052,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.models_max = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
add_opt(common_arg(
{"--models-memory-margin"}, "N",
string_format("for router server, MiB of memory to leave free, per device (default: %d, 0 = unlimited)", params.models_memory_margin),
[](common_params & params, int value) {
params.models_memory_margin = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MARGIN"));
add_opt(common_arg(
{"--models-autoload"},
{"--no-models-autoload"},
@@ -3281,6 +3288,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.offline = true;
}
).set_env("LLAMA_OFFLINE"));
add_opt(common_arg(
{"--download-only"},
"Download the model file(s) and exit",
[](common_params & params) {
params.download_only = true;
}
));
add_opt(common_arg(
{"-lv", "--verbosity", "--log-verbosity"}, "N",
string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"

View File

@@ -503,6 +503,7 @@ struct common_params {
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector
bool offline = false;
bool download_only = false; // only download the model if required, don't start the server
int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -626,10 +627,11 @@ struct common_params {
std::vector<std::string> server_tools;
// router server configs
- std::string models_dir = ""; // directory containing models for the router server
- std::string models_preset = ""; // directory containing model presets for the router server
- int models_max = 4; // maximum number of models to load simultaneously
- bool models_autoload = true; // automatically load models when requested via the router server
+ std::string models_dir = ""; // directory containing models for the router server
+ std::string models_preset = ""; // directory containing model presets for the router server
+ int models_max = 4; // maximum number of models to load simultaneously
+ int models_memory_margin = 1024; // MiB of free memory to preserve per device (0 = disabled)
+ bool models_autoload = true; // automatically load models when requested via the router server
bool log_json = false;

View File

@@ -3493,6 +3493,19 @@ void llama_perf_context_reset(llama_context * ctx) {
ctx->perf_reset();
}
uint64_t llama_context_device_memory(const llama_context * ctx, ggml_backend_dev_t device) {
const bool is_host = ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU;
uint64_t total = 0;
for (const auto & [buft, mb] : ctx->memory_breakdown()) {
const bool matches = is_host ? ggml_backend_buft_is_host(buft) :
ggml_backend_buft_get_device(buft) == device;
if (matches) {
total += mb.total();
}
}
return total;
}
//
// training
//

View File

@@ -88,3 +88,9 @@ LLAMA_API int32_t llama_model_n_devices(const struct llama_model * model);
LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i);
LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx);
// Returns the projected memory use (model + context + compute) in bytes
// for the given device within this context. Returns 0 if the device is not used.
LLAMA_API uint64_t llama_context_device_memory(
const struct llama_context * ctx,
ggml_backend_dev_t device);
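As an illustration only (not part of this change set), a minimal caller sketch for the new API; it assumes a valid llama_context, the ggml-backend device helpers already used elsewhere in this diff, and <cstdio>:

#include <cstdio>  // printf

// hypothetical helper: print the projected per-device footprint of an existing context
static void print_projected_memory(const llama_context * ctx) {
    const size_t n_devs = ggml_backend_dev_count();
    for (size_t i = 0; i < n_devs; i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        const uint64_t bytes = llama_context_device_memory(ctx, dev);
        if (bytes > 0) {
            printf("%s: %.1f MiB\n", ggml_backend_dev_name(dev), bytes / (1024.0 * 1024.0));
        }
    }
}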

View File

@@ -8,6 +8,8 @@
#include <cpp-httplib/httplib.h> // TODO: remove this once we use HTTP client from download.h
#include <sheredom/subprocess.h>
#include "../../src/llama-ext.h"
#include <functional>
#include <algorithm>
#include <thread>
@@ -96,6 +98,7 @@ static void unset_reserved_args(common_preset & preset, bool unset_model_args) {
preset.unset_option("LLAMA_API_KEY");
preset.unset_option("LLAMA_ARG_MODELS_DIR");
preset.unset_option("LLAMA_ARG_MODELS_MAX");
preset.unset_option("LLAMA_ARG_MODELS_MEMORY_MARGIN");
preset.unset_option("LLAMA_ARG_MODELS_PRESET");
preset.unset_option("LLAMA_ARG_MODELS_AUTOLOAD");
if (unset_model_args) {
@@ -179,9 +182,27 @@ server_models::server_models(
bin_path = get_server_exec_path().string();
} catch (const std::exception & e) {
bin_path = argv[0];
LOG_WRN("failed to get server executable path: %s\n", e.what());
LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]);
SRV_WRN("failed to get server executable path: %s\n", e.what());
SRV_WRN("using original argv[0] as fallback: %s\n", argv[0]);
}
const size_t memory_margin = (size_t) base_params.models_memory_margin * 1024 * 1024;
if (memory_margin > 0) {
const size_t n_devs = ggml_backend_dev_count();
for (size_t i = 0; i < n_devs; i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
size_t free, total;
ggml_backend_dev_memory(dev, &free, &total);
if (total > 0) {
const size_t available = (free > memory_margin) ? free - memory_margin : 0;
dmm_available[dev] = available;
SRV_DBG("device %s: available memory after margin=%zu MiB\n",
ggml_backend_dev_name(dev), available / (1024 * 1024));
}
}
}
load_models();
}
@@ -297,16 +318,17 @@ void server_models::load_models() {
// convert presets to server_model_meta and add to mapping
for (const auto & preset : final_presets) {
server_model_meta meta{
- /* preset */ preset.second,
- /* name */ preset.first,
- /* aliases */ {},
- /* tags */ {},
- /* port */ 0,
- /* status */ SERVER_MODEL_STATUS_UNLOADED,
- /* last_used */ 0,
- /* args */ std::vector<std::string>(),
- /* exit_code */ 0,
- /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
+ /* preset */ preset.second,
+ /* name */ preset.first,
+ /* aliases */ {},
+ /* tags */ {},
+ /* port */ 0,
+ /* status */ SERVER_MODEL_STATUS_UNLOADED,
+ /* last_used */ 0,
+ /* memory_per_device */ {},
+ /* args */ std::vector<std::string>(),
+ /* exit_code */ 0,
+ /* stop_timeout */ DEFAULT_STOP_TIMEOUT,
};
add_model(std::move(meta));
}
@@ -497,49 +519,287 @@ std::vector<server_model_meta> server_models::get_all_meta() {
return result;
}
void server_models::unload_lru() {
if (base_params.models_max <= 0) {
return; // no limit
}
// remove one of the servers if we passed the models_max (least recently used - LRU)
std::string lru_model_name = "";
int64_t lru_last_used = ggml_time_ms();
size_t count_active = 0;
{
std::unique_lock<std::mutex> lk(mutex);
for (const auto & m : mapping) {
if (m.second.meta.is_running()) {
count_active++;
if (m.second.meta.last_used < lru_last_used) {
lru_model_name = m.first;
lru_last_used = m.second.meta.last_used;
}
int server_models::can_fit(const device_memory_map & dmm_req) const {
device_memory_map dmm_total;
for (const auto & m : mapping) {
if (m.second.meta.is_running()) {
for (const auto & [dev, mem] : m.second.meta.dmm_req) {
dmm_total[dev] += mem;
}
}
}
if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) {
SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str());
unload(lru_model_name);
// wait for unload to complete
{
std::unique_lock<std::mutex> lk(mutex);
cv.wait(lk, [this, &lru_model_name]() {
return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
});
auto get = [](const device_memory_map & dmm, ggml_backend_dev_t dev) {
auto it = dmm.find(dev);
return it != dmm.end() ? it->second : 0;
};
int res = 0;
for (const auto & [dev, limit] : dmm_available) {
const size_t mem_total = get(dmm_total, dev);
const size_t mem_new = get(dmm_req, dev);
SRV_DBG("device %s: total=%zu MiB, new=%zu MiB, limit=%zu MiB\n",
ggml_backend_dev_name(dev),
mem_total / (1024 * 1024), mem_new / (1024 * 1024), limit / (1024 * 1024));
if (mem_total + mem_new > limit) {
res++;
}
}
return res;
}
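To make the check above concrete, a hedged worked example with made-up numbers (not taken from this PR):

// hypothetical single-GPU scenario:
//   free at startup = 24576 MiB, --models-memory-margin 1024  ->  dmm_available[dev] = 23552 MiB
//   one running model:            dmm_total[dev] = 16000 MiB
//   model requested for loading:  dmm_req[dev]   =  9000 MiB
//   16000 + 9000 = 25000 MiB > 23552 MiB  ->  can_fit() returns 1, so unload_lru() will evict first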
void server_models::unload_lru(const device_memory_map & dmm_req) {
const bool check_active = base_params.models_max > 0;
const bool check_memory = base_params.models_memory_margin > 0;
if (!check_active && !check_memory) {
return; // no limit
}
if (check_memory) {
GGML_ASSERT(!dmm_available.empty());
}
while (true) {
std::string lru_model_name;
int64_t lru_last_used = ggml_time_ms();
int count_active = 0;
int count_exceed = 0;
{
std::unique_lock<std::mutex> lk(mutex);
for (const auto & m : mapping) {
if (m.second.meta.is_running()) {
count_active++;
if (m.second.meta.last_used < lru_last_used) {
lru_model_name = m.first;
lru_last_used = m.second.meta.last_used;
}
}
}
if (check_memory) {
count_exceed = can_fit(dmm_req);
}
}
const bool active_exceeded = check_active && count_active >= base_params.models_max;
const bool memory_exceeded = check_memory && count_exceed > 0;
if (!lru_model_name.empty() && (active_exceeded || memory_exceeded)) {
SRV_INF("limits reached (count=%d, memory margin exceeded on %d device(s)), removing LRU name=%s\n",
count_active, count_exceed, lru_model_name.c_str());
unload(lru_model_name);
// wait for unload to complete
{
std::unique_lock<std::mutex> lk(mutex);
cv.wait(lk, [this, &lru_model_name]() {
return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
});
}
} else {
break;
}
}
}
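A hedged reading of the loop above:

// the loop keeps evicting the least recently used running instance and waits for each unload to
// complete; it stops once both limits pass, or when no running model is left to evict
// (lru_model_name stays empty), in which case the requested load proceeds and may still be
// rejected by the re-check performed once the lock is held (see the hunk further down).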
static std::string resolve_model_path(const common_preset & preset) {
common_params params;
preset.apply_to_params(params);
if (!params.model.path.empty()) {
return params.model.path;
}
if (!params.model.hf_repo.empty() || !params.model.url.empty()) {
common_download_opts opts;
opts.offline = true;
auto result = common_download_model(params.model, opts);
return result.model_path;
}
return "";
}
static device_memory_map get_model_memory_per_device(const common_preset & preset) {
common_params params;
preset.apply_to_params(params);
if (params.model.path.empty()) {
params.model.path = resolve_model_path(preset);
if (params.model.path.empty()) {
return {};
}
}
struct log_ud_t {
struct {
ggml_log_callback callback;
void * user_data;
} original;
ggml_log_level min_level;
} log_ud;
llama_log_get(&log_ud.original.callback, &log_ud.original.user_data);
log_ud.min_level = GGML_LOG_LEVEL_WARN;
llama_log_set([](ggml_log_level level, const char * text, void * ud) {
log_ud_t * d = (log_ud_t *) ud;
const ggml_log_level eff = level >= d->min_level ? level : GGML_LOG_LEVEL_DEBUG;
d->original.callback(eff, text, d->original.user_data);
}, &log_ud);
llama_model_params mparams = common_model_params_to_llama(params);
mparams.no_alloc = true;
mparams.use_mmap = false;
mparams.use_mlock = false;
llama_model_ptr model{llama_model_load_from_file(params.model.path.c_str(), mparams)};
if (!model) {
llama_log_set(log_ud.original.callback, log_ud.original.user_data);
return {};
}
llama_context_params cparams = common_context_params_to_llama(params);
llama_context_ptr ctx{llama_init_from_model(model.get(), cparams)};
llama_log_set(log_ud.original.callback, log_ud.original.user_data);
if (!ctx) {
return {};
}
device_memory_map result;
const size_t n_devs = ggml_backend_dev_count();
for (size_t i = 0; i < n_devs; i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
uint64_t bytes = llama_context_device_memory(ctx.get(), dev);
if (bytes > 0) {
result[dev] = bytes;
}
}
return result;
}
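A hedged summary of the estimation path above:

// the preset is applied to a scratch common_params, the model is opened with no_alloc
// (weights are sized rather than allocated) and without mmap/mlock, a throwaway context is
// created, and the per-device totals are read back through llama_context_device_memory();
// log output below WARN is demoted to DEBUG while this probe runs.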
bool server_models::download_model(const std::string & name) {
std::vector<std::string> child_args;
std::vector<std::string> child_env;
{
std::lock_guard<std::mutex> lk(mutex);
auto & meta = mapping[name].meta;
child_args = meta.preset.to_args(bin_path);
child_env = base_env;
}
child_args.push_back("--download-only");
SRV_INF("downloading model name=%s\n", name.c_str());
std::vector<char *> argv = to_char_ptr_array(child_args);
std::vector<char *> envp = to_char_ptr_array(child_env);
subprocess_s proc;
int options = subprocess_option_no_window | subprocess_option_combined_stdout_stderr;
if (subprocess_create_ex(argv.data(), options, envp.data(), &proc) != 0) {
SRV_ERR("failed to spawn download process for model name=%s\n", name.c_str());
return false;
}
FILE * out = subprocess_stdout(&proc);
if (out) {
char buffer[4096];
while (fgets(buffer, sizeof(buffer), out) != nullptr) {
LOG("[dl:%s] %s", name.c_str(), buffer);
}
}
int exit_code = 0;
subprocess_join(&proc, &exit_code);
subprocess_destroy(&proc);
if (exit_code != 0) {
SRV_ERR("download process for model name=%s exited with code %d\n", name.c_str(), exit_code);
return false;
}
SRV_INF("download complete for model name=%s\n", name.c_str());
return true;
}
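One hedged note on the mechanism above:

// the download is delegated to a child copy of the server binary with --download-only appended,
// so the router process does not perform the download itself; the child's combined stdout/stderr
// is streamed into the router log under a "[dl:<name>]" prefix and its exit code decides success.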
void server_models::load(const std::string & name) {
if (!has_model(name)) {
throw std::runtime_error("model name=" + name + " is not found");
}
unload_lru();
{
common_preset preset_copy;
{
std::lock_guard<std::mutex> lk(mutex);
preset_copy = mapping[name].meta.preset;
}
if (resolve_model_path(preset_copy).empty()) {
{
std::lock_guard<std::mutex> lk(mutex);
auto & meta = mapping[name].meta;
if (meta.status != SERVER_MODEL_STATUS_UNLOADED) {
return;
}
meta.status = SERVER_MODEL_STATUS_DOWNLOADING;
cv.notify_all();
}
std::thread([this, name]() {
if (!download_model(name)) {
update_status(name, SERVER_MODEL_STATUS_UNLOADED, 1);
return;
}
device_memory_map mem;
if (base_params.models_memory_margin > 0) {
std::lock_guard<std::mutex> lk(mutex);
auto & meta = mapping[name].meta;
meta.dmm_req = get_model_memory_per_device(meta.preset);
if (meta.dmm_req.empty()) {
SRV_WRN("failed to estimate memory for model %s, memory limits will not apply\n", name.c_str());
}
mem = meta.dmm_req;
}
try {
_load(name, mem);
} catch (const std::exception & e) {
SRV_ERR("failed to load model %s after download: %s\n", name.c_str(), e.what());
update_status(name, SERVER_MODEL_STATUS_UNLOADED, 1);
}
}).detach();
return;
}
}
device_memory_map dmm_req;
if (base_params.models_memory_margin > 0) {
// determine the memory required by the model on its first load
std::lock_guard<std::mutex> lk(mutex);
auto & meta = mapping[name].meta;
if (meta.dmm_req.empty()) {
meta.dmm_req = get_model_memory_per_device(meta.preset);
if (meta.dmm_req.empty()) {
SRV_WRN("failed to estimate memory for model %s, memory limits will not apply\n", name.c_str());
}
}
dmm_req = meta.dmm_req;
}
_load(name, dmm_req);
}
void server_models::_load(const std::string & name, const device_memory_map & dmm_req) {
unload_lru(dmm_req);
std::lock_guard<std::mutex> lk(mutex);
auto meta = mapping[name].meta;
- if (meta.status != SERVER_MODEL_STATUS_UNLOADED) {
+ if (meta.status != SERVER_MODEL_STATUS_UNLOADED && meta.status != SERVER_MODEL_STATUS_DOWNLOADING) {
SRV_INF("model %s is not ready\n", name.c_str());
return;
}
@@ -548,15 +808,24 @@ void server_models::load(const std::string & name) {
// exceeding models_max. Without this, the window between unload_lru()
// releasing its lock and this lock_guard acquiring allows multiple
// threads to each observe capacity and all proceed to load.
- if (base_params.models_max > 0) {
- size_t count_active = 0;
- for (const auto & m : mapping) {
- if (m.second.meta.is_running()) {
- count_active++;
+ {
+ const bool check_active = base_params.models_max > 0;
+ const bool check_memory = base_params.models_memory_margin > 0;
+ if (check_active || check_memory) {
+ int count_active = 0;
+ for (const auto & m : mapping) {
+ if (m.second.meta.is_running()) {
+ count_active++;
+ }
+ }
+ const bool active_exceeded = check_active && count_active >= base_params.models_max;
+ const bool memory_exceeded = check_memory && can_fit(dmm_req) > 0;
+ if (active_exceeded || memory_exceeded) {
+ throw std::runtime_error("model limit reached, try again later");
}
}
- if (count_active >= (size_t)base_params.models_max) {
- throw std::runtime_error("model limit reached, try again later");
- }
}
@@ -765,7 +1034,8 @@ void server_models::wait_until_loading_finished(const std::string & name) {
cv.wait(lk, [this, &name]() {
auto it = mapping.find(name);
if (it != mapping.end()) {
- return it->second.meta.status != SERVER_MODEL_STATUS_LOADING;
+ return it->second.meta.status != SERVER_MODEL_STATUS_LOADING &&
+ it->second.meta.status != SERVER_MODEL_STATUS_DOWNLOADING;
}
return false;
});

View File

@@ -14,6 +14,9 @@
/**
* state diagram:
*
*
* ┌► DOWNLOADING ─┐
* │ ▼
* UNLOADED ──► LOADING ──► LOADED ◄──── SLEEPING
* ▲ │ │ ▲
* └───failed───┘ │ │
@@ -21,8 +24,8 @@
* └────────unloaded─────────┘
*/
enum server_model_status {
- // TODO: also add downloading state when the logic is added
SERVER_MODEL_STATUS_UNLOADED,
+ SERVER_MODEL_STATUS_DOWNLOADING,
SERVER_MODEL_STATUS_LOADING,
SERVER_MODEL_STATUS_LOADED,
SERVER_MODEL_STATUS_SLEEPING
@@ -32,6 +35,9 @@ static server_model_status server_model_status_from_string(const std::string & s
if (status_str == "unloaded") {
return SERVER_MODEL_STATUS_UNLOADED;
}
if (status_str == "downloading") {
return SERVER_MODEL_STATUS_DOWNLOADING;
}
if (status_str == "loading") {
return SERVER_MODEL_STATUS_LOADING;
}
@@ -46,14 +52,17 @@ static server_model_status server_model_status_from_string(const std::string & s
static std::string server_model_status_to_string(server_model_status status) {
switch (status) {
case SERVER_MODEL_STATUS_UNLOADED: return "unloaded";
case SERVER_MODEL_STATUS_LOADING: return "loading";
case SERVER_MODEL_STATUS_LOADED: return "loaded";
case SERVER_MODEL_STATUS_SLEEPING: return "sleeping";
default: return "unknown";
case SERVER_MODEL_STATUS_UNLOADED: return "unloaded";
case SERVER_MODEL_STATUS_DOWNLOADING: return "downloading";
case SERVER_MODEL_STATUS_LOADING: return "loading";
case SERVER_MODEL_STATUS_LOADED: return "loaded";
case SERVER_MODEL_STATUS_SLEEPING: return "sleeping";
default: return "unknown";
}
}
using device_memory_map = std::map<ggml_backend_dev_t, size_t>;
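For orientation, a hedged note on how this alias is used elsewhere in the diff:

// device_memory_map maps a ggml_backend_dev_t to a byte count; the diff uses it both for
//   server_model_meta::dmm_req    - estimated bytes a model needs on each device, filled by
//                                   get_model_memory_per_device() via llama_context_device_memory()
//   server_models::dmm_available  - per-device headroom, i.e. free memory at startup minus
//                                   the configured --models-memory-margin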
struct server_model_meta {
common_preset preset;
std::string name;
@@ -62,6 +71,7 @@ struct server_model_meta {
int port = 0;
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
int64_t last_used = 0; // for LRU unloading
device_memory_map dmm_req; // bytes required per device
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
@@ -107,14 +117,28 @@ private:
std::vector<std::string> base_env;
common_preset base_preset; // base preset from llama-server CLI args
// available memory per device
device_memory_map dmm_available;
void update_meta(const std::string & name, const server_model_meta & meta);
// unload least recently used models if the limit is reached
- void unload_lru();
+ void unload_lru(const device_memory_map & dmm_req);
// not thread-safe, caller must hold mutex
void add_model(server_model_meta && meta);
// return number of devices where the memory limit would be exceeded
// return 0 if the new model would fit on all devices
// not thread-safe, caller must hold mutex
int can_fit(const device_memory_map & dmm_req) const;
// download model files, blocking call (caller must NOT hold mutex)
bool download_model(const std::string & name);
// Internal helper for model loading
void _load(const std::string & name, const device_memory_map & dmm_req);
public:
server_models(const common_params & params, int argc, char ** argv);

View File

@@ -83,6 +83,11 @@ int main(int argc, char ** argv) {
return 1;
}
if (params.download_only) {
LOG_INF("%s: model downloaded successfully, exiting\n", __func__);
return 0;
}
// validate batch size for embeddings
// embeddings require all tokens to be processed in a single ubatch
// see https://github.com/ggml-org/llama.cpp/issues/12836