From 935a3402923e79679e056f1d8fdc6bc48ec6dbbc Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 4 May 2026 16:23:26 +0200 Subject: [PATCH] server: implement /models?reload=1 (#21848) --- tools/server/README.md | 6 +- tools/server/server-models.cpp | 334 ++++++++++++++++++------- tools/server/server-models.h | 8 + tools/server/tests/unit/test_router.py | 48 ++++ tools/server/tests/utils.py | 5 + 5 files changed, 313 insertions(+), 88 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index a0cfdbb6fe..62f918ce4a 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1646,7 +1646,11 @@ Listing all models in cache. The model metadata will also include a field to ind } ``` -Note: For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`. +Note: +1. For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`. +2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follows: + - If a model is running but updated or removed from the source, it will be unloaded + - If a model is not running, it will be added or updated according to the source The `status` object can be: diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index db6cbce8f9..5a05ca2033 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -243,9 +243,8 @@ void server_models::add_model(server_model_meta && meta) { }; } -// TODO: allow refreshing cached model list void server_models::load_models() { - // loading models from 3 sources: + // Phase 1: load presets from all sources — pure I/O, no lock needed // 1. 
cached models common_presets cached_models = ctx_preset.load_from_cache(); SRV_INF("Loaded %zu cached model presets\n", cached_models.size()); @@ -270,112 +269,266 @@ void server_models::load_models() { // note: if a model exists in both cached and local, local takes precedence common_presets final_presets; - for (const auto & [name, preset] : cached_models) { - final_presets[name] = preset; - } - for (const auto & [name, preset] : local_models) { - final_presets[name] = preset; - } - - // process custom presets from INI + for (const auto & [name, preset] : cached_models) final_presets[name] = preset; + for (const auto & [name, preset] : local_models) final_presets[name] = preset; for (const auto & [name, custom] : custom_presets) { if (final_presets.find(name) != final_presets.end()) { - // apply custom config if exists - common_preset & target = final_presets[name]; - target.merge(custom); + final_presets[name].merge(custom); } else { - // otherwise add directly final_presets[name] = custom; } } - - // server base preset from CLI args take highest precedence + // server base preset from CLI args takes highest precedence for (auto & [name, preset] : final_presets) { preset.merge(base_preset); } - // convert presets to server_model_meta and add to mapping - for (const auto & preset : final_presets) { - server_model_meta meta{ - /* preset */ preset.second, - /* name */ preset.first, - /* aliases */ {}, - /* tags */ {}, - /* port */ 0, - /* status */ SERVER_MODEL_STATUS_UNLOADED, - /* last_used */ 0, - /* args */ std::vector(), - /* exit_code */ 0, - /* stop_timeout */ DEFAULT_STOP_TIMEOUT, - }; - add_model(std::move(meta)); - } - - // log available models - { - std::unordered_set custom_names; - for (const auto & [name, preset] : custom_presets) { - custom_names.insert(name); + // Helpers that read `mapping` — must be called while holding the lock. 
+ std::unordered_set custom_names; + for (const auto & [name, preset] : custom_presets) custom_names.insert(name); + auto join_set = [](const std::set & s) { + std::string result; + for (const auto & v : s) { + if (!result.empty()) result += ", "; + result += v; } - auto join_set = [](const std::set & s) { - std::string result; - for (const auto & v : s) { - if (!result.empty()) { - result += ", "; - } - result += v; - } - return result; - }; - + return result; + }; + auto log_available_models = [&]() { SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size()); for (const auto & [name, inst] : mapping) { bool has_custom = custom_names.find(name) != custom_names.end(); std::string info; - if (!inst.meta.aliases.empty()) { - info += " (aliases: " + join_set(inst.meta.aliases) + ")"; - } - if (!inst.meta.tags.empty()) { - info += " [tags: " + join_set(inst.meta.tags) + "]"; - } + if (!inst.meta.aliases.empty()) info += " (aliases: " + join_set(inst.meta.aliases) + ")"; + if (!inst.meta.tags.empty()) info += " [tags: " + join_set(inst.meta.tags) + "]"; SRV_INF(" %c %s%s\n", has_custom ? '*' : ' ', name.c_str(), info.c_str()); } - } - - // handle custom stop-timeout option - for (auto & [name, inst] : mapping) { - std::string val; - if (inst.meta.preset.get_option(COMMON_ARG_PRESET_STOP_TIMEOUT, val)) { - try { - inst.meta.stop_timeout = std::stoi(val); - } catch (...) { - SRV_WRN("invalid stop-timeout value '%s' for model '%s', using default %d seconds\n", - val.c_str(), name.c_str(), DEFAULT_STOP_TIMEOUT); - inst.meta.stop_timeout = DEFAULT_STOP_TIMEOUT; + }; + auto apply_stop_timeout = [&]() { + for (auto & [name, inst] : mapping) { + std::string val; + if (inst.meta.preset.get_option(COMMON_ARG_PRESET_STOP_TIMEOUT, val)) { + try { + inst.meta.stop_timeout = std::stoi(val); + } catch (...) 
{ + SRV_WRN("invalid stop-timeout value '%s' for model '%s', using default %d seconds\n", + val.c_str(), name.c_str(), DEFAULT_STOP_TIMEOUT); + inst.meta.stop_timeout = DEFAULT_STOP_TIMEOUT; + } } } - } + }; + // update_args() injects HOST/PORT/ALIAS, so strip them before comparing presets + auto preset_options_for_compare = [](common_preset p) { + p.unset_option("LLAMA_ARG_HOST"); + p.unset_option("LLAMA_ARG_PORT"); + p.unset_option("LLAMA_ARG_ALIAS"); + return p.options; + }; - // load any autoload models - std::vector models_to_load; - for (const auto & [name, inst] : mapping) { - std::string val; - if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val)) { - if (common_arg_utils::is_truthy(val)) { + // Phase 2: acquire the lock once for all mapping mutations. + // We temporarily release it only when calling functions that acquire it internally + // (unload, load) or when joining threads (the monitoring thread calls update_status + // which locks the mutex, so joining while holding it would deadlock). 
+ std::unique_lock lk(mutex); + bool is_first_load = mapping.empty(); + + if (is_first_load) { + // FIRST LOAD: add all models, then unlock for autoloading + for (const auto & [name, preset] : final_presets) { + server_model_meta meta{ + /* preset */ preset, + /* name */ name, + /* aliases */ {}, + /* tags */ {}, + /* port */ 0, + /* status */ SERVER_MODEL_STATUS_UNLOADED, + /* last_used */ 0, + /* args */ std::vector(), + /* exit_code */ 0, + /* stop_timeout */ DEFAULT_STOP_TIMEOUT, + }; + add_model(std::move(meta)); + } + apply_stop_timeout(); + log_available_models(); + + std::vector models_to_load; + for (const auto & [name, inst] : mapping) { + std::string val; + if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val) && common_arg_utils::is_truthy(val)) { models_to_load.push_back(name); } } - } - if ((int)models_to_load.size() > base_params.models_max) { - throw std::runtime_error(string_format( - "number of models to load on startup (%zu) exceeds models_max (%d)", - models_to_load.size(), - base_params.models_max - )); - } - for (const auto & name : models_to_load) { - SRV_INF("(startup) loading model %s\n", name.c_str()); - load(name); + if ((int)models_to_load.size() > base_params.models_max) { + throw std::runtime_error(string_format( + "number of models to load on startup (%zu) exceeds models_max (%d)", + models_to_load.size(), base_params.models_max)); + } + + lk.unlock(); + for (const auto & name : models_to_load) { + SRV_INF("(startup) loading model %s\n", name.c_str()); + load(name); + } + } else { + // RELOAD: diff the new preset list against the current mapping and reconcile + is_reloading = true; + + // find running models whose source was removed or whose preset changed + std::vector to_unload; + for (const auto & [name, inst] : mapping) { + if (!inst.meta.is_running()) continue; + auto it = final_presets.find(name); + if (it == final_presets.end()) { + to_unload.push_back(name); // removed from source + } else if 
(preset_options_for_compare(inst.meta.preset) != preset_options_for_compare(it->second)) { + to_unload.push_back(name); // preset changed + } + } + + // unload() acquires the lock internally, so release before each call + for (const auto & name : to_unload) { + SRV_INF("(reload) unloading model name=%s (source updated or removed)\n", name.c_str()); + lk.unlock(); + unload(name); + lk.lock(); + } + + // wait for all targeted models to reach UNLOADED; cv.wait handles unlock/relock + cv.wait(lk, [&]() { + for (const auto & name : to_unload) { + auto it = mapping.find(name); + if (it != mapping.end() && it->second.meta.is_running()) return false; + } + return true; + }); + + // collect all threads to join in one pass while the lock is held: + // - monitoring threads from just-unloaded models (to_unload) + // - threads of already-UNLOADED models that are being removed from source + std::vector threads_to_join; + for (const auto & name : to_unload) { + auto it = mapping.find(name); + if (it != mapping.end() && it->second.th.joinable()) { + threads_to_join.push_back(std::move(it->second.th)); + } + } + for (auto & [name, inst] : mapping) { + if (final_presets.find(name) == final_presets.end() && !inst.meta.is_running() && inst.th.joinable()) { + threads_to_join.push_back(std::move(inst.th)); + } + } + + // join outside the lock — monitoring thread calls update_status (needs lock) + lk.unlock(); + for (auto & th : threads_to_join) th.join(); + lk.lock(); + + // erase models no longer in any source + for (auto it = mapping.begin(); it != mapping.end(); ) { + if (final_presets.find(it->first) == final_presets.end()) { + SRV_INF("(reload) removing model name=%s (no longer in source)\n", it->first.c_str()); + GGML_ASSERT(!it->second.th.joinable()); // must have been joined above + it = mapping.erase(it); + } else { + ++it; + } + } + + // update presets for non-running models still in source + for (auto & [name, inst] : mapping) { + if (inst.meta.is_running()) continue; + auto 
it = final_presets.find(name); + if (it == final_presets.end()) continue; // erased above + + inst.meta.preset = it->second; + + // re-parse aliases, then validate against other models + std::set new_aliases; + std::string alias_str; + if (inst.meta.preset.get_option("LLAMA_ARG_ALIAS", alias_str) && !alias_str.empty()) { + for (auto & alias : string_split(alias_str, ',')) { + alias = string_strip(alias); + if (!alias.empty()) new_aliases.insert(alias); + } + } + inst.meta.aliases.clear(); + for (const auto & alias : new_aliases) { + bool conflict = false; + for (const auto & [other_name, other_inst] : mapping) { + if (other_name == name) continue; + if (other_name == alias || other_inst.meta.aliases.count(alias)) { + SRV_WRN("(reload) alias '%s' for model '%s' conflicts with model '%s', skipping\n", + alias.c_str(), name.c_str(), other_name.c_str()); + conflict = true; + break; + } + } + if (!conflict) inst.meta.aliases.insert(alias); + } + + // re-parse tags + inst.meta.tags.clear(); + std::string tags_str; + if (inst.meta.preset.get_option("LLAMA_ARG_TAGS", tags_str) && !tags_str.empty()) { + for (auto & tag : string_split(tags_str, ',')) { + tag = string_strip(tag); + if (!tag.empty()) inst.meta.tags.insert(tag); + } + } + + inst.meta.exit_code = 0; // clear failed state so the model can be reloaded + inst.meta.update_args(ctx_preset, bin_path); + } + + // add models that are new in this reload + std::vector newly_added; + for (const auto & [name, preset] : final_presets) { + if (mapping.find(name) == mapping.end()) { + server_model_meta meta{ + /* preset */ preset, + /* name */ name, + /* aliases */ {}, + /* tags */ {}, + /* port */ 0, + /* status */ SERVER_MODEL_STATUS_UNLOADED, + /* last_used */ 0, + /* args */ std::vector(), + /* exit_code */ 0, + /* stop_timeout */ DEFAULT_STOP_TIMEOUT, + }; + add_model(std::move(meta)); + newly_added.push_back(name); + } + } + + apply_stop_timeout(); + + // clear reload flag before unlocking for autoload — load() blocks on 
!is_reloading, + // so clearing it here (while still locked) prevents a deadlock in the autoload calls below + is_reloading = false; + cv.notify_all(); + + log_available_models(); + + // collect autoload candidates while still under the lock + std::vector to_autoload; + for (const auto & name : newly_added) { + auto it = mapping.find(name); + if (it != mapping.end()) { + std::string val; + if (it->second.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val) && common_arg_utils::is_truthy(val)) { + to_autoload.push_back(name); + } + } + } + + lk.unlock(); + for (const auto & name : to_autoload) { + SRV_INF("(reload) loading new model %s\n", name.c_str()); + load(name); + } } } @@ -536,7 +689,10 @@ void server_models::load(const std::string & name) { } unload_lru(); - std::lock_guard lk(mutex); + std::unique_lock lk(mutex); + // edge case: block until any in-progress reload has finished so we always load + // against the freshest preset and a consistent mapping state + cv.wait(lk, [this]() { return !is_reloading; }); auto meta = mapping[name].meta; if (meta.status != SERVER_MODEL_STATUS_UNLOADED) { @@ -993,7 +1149,11 @@ void server_models_routes::init_routes() { return res; }; - this->get_router_models = [this](const server_http_req &) { + this->get_router_models = [this](const server_http_req & req) { + bool reload = !req.get_param("reload", "").empty(); + if (reload) { + models.load_models(); + } auto res = std::make_unique(); json models_json = json::array(); auto all_models = models.get_all_meta(); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index b3428ef544..64a15f5ba4 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -100,6 +100,9 @@ private: std::condition_variable cv_stop; std::set stopping_models; + // set to true while load_models() is executing a reload; load() will wait until clear + bool is_reloading = false; + common_preset_context ctx_preset; common_params base_params; @@ -118,6 
+121,11 @@ private: public: server_models(const common_params & params, int argc, char ** argv); + // (re-)load the list of models from various sources and prepare the metadata mapping + // - if this is called the first time, simply populate the metadata + // - if this is called subsequently (e.g. when refreshing from disk): + // - if a model is running but updated or removed from the source, it will be unloaded + // - if a model is not running, it will be added or updated according to the source void load_models(); // check if a model instance exists (thread-safe) diff --git a/tools/server/tests/unit/test_router.py b/tools/server/tests/unit/test_router.py index 79e60db408..c93b92b0b2 100644 --- a/tools/server/tests/unit/test_router.py +++ b/tools/server/tests/unit/test_router.py @@ -62,6 +62,12 @@ def test_router_chat_completion_stream(model: str, success: bool): assert content == "" +def _get_model_ids(is_reload: bool) -> set[str]: + res = server.make_request("GET", "/models" + ("?reload=1" if is_reload else "")) + assert res.status_code == 200 + return {item["id"] for item in res.body.get("data", [])} + + def _get_model_status(model_id: str) -> str: res = server.make_request("GET", "/models") assert res.status_code == 200 @@ -205,3 +211,45 @@ def test_router_api_key_required(): ) assert authed.status_code == 200 assert "error" not in authed.body + + +def test_router_reload_models(): + """GET /models?reload=1 re-reads the INI preset and updates the model list.""" + global server + + preset_path = os.path.join(TMP_DIR, "test_reload.ini") + + # Initial preset: two models + with open(preset_path, "w") as f: + f.write( + "[model-reload-a]\n" + "hf-repo = ggml-org/test-model-stories260K\n" + "\n" + "[model-reload-b]\n" + "hf-repo = ggml-org/test-model-stories260K-infill\n" + ) + + server.models_preset = preset_path + server.start() + + ids = _get_model_ids(is_reload=False) + assert "model-reload-a" in ids + assert "model-reload-b" in ids + + # Updated preset: remove a, 
keep b unchanged, add c + with open(preset_path, "w") as f: + f.write( + "[model-reload-b]\n" + "hf-repo = ggml-org/test-model-stories260K-infill\n" + "\n" + "[model-reload-c]\n" + "hf-repo = ggml-org/test-model-stories260K\n" + ) + + try: + ids = _get_model_ids(is_reload=True) + assert "model-reload-a" not in ids, "removed model should no longer appear" + assert "model-reload-b" in ids, "unchanged model should still appear" + assert "model-reload-c" in ids, "newly added model should appear" + finally: + os.remove(preset_path) diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index 88700487be..15f9bd95d7 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -5,6 +5,8 @@ import subprocess import os + +TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp") import re import json from json import JSONDecodeError @@ -86,6 +88,7 @@ class ServerProcess: api_key: str | None = None models_dir: str | None = None models_max: int | None = None + models_preset: str | None = None no_models_autoload: bool | None = None lora_files: List[str] | None = None enable_ctx_shift: int | None = False @@ -156,6 +159,8 @@ class ServerProcess: server_args.extend(["--models-dir", self.models_dir]) if self.models_max is not None: server_args.extend(["--models-max", self.models_max]) + if self.models_preset: + server_args.extend(["--models-preset", self.models_preset]) if self.n_batch: server_args.extend(["--batch-size", self.n_batch]) if self.n_ubatch: