diff --git a/common/arg.cpp b/common/arg.cpp index 7c4b5d42f0..ae052f7a5a 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -626,43 +626,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context if (!params.speculative.tensor_buft_overrides.empty()) { params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr}); } - { - bool has_draft =!params.speculative.model.path.empty(); - bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3 - bool has_lookup_caches = !params.speculative.lookup_cache_static.empty() - && !params.speculative.lookup_cache_dynamic.empty(); - bool has_simple = (params.speculative.draftless_type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE); - bool found_config_draft = false; - bool found_config_eagle3 = false; - bool found_config_ngram_cache = false; - bool found_config_ngram_simple = false; - for (const auto & config : params.speculative.configs) { - if (config.type == COMMON_SPECULATIVE_TYPE_DRAFT) { - found_config_draft = true; - } - if (config.type == COMMON_SPECULATIVE_TYPE_EAGLE3) { - found_config_eagle3 = true; - } - if (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_CACHE) { - found_config_ngram_cache = true; - } - if (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE) { - found_config_ngram_simple = true; - } - } - if (has_simple && !found_config_ngram_simple) { - params.speculative.configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE)); - } - if (has_lookup_caches && !found_config_ngram_cache) { - params.speculative.configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE)); - } - if (has_draft && !found_config_draft) { - params.speculative.configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT)); - } - if (has_draft_eagle3 && !found_config_eagle3) { - params.speculative.configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_EAGLE3)); - } - } if (!params.chat_template.empty() && 
!common_chat_verify_template(params.chat_template, params.use_jinja)) { throw std::runtime_error(string_format( @@ -3431,17 +3394,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"--spec-draftless"}, "[none|ngram-cache|ngram-simple]", + {"--spec-draftless"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]", string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n", common_speculative_type_to_str(params.speculative.draftless_type).c_str()), [](common_params & params, const std::string & value) { if (value == "none") { params.speculative.draftless_type = COMMON_SPECULATIVE_TYPE_NONE; } else if (value == "ngram-cache") { - // TODO: this does nothing atm params.speculative.draftless_type = COMMON_SPECULATIVE_TYPE_NGRAM_CACHE; } else if (value == "ngram-simple") { params.speculative.draftless_type = COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE; + } else if (value == "ngram-map-k") { + params.speculative.draftless_type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K; + } else if (value == "ngram-map-k4v") { + params.speculative.draftless_type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V; } else { throw std::invalid_argument("unknown speculative decoding type without draft model"); } @@ -3449,7 +3415,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--spec-ngram-size-n"}, "N", - string_format("ngram size N for ngram-map speculative decoding, length of lookup n-gram (default: %d)", params.speculative.spec_ngram_size_n), + string_format("ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)", params.speculative.spec_ngram_size_n), [](common_params & params, int value) { if (value < 1 || value > 1024) { throw std::invalid_argument("ngram size N must be 
between 1 and 1024 inclusive"); @@ -3459,7 +3425,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--spec-ngram-size-m"}, "N", - string_format("ngram size M for ngram-map speculative decoding, length of draft m-gram (default: %d)", params.speculative.spec_ngram_size_m), + string_format("ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)", params.speculative.spec_ngram_size_m), [](common_params & params, int value) { if (value < 1 || value > 1024) { throw std::invalid_argument("ngram size M must be between 1 and 1024 inclusive"); @@ -3468,35 +3434,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( - {"--spec-config"}, "SPECULATIVE_CONFIG", - string_format("list of speculative decoding types, separated by ';', optionally followed by a colon and a comma-separated list of key=value pairs\n(types: %s)\n", common_speculative_type_name_str().c_str()), - [](common_params & params, const std::string & value) { - const auto config_strings = string_split(value, ';'); - for (const auto & config_string : config_strings) { - const auto parts = string_split(config_string, ':'); - if (parts.size() < 1 || parts.size() > 2) { - throw std::invalid_argument("invalid speculative decoding config"); - } - const auto type_str = parts[0]; - const auto type = common_speculative_type_from_name(type_str); - if (type == COMMON_SPECULATIVE_TYPE_COUNT) { - throw std::invalid_argument(string_format("unknown speculative decoding type: %s", type_str.c_str())); - } - common_speculative_config spec_config = {type}; - if (parts.size() == 2) { - const auto key_value_pairs = string_split(parts[1], ','); - for (const auto & key_value_pair : key_value_pairs) { - const auto key_value = string_split(key_value_pair, '='); - if (key_value.size() != 2) { - throw 
std::invalid_argument("invalid key=value pair"); - } - const auto & key = key_value[0]; - const auto & value = key_value[1]; - spec_config.config[key] = value; - } - } - params.speculative.configs.push_back(spec_config); + {"--spec-ngram-check-rate"}, "N", + string_format("ngram check rate for ngram-simple/ngram-map speculative decoding (default: %d)", params.speculative.spec_ngram_check_rate), + [](common_params & params, int value) { + if (value < 1) { + throw std::invalid_argument("ngram check rate must be at least 1"); } + params.speculative.spec_ngram_check_rate = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--spec-ngram-min-hits"}, "N", + string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.spec_ngram_min_hits), + [](common_params & params, int value) { + if (value < 1) { + throw std::invalid_argument("ngram min hits must be at least 1"); + } + params.speculative.spec_ngram_min_hits = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( diff --git a/common/common.h b/common/common.h index e564d48532..38cc595697 100644 --- a/common/common.h +++ b/common/common.h @@ -253,14 +253,6 @@ struct common_params_model { std::string name = ""; // in format /[:] (tag is optional) // NOLINT }; -struct common_speculative_config { - common_speculative_type type; - std::map config; // map of incubative options (not yet in common_params) - - common_speculative_config(common_speculative_type t, - const std::map& c = {}) : type(t), config(c) {} -}; - struct common_params_speculative { std::vector devices; // devices to use for offloading @@ -284,10 +276,10 @@ struct common_params_speculative { // draftless: common_speculative_type draftless_type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding without a draft model - uint16_t spec_ngram_size_n = 12; // ngram size for lookup - uint16_t spec_ngram_size_m = 48; // mgram size for speculative tokens - - std::vector configs 
= {}; // list of speculative configs to try + uint16_t spec_ngram_size_n = 12; // ngram size for lookup + uint16_t spec_ngram_size_m = 48; // mgram size for speculative tokens + uint16_t spec_ngram_check_rate = 1; // check rate for ngram lookup + uint16_t spec_ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT diff --git a/common/speculative.cpp b/common/speculative.cpp index 2fa95ba55c..e9102678fa 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -36,6 +36,14 @@ const std::map common_speculative_typ {"ngram_cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE} }; +struct common_speculative_config { + common_speculative_type type; + common_params_speculative params; + + common_speculative_config(common_speculative_type t, + const common_params_speculative & p = common_params_speculative{}) : type(t), params(p) {} +}; + // state of an implementation of speculative decoding // // each implementation has a unique type and a state that is implementation-specific @@ -208,68 +216,24 @@ struct common_speculative { common_speculative_state * curr_impl = nullptr; // current implementation in use (for stats) }; -static common_ngram_map get_common_ngram_map(const common_speculative_config & config, uint16_t size_ngram, uint16_t size_mgram) { - uint16_t size_key = size_ngram; - uint16_t size_value = size_mgram; - bool key_only = false; - uint16_t check_rate = 2; - uint16_t min_hits = 1; - const std::map & cfg = config.config; - if (cfg.find("size_ngram") != cfg.end()) { - size_key = std::stoi(cfg.at("size_ngram")); - if (size_key < 1 || size_key > 1024) { - throw std::invalid_argument("size_ngram must be between 1 and 1024"); - } - } - if (cfg.find("size_mgram") != cfg.end()) { - size_value = std::stoi(cfg.at("size_mgram")); - if 
(size_value < 1 || size_value > 1024) { - throw std::invalid_argument("size_mgram must be between 1 and 1024"); - } - } - if (cfg.find("key_only") != cfg.end()) { - key_only = (cfg.at("key_only") == "true"); - } - if (cfg.find("check_rate") != cfg.end()) { - check_rate = std::stoi(cfg.at("check_rate")); - if (check_rate < 1 || check_rate > 1024) { - throw std::invalid_argument("check_rate must be between 1 and 1024"); - } - } - if (cfg.find("min_hits") != cfg.end()) { - min_hits = std::stoi(cfg.at("min_hits")); - if (min_hits < 1 || min_hits > 1024) { - throw std::invalid_argument("min_hits must be between 1 and 1024"); - } - } +static common_ngram_map get_common_ngram_map(const common_speculative_config & config) { + uint16_t size_key = config.params.spec_ngram_size_n; + uint16_t size_value = config.params.spec_ngram_size_m; + bool key_only = (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K); + uint16_t check_rate = config.params.spec_ngram_check_rate; + uint16_t min_hits = config.params.spec_ngram_min_hits; return common_ngram_map(size_key, size_value, key_only, check_rate, min_hits); } static struct common_speculative_state_ngram_cache create_state_ngram_cache( const std::string & path_static, const std::string & path_dynamic, const common_speculative_config & config) { - uint16_t n_draft = 8; + uint16_t n_draft = 8; // TODO get from config? + // TODO bool param in common/common.h to set save_static/save_dynamic? 
bool save_static = false; bool save_dynamic = false; - const std::map & cfg = config.config; - - if (cfg.find("n_draft") != cfg.end()) { - n_draft = std::stoi(cfg.at("n_draft")); - if (n_draft < 1 || n_draft > 1024) { - throw std::invalid_argument("ngram-cache: n_draft must be between 1 and 1024"); - } - } - - if (cfg.find("save_static") != cfg.end()) { - save_static = (cfg.at("save_static") == "true"); - } - - if (cfg.find("save_dynamic") != cfg.end()) { - save_dynamic = (cfg.at("save_dynamic") == "true"); - } - common_speculative_state_ngram_cache state(config.type, path_static, path_dynamic, n_draft, save_static, save_dynamic); return state; @@ -324,9 +288,42 @@ struct common_speculative * common_speculative_init( } } + // Compute the implementations to use based on the config and their order of preference + std::vector configs = {}; // list of speculative configs to try + { + bool has_draft =!params.model.path.empty(); + bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3 + bool has_ngram_cache = (params.draftless_type == COMMON_SPECULATIVE_TYPE_NGRAM_CACHE); + bool has_ngram_simple = (params.draftless_type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE); + bool has_ngram_map_k = (params.draftless_type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K); + bool has_ngram_map_k4v = (params.draftless_type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V); + // In a more complex implementation we could use the same implementation but with different parameters. + // This was initially used in PR-18471 but removed to simplify the code. + if (has_ngram_simple) { + // This implementation can guess a lot of tokens without any draft model. + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, params)); + } + if (has_ngram_map_k) { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, params)); + } + if (has_ngram_map_k4v) { + // This implementation can guess tokens with high acceptance rate but is more expensive. 
+ configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params)); + } + if (has_ngram_cache) { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params)); + } + if (has_draft) { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT, params)); + } + if (has_draft_eagle3) { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_EAGLE3, params)); + } + } + std::vector> implementations = {}; - for (const common_speculative_config & config : params.configs) { + for (const common_speculative_config & config : configs) { LOG_DBG("%s: adding implementation %s\n", __func__, common_speculative_type_to_str(config.type).c_str()); switch (config.type) { case COMMON_SPECULATIVE_TYPE_NONE: @@ -344,7 +341,7 @@ struct common_speculative * common_speculative_init( break; } case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: { - common_ngram_map ngram_map = get_common_ngram_map(config, params.spec_ngram_size_n, params.spec_ngram_size_m); + common_ngram_map ngram_map = get_common_ngram_map(config); uint16_t ngram_size_key = ngram_map.size_key; uint16_t mgram_size_value = ngram_map.size_value; @@ -365,14 +362,14 @@ struct common_speculative * common_speculative_init( case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: { implementations.push_back(std::make_unique( (config.type), - get_common_ngram_map(config, params.spec_ngram_size_n, params.spec_ngram_size_m) + get_common_ngram_map(config) )); break; } case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: { implementations.push_back(std::make_unique( (config.type), - get_common_ngram_map(config, params.spec_ngram_size_n, params.spec_ngram_size_m) + get_common_ngram_map(config) )); break; } diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 25b79c0788..86cb13f85f 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -77,6 +77,11 @@ json task_params::to_json(bool only_metrics) const { {"speculative.n_max", 
speculative.n_max}, {"speculative.n_min", speculative.n_min}, {"speculative.p_min", speculative.p_min}, + {"speculative.draftless_t", common_speculative_type_to_str(speculative.draftless_type)}, + {"speculative.ngram_size_n", speculative.spec_ngram_size_n}, + {"speculative.ngram_size_m", speculative.spec_ngram_size_m}, + {"speculative.ngram_c_rate", speculative.spec_ngram_check_rate}, + {"speculative.ngram_m_hits", speculative.spec_ngram_min_hits}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, {"backend_sampling", sampling.backend_sampling}, @@ -136,6 +141,11 @@ json task_params::to_json(bool only_metrics) const { {"speculative.n_max", speculative.n_max}, {"speculative.n_min", speculative.n_min}, {"speculative.p_min", speculative.p_min}, + {"speculative.draftless_t", common_speculative_type_to_str(speculative.draftless_type)}, + {"speculative.ngram_size_n", speculative.spec_ngram_size_n}, + {"speculative.ngram_size_m", speculative.spec_ngram_size_m}, + {"speculative.ngram_c_rate", speculative.spec_ngram_check_rate}, + {"speculative.ngram_m_hits", speculative.spec_ngram_min_hits}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, {"backend_sampling", sampling.backend_sampling}, @@ -239,45 +249,21 @@ task_params server_task::params_from_json_cmpl( params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min); - // TODO: is this needed? remove? - //params.speculative.self_mode = json_value(data, "speculative.self_mode", defaults.speculative.self_mode); - //params.speculative.self_cfg = json_value(data, "speculative.self_cfg", defaults.speculative.self_cfg); - // Set params.speculative.configs. 
Use json-array "speculative.configs" if provided in data, otherwise use {} - { - params.speculative.configs = defaults.speculative.configs; - const auto & configs = data.find("speculative.configs"); - if (configs != data.end() && configs->is_array()) { - params.speculative.configs.clear(); - for (const auto & config : *configs) { - if (config.is_object()) { - // config should have keys "type" and "config" (optional) - const auto & type = config.find("type"); - if (type != config.end() && type->is_string()) { - const auto type_name = type->get(); - const auto type_enum = common_speculative_type_from_name(type_name); - if (type_enum != COMMON_SPECULATIVE_TYPE_COUNT) { - common_speculative_config cfg(type_enum); - const auto & cfg_map = config.find("config"); - if (cfg_map != config.end() && cfg_map->is_object()) { - for (const auto & [key, value] : cfg_map->items()) { - cfg.config[key] = value.get(); - } - } - params.speculative.configs.push_back(cfg); - } else { - SRV_WRN("Unknown speculative type: %s\n", type_name.c_str()); - } - - } - } - } - } - } - params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min); params.speculative.n_min = std::max(params.speculative.n_min, 0); params.speculative.n_max = std::max(params.speculative.n_max, 0); + params.speculative.draftless_type = common_speculative_type_from_name(json_value(data, "speculative.draftless_t", common_speculative_type_to_str(defaults.speculative.draftless_type))); + params.speculative.spec_ngram_size_n = json_value(data, "speculative.ngram_size_n", defaults.speculative.spec_ngram_size_n); + params.speculative.spec_ngram_size_m = json_value(data, "speculative.ngram_size_m", defaults.speculative.spec_ngram_size_m); + params.speculative.spec_ngram_check_rate = json_value(data, "speculative.ngram_c_rate", defaults.speculative.spec_ngram_check_rate); + params.speculative.spec_ngram_min_hits = json_value(data, "speculative.ngram_m_hits", defaults.speculative.spec_ngram_min_hits); + + 
params.speculative.spec_ngram_size_n = std::min(std::max(1, (int) params.speculative.spec_ngram_size_n), 1024); + params.speculative.spec_ngram_size_m = std::min(std::max(1, (int) params.speculative.spec_ngram_size_m), 1024); + params.speculative.spec_ngram_check_rate = std::min(std::max(1, (int) params.speculative.spec_ngram_check_rate), 1024); + params.speculative.spec_ngram_min_hits = std::min(std::max(1, (int) params.speculative.spec_ngram_min_hits), 1024); + // Use OpenAI API logprobs only if n_probs wasn't provided if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){ params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs);