diff --git a/common/arg.cpp b/common/arg.cpp index 7c4b5d42f0..ae052f7a5a 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -626,43 +626,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context if (!params.speculative.tensor_buft_overrides.empty()) { params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr}); } - { - bool has_draft =!params.speculative.model.path.empty(); - bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3 - bool has_lookup_caches = !params.speculative.lookup_cache_static.empty() - && !params.speculative.lookup_cache_dynamic.empty(); - bool has_simple = (params.speculative.draftless_type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE); - bool found_config_draft = false; - bool found_config_eagle3 = false; - bool found_config_ngram_cache = false; - bool found_config_ngram_simple = false; - for (const auto & config : params.speculative.configs) { - if (config.type == COMMON_SPECULATIVE_TYPE_DRAFT) { - found_config_draft = true; - } - if (config.type == COMMON_SPECULATIVE_TYPE_EAGLE3) { - found_config_eagle3 = true; - } - if (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_CACHE) { - found_config_ngram_cache = true; - } - if (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE) { - found_config_ngram_simple = true; - } - } - if (has_simple && !found_config_ngram_simple) { - params.speculative.configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE)); - } - if (has_lookup_caches && !found_config_ngram_cache) { - params.speculative.configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE)); - } - if (has_draft && !found_config_draft) { - params.speculative.configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT)); - } - if (has_draft_eagle3 && !found_config_eagle3) { - params.speculative.configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_EAGLE3)); - } - } if (!params.chat_template.empty() && 
!common_chat_verify_template(params.chat_template, params.use_jinja)) { throw std::runtime_error(string_format( @@ -3431,17 +3394,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"--spec-draftless"}, "[none|ngram-cache|ngram-simple]", + {"--spec-draftless"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]", string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n", common_speculative_type_to_str(params.speculative.draftless_type).c_str()), [](common_params & params, const std::string & value) { if (value == "none") { params.speculative.draftless_type = COMMON_SPECULATIVE_TYPE_NONE; } else if (value == "ngram-cache") { - // TODO: this does nothing atm params.speculative.draftless_type = COMMON_SPECULATIVE_TYPE_NGRAM_CACHE; } else if (value == "ngram-simple") { params.speculative.draftless_type = COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE; + } else if (value == "ngram-map-k") { + params.speculative.draftless_type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K; + } else if (value == "ngram-map-k4v") { + params.speculative.draftless_type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V; } else { throw std::invalid_argument("unknown speculative decoding type without draft model"); } @@ -3449,7 +3415,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--spec-ngram-size-n"}, "N", - string_format("ngram size N for ngram-map speculative decoding, length of lookup n-gram (default: %d)", params.speculative.spec_ngram_size_n), + string_format("ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)", params.speculative.spec_ngram_size_n), [](common_params & params, int value) { if (value < 1 || value > 1024) { throw std::invalid_argument("ngram size N must be 
between 1 and 1024 inclusive"); @@ -3459,7 +3425,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--spec-ngram-size-m"}, "N", - string_format("ngram size M for ngram-map speculative decoding, length of draft m-gram (default: %d)", params.speculative.spec_ngram_size_m), + string_format("ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)", params.speculative.spec_ngram_size_m), [](common_params & params, int value) { if (value < 1 || value > 1024) { throw std::invalid_argument("ngram size M must be between 1 and 1024 inclusive"); @@ -3468,35 +3434,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( - {"--spec-config"}, "SPECULATIVE_CONFIG", - string_format("list of speculative decoding types, separated by ';', optionally followed by a colon and a comma-separated list of key=value pairs\n(types: %s)\n", common_speculative_type_name_str().c_str()), - [](common_params & params, const std::string & value) { - const auto config_strings = string_split(value, ';'); - for (const auto & config_string : config_strings) { - const auto parts = string_split(config_string, ':'); - if (parts.size() < 1 || parts.size() > 2) { - throw std::invalid_argument("invalid speculative decoding config"); - } - const auto type_str = parts[0]; - const auto type = common_speculative_type_from_name(type_str); - if (type == COMMON_SPECULATIVE_TYPE_COUNT) { - throw std::invalid_argument(string_format("unknown speculative decoding type: %s", type_str.c_str())); - } - common_speculative_config spec_config = {type}; - if (parts.size() == 2) { - const auto key_value_pairs = string_split(parts[1], ','); - for (const auto & key_value_pair : key_value_pairs) { - const auto key_value = string_split(key_value_pair, '='); - if (key_value.size() != 2) { - throw 
std::invalid_argument("invalid key=value pair"); - } - const auto & key = key_value[0]; - const auto & value = key_value[1]; - spec_config.config[key] = value; - } - } - params.speculative.configs.push_back(spec_config); + {"--spec-ngram-check-rate"}, "N", + string_format("ngram check rate for ngram-simple/ngram-map speculative decoding (default: %d)", params.speculative.spec_ngram_check_rate), + [](common_params & params, int value) { + if (value < 1) { + throw std::invalid_argument("ngram check rate must be at least 1"); } + params.speculative.spec_ngram_check_rate = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--spec-ngram-min-hits"}, "N", + string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.spec_ngram_min_hits), + [](common_params & params, int value) { + if (value < 1) { + throw std::invalid_argument("ngram min hits must be at least 1"); + } + params.speculative.spec_ngram_min_hits = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( diff --git a/common/common.h b/common/common.h index e564d48532..38cc595697 100644 --- a/common/common.h +++ b/common/common.h @@ -253,14 +253,6 @@ struct common_params_model { std::string name = ""; // in format /[:] (tag is optional) // NOLINT }; -struct common_speculative_config { - common_speculative_type type; - std::map config; // map of incubative options (not yet in common_params) - - common_speculative_config(common_speculative_type t, - const std::map& c = {}) : type(t), config(c) {} -}; - struct common_params_speculative { std::vector devices; // devices to use for offloading @@ -284,10 +276,10 @@ struct common_params_speculative { // draftless: common_speculative_type draftless_type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding without a draft model - uint16_t spec_ngram_size_n = 12; // ngram size for lookup - uint16_t spec_ngram_size_m = 48; // mgram size for speculative tokens - - std::vector configs 
= {}; // list of speculative configs to try + uint16_t spec_ngram_size_n = 12; // ngram size for lookup + uint16_t spec_ngram_size_m = 48; // mgram size for speculative tokens + uint16_t spec_ngram_check_rate = 1; // check rate for ngram lookup + uint16_t spec_ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT diff --git a/common/speculative.cpp b/common/speculative.cpp index 2fa95ba55c..e9102678fa 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -36,6 +36,14 @@ const std::map common_speculative_typ {"ngram_cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE} }; +struct common_speculative_config { + common_speculative_type type; + common_params_speculative params; + + common_speculative_config(common_speculative_type t, + const common_params_speculative & p = common_params_speculative{}) : type(t), params(p) {} +}; + // state of an implementation of speculative decoding // // each implementation has a unique type and a state that is implementation-specific @@ -208,68 +216,24 @@ struct common_speculative { common_speculative_state * curr_impl = nullptr; // current implementation in use (for stats) }; -static common_ngram_map get_common_ngram_map(const common_speculative_config & config, uint16_t size_ngram, uint16_t size_mgram) { - uint16_t size_key = size_ngram; - uint16_t size_value = size_mgram; - bool key_only = false; - uint16_t check_rate = 2; - uint16_t min_hits = 1; - const std::map & cfg = config.config; - if (cfg.find("size_ngram") != cfg.end()) { - size_key = std::stoi(cfg.at("size_ngram")); - if (size_key < 1 || size_key > 1024) { - throw std::invalid_argument("size_ngram must be between 1 and 1024"); - } - } - if (cfg.find("size_mgram") != cfg.end()) { - size_value = std::stoi(cfg.at("size_mgram")); - if 
(size_value < 1 || size_value > 1024) { - throw std::invalid_argument("size_mgram must be between 1 and 1024"); - } - } - if (cfg.find("key_only") != cfg.end()) { - key_only = (cfg.at("key_only") == "true"); - } - if (cfg.find("check_rate") != cfg.end()) { - check_rate = std::stoi(cfg.at("check_rate")); - if (check_rate < 1 || check_rate > 1024) { - throw std::invalid_argument("check_rate must be between 1 and 1024"); - } - } - if (cfg.find("min_hits") != cfg.end()) { - min_hits = std::stoi(cfg.at("min_hits")); - if (min_hits < 1 || min_hits > 1024) { - throw std::invalid_argument("min_hits must be between 1 and 1024"); - } - } +static common_ngram_map get_common_ngram_map(const common_speculative_config & config) { + uint16_t size_key = config.params.spec_ngram_size_n; + uint16_t size_value = config.params.spec_ngram_size_m; + bool key_only = (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K); + uint16_t check_rate = config.params.spec_ngram_check_rate; + uint16_t min_hits = config.params.spec_ngram_min_hits; return common_ngram_map(size_key, size_value, key_only, check_rate, min_hits); } static struct common_speculative_state_ngram_cache create_state_ngram_cache( const std::string & path_static, const std::string & path_dynamic, const common_speculative_config & config) { - uint16_t n_draft = 8; + uint16_t n_draft = 8; // TODO get from config? + // TODO bool param in common/common.h to set save_static/save_dynamic? 
bool save_static = false; bool save_dynamic = false; - const std::map & cfg = config.config; - - if (cfg.find("n_draft") != cfg.end()) { - n_draft = std::stoi(cfg.at("n_draft")); - if (n_draft < 1 || n_draft > 1024) { - throw std::invalid_argument("ngram-cache: n_draft must be between 1 and 1024"); - } - } - - if (cfg.find("save_static") != cfg.end()) { - save_static = (cfg.at("save_static") == "true"); - } - - if (cfg.find("save_dynamic") != cfg.end()) { - save_dynamic = (cfg.at("save_dynamic") == "true"); - } - common_speculative_state_ngram_cache state(config.type, path_static, path_dynamic, n_draft, save_static, save_dynamic); return state; @@ -324,9 +288,42 @@ struct common_speculative * common_speculative_init( } } + // Compute the implementations to use based on the config and their order of preference + std::vector configs = {}; // list of speculative configs to try + { + bool has_draft =!params.model.path.empty(); + bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3 + bool has_ngram_cache = (params.draftless_type == COMMON_SPECULATIVE_TYPE_NGRAM_CACHE); + bool has_ngram_simple = (params.draftless_type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE); + bool has_ngram_map_k = (params.draftless_type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K); + bool has_ngram_map_k4v = (params.draftless_type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V); + // In a more complex implementation we could use the same implementation but with different parameters. + // This was initially used in PR-18471 but removed to simplify the code. + if (has_ngram_simple) { + // This implementation can guess a lot of tokens without any draft model. + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, params)); + } + if (has_ngram_map_k) { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, params)); + } + if (has_ngram_map_k4v) { + // This implementation can guess tokens with high acceptance rate but is more expensive. 
+ configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params)); + } + if (has_ngram_cache) { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params)); + } + if (has_draft) { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT, params)); + } + if (has_draft_eagle3) { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_EAGLE3, params)); + } + } + std::vector> implementations = {}; - for (const common_speculative_config & config : params.configs) { + for (const common_speculative_config & config : configs) { LOG_DBG("%s: adding implementation %s\n", __func__, common_speculative_type_to_str(config.type).c_str()); switch (config.type) { case COMMON_SPECULATIVE_TYPE_NONE: @@ -344,7 +341,7 @@ struct common_speculative * common_speculative_init( break; } case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: { - common_ngram_map ngram_map = get_common_ngram_map(config, params.spec_ngram_size_n, params.spec_ngram_size_m); + common_ngram_map ngram_map = get_common_ngram_map(config); uint16_t ngram_size_key = ngram_map.size_key; uint16_t mgram_size_value = ngram_map.size_value; @@ -365,14 +362,14 @@ struct common_speculative * common_speculative_init( case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: { implementations.push_back(std::make_unique( (config.type), - get_common_ngram_map(config, params.spec_ngram_size_n, params.spec_ngram_size_m) + get_common_ngram_map(config) )); break; } case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: { implementations.push_back(std::make_unique( (config.type), - get_common_ngram_map(config, params.spec_ngram_size_n, params.spec_ngram_size_m) + get_common_ngram_map(config) )); break; } diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 25b79c0788..86cb13f85f 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -77,6 +77,11 @@ json task_params::to_json(bool only_metrics) const { {"speculative.n_max", 
speculative.n_max}, {"speculative.n_min", speculative.n_min}, {"speculative.p_min", speculative.p_min}, + {"speculative.draftless_t", common_speculative_type_to_str(speculative.draftless_type)}, + {"speculative.ngram_size_n", speculative.spec_ngram_size_n}, + {"speculative.ngram_size_m", speculative.spec_ngram_size_m}, + {"speculative.ngram_c_rate", speculative.spec_ngram_check_rate}, + {"speculative.ngram_m_hits", speculative.spec_ngram_min_hits}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, {"backend_sampling", sampling.backend_sampling}, @@ -136,6 +141,11 @@ json task_params::to_json(bool only_metrics) const { {"speculative.n_max", speculative.n_max}, {"speculative.n_min", speculative.n_min}, {"speculative.p_min", speculative.p_min}, + {"speculative.draftless_t", common_speculative_type_to_str(speculative.draftless_type)}, + {"speculative.ngram_size_n", speculative.spec_ngram_size_n}, + {"speculative.ngram_size_m", speculative.spec_ngram_size_m}, + {"speculative.ngram_c_rate", speculative.spec_ngram_check_rate}, + {"speculative.ngram_m_hits", speculative.spec_ngram_min_hits}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, {"backend_sampling", sampling.backend_sampling}, @@ -239,45 +249,21 @@ task_params server_task::params_from_json_cmpl( params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min); - // TODO: is this needed? remove? - //params.speculative.self_mode = json_value(data, "speculative.self_mode", defaults.speculative.self_mode); - //params.speculative.self_cfg = json_value(data, "speculative.self_cfg", defaults.speculative.self_cfg); - // Set params.speculative.configs. 
Use json-array "speculative.configs" if provided in data, otherwise use {} - { - params.speculative.configs = defaults.speculative.configs; - const auto & configs = data.find("speculative.configs"); - if (configs != data.end() && configs->is_array()) { - params.speculative.configs.clear(); - for (const auto & config : *configs) { - if (config.is_object()) { - // config should have keys "type" and "config" (optional) - const auto & type = config.find("type"); - if (type != config.end() && type->is_string()) { - const auto type_name = type->get(); - const auto type_enum = common_speculative_type_from_name(type_name); - if (type_enum != COMMON_SPECULATIVE_TYPE_COUNT) { - common_speculative_config cfg(type_enum); - const auto & cfg_map = config.find("config"); - if (cfg_map != config.end() && cfg_map->is_object()) { - for (const auto & [key, value] : cfg_map->items()) { - cfg.config[key] = value.get(); - } - } - params.speculative.configs.push_back(cfg); - } else { - SRV_WRN("Unknown speculative type: %s\n", type_name.c_str()); - } - - } - } - } - } - } - params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min); params.speculative.n_min = std::max(params.speculative.n_min, 0); params.speculative.n_max = std::max(params.speculative.n_max, 0); + params.speculative.draftless_type = common_speculative_type_from_name(json_value(data, "speculative.draftless_t", common_speculative_type_to_str(defaults.speculative.draftless_type))); + params.speculative.spec_ngram_size_n = json_value(data, "speculative.ngram_size_n", defaults.speculative.spec_ngram_size_n); + params.speculative.spec_ngram_size_m = json_value(data, "speculative.ngram_size_m", defaults.speculative.spec_ngram_size_m); + params.speculative.spec_ngram_check_rate = json_value(data, "speculative.ngram_c_rate", defaults.speculative.spec_ngram_check_rate); + params.speculative.spec_ngram_min_hits = json_value(data, "speculative.ngram_m_hits", defaults.speculative.spec_ngram_min_hits); + + 
params.speculative.spec_ngram_size_n = std::min(std::max(1, (int) params.speculative.spec_ngram_size_n), 1024); + params.speculative.spec_ngram_size_m = std::min(std::max(1, (int) params.speculative.spec_ngram_size_m), 1024); + params.speculative.spec_ngram_check_rate = std::min(std::max(1, (int) params.speculative.spec_ngram_check_rate), 1024); + params.speculative.spec_ngram_min_hits = std::min(std::max(1, (int) params.speculative.spec_ngram_min_hits), 1024); + // Use OpenAI API logprobs only if n_probs wasn't provided if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){ params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs);