diff --git a/common/arg.cpp b/common/arg.cpp index 07ba719352..f1f4c12a3c 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2223,7 +2223,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex if (llama_supports_rpc()) { add_opt(common_arg( {"--rpc"}, "SERVERS", - "comma separated list of RPC servers (host:port)", + "comma-separated list of RPC servers (host:port)", [](common_params & params, const std::string & value) { add_rpc_devices(value); GGML_UNUSED(params); @@ -3555,7 +3555,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL")); add_opt(common_arg( {"--spec-type"}, common_speculative_all_types_str(), - string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n", + string_format("comma-separated list of types of speculative decoding to use (default: %s)\n", common_speculative_type_name_str(params.speculative.types).c_str()), [](common_params & params, const std::string & value) { const auto enabled_types = string_split(value, ','); diff --git a/common/common.h b/common/common.h index aafc376f2e..a3cd174395 100644 --- a/common/common.h +++ b/common/common.h @@ -157,9 +157,9 @@ enum common_params_sampling_config : uint64_t { enum common_speculative_type { COMMON_SPECULATIVE_TYPE_NONE, // no speculative decoding - COMMON_SPECULATIVE_TYPE_DRAFT, // draft model - COMMON_SPECULATIVE_TYPE_EAGLE3, // eagle draft model - COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding + COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, // standalone draft model speculative decoding + COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, // Eagle3 speculative decoding + COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding based on n-grams COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only 
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values COMMON_SPECULATIVE_TYPE_NGRAM_MOD, @@ -342,6 +342,7 @@ struct common_params_speculative_ngram_cache { struct common_params_speculative { std::vector types = { COMMON_SPECULATIVE_TYPE_NONE }; + // used by Simple, MTP, Eagle3, etc. - all methods that require some kind of draft model common_params_speculative_draft draft; common_params_speculative_ngram_mod ngram_mod; diff --git a/common/speculative.cpp b/common/speculative.cpp index e487e003d3..0eebcb3dcf 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -21,8 +21,8 @@ const std::map common_speculative_type_from_name_map = { {"none", COMMON_SPECULATIVE_TYPE_NONE}, - {"draft", COMMON_SPECULATIVE_TYPE_DRAFT}, - {"eagle3", COMMON_SPECULATIVE_TYPE_EAGLE3}, + {"draft-simple", COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE}, + {"draft-eagle3", COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3}, {"ngram-simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE}, {"ngram-map-k", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K}, {"ngram-map-k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V}, @@ -145,15 +145,15 @@ struct common_speculative_impl { virtual void accept(llama_seq_id seq_id, uint16_t n_accepted) = 0; }; -struct common_speculative_state_draft : public common_speculative_impl { +struct common_speculative_impl_draft_simple : public common_speculative_impl { common_params_speculative_draft params; llama_batch batch; std::vector smpls; - common_speculative_state_draft(const common_params_speculative & params, uint32_t n_seq) - : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT, n_seq) + common_speculative_impl_draft_simple(const common_params_speculative & params, uint32_t n_seq) + : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, n_seq) , params(params.draft) { auto * ctx_dft = this->params.ctx_dft; @@ -206,7 +206,7 @@ struct common_speculative_state_draft : public common_speculative_impl { } } - ~common_speculative_state_draft() 
override { + ~common_speculative_impl_draft_simple() override { llama_batch_free(batch); } @@ -340,11 +340,11 @@ struct common_speculative_state_draft : public common_speculative_impl { } }; -struct common_speculative_state_eagle3 : public common_speculative_impl { +struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { //common_params_speculative_eagle3 params; - common_speculative_state_eagle3(const common_params_speculative & /*params*/, uint32_t n_seq) - : common_speculative_impl(COMMON_SPECULATIVE_TYPE_EAGLE3, n_seq) {} + common_speculative_impl_draft_eagle3(const common_params_speculative & /*params*/, uint32_t n_seq) + : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) {} void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override { // noop @@ -365,13 +365,13 @@ struct common_speculative_state_eagle3 : public common_speculative_impl { }; // state of self-speculation (simple implementation, not ngram-map) -struct common_speculative_state_ngram_simple : public common_speculative_impl { +struct common_speculative_impl_ngram_simple : public common_speculative_impl { common_params_speculative_ngram_map params; // shared across all sequences common_ngram_simple_config config; - common_speculative_state_ngram_simple( + common_speculative_impl_ngram_simple( const common_params_speculative & params, uint32_t n_seq, common_ngram_simple_config config) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, n_seq) @@ -405,13 +405,13 @@ struct common_speculative_state_ngram_simple : public common_speculative_impl { } }; -struct common_speculative_state_ngram_map_k : public common_speculative_impl { +struct common_speculative_impl_ngram_map_k : public common_speculative_impl { common_params_speculative_ngram_map params; // n_seq configs std::vector config; - common_speculative_state_ngram_map_k( + common_speculative_impl_ngram_map_k( const common_params_speculative & params, const common_ngram_map & 
config, uint32_t n_seq) @@ -453,7 +453,7 @@ struct common_speculative_state_ngram_map_k : public common_speculative_impl { } }; -struct common_speculative_state_ngram_mod : public common_speculative_impl { +struct common_speculative_impl_ngram_mod : public common_speculative_impl { common_params_speculative_ngram_mod params; // shared across all sequences @@ -475,7 +475,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_impl { std::vector sinfos; - common_speculative_state_ngram_mod( + common_speculative_impl_ngram_mod( const common_params_speculative & params, uint32_t n_seq) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_MOD, n_seq) @@ -621,7 +621,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_impl { } }; -struct common_speculative_state_ngram_cache : public common_speculative_impl { +struct common_speculative_impl_ngram_cache : public common_speculative_impl { common_params_speculative_ngram_cache params; uint16_t n_draft; @@ -639,7 +639,7 @@ struct common_speculative_state_ngram_cache : public common_speculative_impl { std::vector sinfos; - common_speculative_state_ngram_cache( + common_speculative_impl_ngram_cache( const common_params_speculative & params, uint32_t n_seq, uint16_t n_draft, @@ -775,7 +775,7 @@ static common_ngram_map get_common_ngram_map( return common_ngram_map(size_key, size_value, key_only, min_hits); } -static common_speculative_state_ngram_cache create_state_ngram_cache( +static common_speculative_impl_ngram_cache create_state_ngram_cache( const common_speculative_config & config, uint32_t n_seq, const std::string & path_static, @@ -786,7 +786,7 @@ static common_speculative_state_ngram_cache create_state_ngram_cache( bool save_static = false; bool save_dynamic = false; - common_speculative_state_ngram_cache state(config.params, n_seq, n_draft, path_static, path_dynamic, save_static, save_dynamic); + common_speculative_impl_ngram_cache state(config.params, n_seq, n_draft, 
path_static, path_dynamic, save_static, save_dynamic); return state; } @@ -818,8 +818,8 @@ const char * common_speculative_all_types_str() { std::string common_speculative_type_to_str(common_speculative_type type) { switch (type) { case COMMON_SPECULATIVE_TYPE_NONE: return "none"; - case COMMON_SPECULATIVE_TYPE_DRAFT: return "draft"; - case COMMON_SPECULATIVE_TYPE_EAGLE3: return "eagle3"; + case COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE: return "draft-simple"; + case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3: return "draft-eagle3"; case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: return "ngram-simple"; case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: return "ngram-map-k"; case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram-map-k4v"; @@ -872,9 +872,9 @@ common_speculative * common_speculative_init(common_params_speculative & params, { uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types); - bool has_draft = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT)); - bool has_draft_model = !params.draft.mparams.path.empty(); + bool has_draft_model_path = !params.draft.mparams.path.empty(); + bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE)); // bool has_mtp = false; // TODO: add MTP here bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3 @@ -906,22 +906,22 @@ common_speculative * common_speculative_init(common_params_speculative & params, if (has_ngram_cache) { configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params)); } - if (has_draft) { - if (!has_draft_model) { + if (has_draft_simple) { + if (!has_draft_model_path) { - LOG_WRN("%s: draft model is not specified - cannot use 'draft' type\n", __func__); + LOG_WRN("%s: draft model is not specified - cannot use 'draft-simple' type\n", __func__); - has_draft = false; + has_draft_simple = false; } - } else if (has_draft_model) { + } else if (has_draft_model_path) { - LOG_WRN("%s: draft model is specified but 'draft' speculative type is not explicitly enabled - enabling it\n", __func__); + LOG_WRN("%s: draft model is specified but 'draft-simple' speculative type is not explicitly enabled - enabling it\n", __func__); - has_draft = true; +
has_draft_simple = true; } - if (has_draft) { - configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT, params)); + if (has_draft_simple) { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, params)); } // TODO: add MTP here if (has_draft_eagle3) { - configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_EAGLE3, params)); + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, params)); } } @@ -932,12 +932,12 @@ common_speculative * common_speculative_init(common_params_speculative & params, switch (config.type) { case COMMON_SPECULATIVE_TYPE_NONE: break; - case COMMON_SPECULATIVE_TYPE_DRAFT: { - impls.push_back(std::make_unique(config.params, n_seq)); + case COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE: { + impls.push_back(std::make_unique(config.params, n_seq)); break; } - case COMMON_SPECULATIVE_TYPE_EAGLE3: { - impls.push_back(std::make_unique(config.params, n_seq)); + case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3: { + impls.push_back(std::make_unique(config.params, n_seq)); break; } case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: { @@ -950,7 +950,7 @@ common_speculative * common_speculative_init(common_params_speculative & params, /* .size_ngram = */ ngram_size_key, /* .size_mgram = */ mgram_size_value }; - auto state = std::make_unique( + auto state = std::make_unique( /* .params = */ config.params, /* .n_seq = */ n_seq, /* .state = */ config_simple @@ -961,13 +961,13 @@ common_speculative * common_speculative_init(common_params_speculative & params, case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: { impls.push_back( - std::make_unique( + std::make_unique( config.params, get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq)); break; } case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: { impls.push_back( - std::make_unique(config.params, n_seq)); + std::make_unique(config.params, n_seq)); break; } case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: { @@ 
-975,7 +975,7 @@ common_speculative * common_speculative_init(common_params_speculative & params, config, n_seq, params.ngram_cache.lookup_cache_static, params.ngram_cache.lookup_cache_dynamic); - impls.push_back(std::make_unique(state)); + impls.push_back(std::make_unique(state)); break; } default: diff --git a/tools/cli/README.md b/tools/cli/README.md index 02c564a290..9f0574d25d 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -55,6 +55,7 @@ | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | +| `--rpc SERVERS` | comma-separated list of RPC servers (host:port)
(env: LLAMA_ARG_RPC) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | | `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | | `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | @@ -198,7 +199,7 @@ | `--spec-draft-device, -devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | | `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_MODEL) | -| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none)

(env: LLAMA_ARG_SPEC_TYPE) | +| `--spec-type none,draft-simple,draft-eagle3,ngram-simple,ngram-map-k,ngram-map-k4v,ngram-mod,ngram-cache` | comma-separated list of types of speculative decoding to use (default: none)

(env: LLAMA_ARG_SPEC_TYPE) | | `--spec-ngram-mod-n-min N` | minimum number of ngram tokens to use for ngram-based speculative decoding (default: 48) | | `--spec-ngram-mod-n-max N` | maximum number of ngram tokens to use for ngram-based speculative decoding (default: 64) | | `--spec-ngram-mod-n-match N` | ngram-mod lookup length (default: 24) | diff --git a/tools/completion/README.md b/tools/completion/README.md index 7042889db1..048cf7416f 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -138,6 +138,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | +| `--rpc SERVERS` | comma-separated list of RPC servers (host:port)
(env: LLAMA_ARG_RPC) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | | `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | | `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | diff --git a/tools/server/README.md b/tools/server/README.md index 7f856faa81..07e180929e 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -72,6 +72,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | +| `--rpc SERVERS` | comma-separated list of RPC servers (host:port)
(env: LLAMA_ARG_RPC) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | | `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | | `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | @@ -247,7 +248,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `--spec-draft-device, -devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | | `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_MODEL) | -| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none)

(env: LLAMA_ARG_SPEC_TYPE) | +| `--spec-type none,draft-simple,draft-eagle3,ngram-simple,ngram-map-k,ngram-map-k4v,ngram-mod,ngram-cache` | comma-separated list of types of speculative decoding to use (default: none)

(env: LLAMA_ARG_SPEC_TYPE) | | `--spec-ngram-mod-n-min N` | minimum number of ngram tokens to use for ngram-based speculative decoding (default: 48) | | `--spec-ngram-mod-n-max N` | maximum number of ngram tokens to use for ngram-based speculative decoding (default: 64) | | `--spec-ngram-mod-n-match N` | ngram-mod lookup length (default: 24) |