Server: Enable setting default sampling parameters via command-line (#8402 )

* Load server sampling parameters from the server context by default. * Wordsmithing comment
Update README.md to fix broken link to docs (#8399 )
2026-05-07 09:34:07 +00:00 · 2024-07-09 18:26:40 -04:00 · 2024-07-09 14:58:44 -04:00 · 2024-07-09 11:54:43 -04:00 · 2024-07-09 17:11:07 +02:00 · 2024-07-09 22:03:15 +08:00
9 changed files with 182 additions and 17 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -50,9 +50,6 @@ endif()
 # option list
 #

-# general
-option(LLAMA_CCACHE "llama: use ccache if available" ON)
-
 # debug
 option(LLAMA_ALL_WARNINGS           "llama: enable all compiler warnings"                   ON)
 option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
@@ -77,7 +74,6 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)

 # override ggml options
-set(GGML_CCACHE             ${LLAMA_CCACHE})
 set(GGML_SANITIZE_THREAD    ${LLAMA_SANITIZE_THREAD})
 set(GGML_SANITIZE_ADDRESS   ${LLAMA_SANITIZE_ADDRESS})
 set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
@@ -115,7 +111,10 @@ llama_option_depr(WARNING     LLAMA_SYCL_F16            GGML_SYCL_F16)
 # build the library
 #

-add_subdirectory(ggml)
+if (NOT TARGET ggml)
+    add_subdirectory(ggml)
+    # ... otherwise assume ggml is added by a parent CMakeLists.txt
+endif()
 add_subdirectory(src)

 #
--- a/90
+++ b/90
@@ -64,10 +64,14 @@ TEST_TARGETS = \
 	tests/test-tokenizer-1-spm

 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
-LEGACY_TARGETS = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
 	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm

+# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
+#  We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
+LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune
+
 # Deprecation aliases
 ifdef LLAMA_CUBLAS
 $(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
@@ -193,7 +197,7 @@ ifdef GGML_RPC
 	BUILD_TARGETS += rpc-server
 endif

-default: $(BUILD_TARGETS)
+default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)

 test: $(TEST_TARGETS)
 	@failures=0; \
@@ -228,7 +232,7 @@ test: $(TEST_TARGETS)
 	fi
 	@echo 'All tests passed.'

-all: $(BUILD_TARGETS) $(TEST_TARGETS)
+all: $(BUILD_TARGETS) $(TEST_TARGETS) $(LEGACY_TARGETS_BUILD)

 ifdef RISCV_CROSS_COMPILE
 CC	:= riscv64-unknown-linux-gnu-gcc
@@ -245,17 +249,22 @@ MK_CFLAGS    = -std=c11   -fPIC
 MK_CXXFLAGS  = -std=c++11 -fPIC
 MK_NVCCFLAGS = -std=c++11

-ifndef LLAMA_NO_CCACHE
+ifdef LLAMA_NO_CCACHE
+GGML_NO_CCACHE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifndef GGML_NO_CCACHE
 CCACHE := $(shell which ccache)
 ifdef CCACHE
 export CCACHE_SLOPPINESS = time_macros
-$(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.)
+$(info I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.)
 CC    := $(CCACHE) $(CC)
 CXX   := $(CCACHE) $(CXX)
 else
 $(info I ccache not found. Consider installing it for faster compilation.)
 endif # CCACHE
-endif # LLAMA_NO_CCACHE
+endif # GGML_NO_CCACHE

 # clock_gettime came in POSIX.1b (1993)
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
@@ -926,6 +935,7 @@ $(info   - LLAMA_NO_LLAMAFILE)
 $(info   - LLAMA_NO_ACCELERATE)
 $(info   - LLAMA_NO_OPENMP)
 $(info   - LLAMA_NO_METAL)
+$(info   - LLAMA_NO_CCACHE)
 $(info )
 endif

@@ -1092,7 +1102,7 @@ clean:
 	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
 	rm -rvf $(BUILD_TARGETS)
 	rm -rvf $(TEST_TARGETS)
-	rm -rvf $(LEGACY_TARGETS)
+	rm -rvf $(LEGACY_TARGETS_CLEAN)
 	find examples pocs -type f -name "*.o" -delete

 #
@@ -1488,3 +1498,69 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+#
+# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
+#
+# Mark legacy binary targets as .PHONY so that they are always checked.
+.PHONY: main quantize perplexity embedding server finetune
+
+main: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard main))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'main' binary is deprecated. Please use 'llama-cli' instead."
+	@echo "  Remove the 'main' binary to remove this warning."
+	@echo "#########"
+endif
+
+quantize: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard quantize))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
+	@echo "  Remove the 'quantize' binary to remove this warning."
+	@echo "#########"
+endif
+
+perplexity: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard perplexity))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
+	@echo "  Remove the 'perplexity' binary to remove this warning."
+	@echo "#########"
+endif
+
+embedding: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard embedding))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
+	@echo "  Remove the 'embedding' binary to remove this warning."
+	@echo "#########"
+endif
+
+server: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard server))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'server' binary is deprecated. Please use 'llama-server' instead."
+	@echo "  Remove the 'server' binary to remove this warning."
+	@echo "#########"
+endif
+
+finetune: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard finetune))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead."
+	@echo "  Remove the 'finetune' binary to remove this warning."
+	@echo "#########"
+endif
--- a/README.md
+++ b/README.md
@@ -453,7 +453,7 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 - [How to build](./docs/build.md)
 - [Running on Docker](./docs/docker.md)
 - [Build on Android](./docs/android.md)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
+- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)

 **Seminal papers and background on the models**
--- a/examples/deprecation-warning/README.md
+++ b/examples/deprecation-warning/README.md
@@ -0,0 +1,51 @@
+# Migration notice for binary filenames
+
+> [!IMPORTANT]
+[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
+
+This migration was important, but it is a breaking change that may not always be immediately obvious to users.
+
+Please update all scripts and workflows to use the new binary names.
+
+| Old Filename | New Filename |
+| ---- | ---- |
+| main | llama-cli |
+| server | llama-server |
+| llama-bench | llama-bench |
+| embedding | llama-embedding |
+| finetune | llama-finetune |
+| quantize | llama-quantize |
+| tokenize | llama-tokenize |
+| export-lora | llama-export-lora |
+| libllava.a | libllava.a |
+| baby-llama | llama-baby-llama |
+| batched | llama-batched |
+| batched-bench | llama-batched-bench |
+| benchmark-matmult | llama-benchmark-matmult |
+| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml |
+| eval-callback | llama-eval-callback |
+| gbnf-validator | llama-gbnf-validator |
+| gguf | llama-gguf |
+| gguf-split | llama-gguf-split |
+| gritlm | llama-gritlm |
+| imatrix | llama-imatrix |
+| infill | llama-infill |
+| llava-cli | llama-llava-cli |
+| lookahead | llama-lookahead |
+| lookup | llama-lookup |
+| lookup-create | llama-lookup-create |
+| lookup-merge | llama-lookup-merge |
+| lookup-stats | llama-lookup-stats |
+| parallel | llama-parallel |
+| passkey | llama-passkey |
+| perplexity | llama-perplexity |
+| q8dot | llama-q8dot |
+| quantize-stats | llama-quantize-stats |
+| retrieval | llama-retrieval |
+| save-load-state | llama-save-load-state |
+| simple | llama-simple |
+| speculative | llama-speculative |
+| train-text-from-scratch | llama-train-text-from-scratch |
+| vdot | llama-vdot |
+| tests/test-c.o | tests/test-c.o |
+
--- a/examples/deprecation-warning/deprecation-warning.cpp
+++ b/examples/deprecation-warning/deprecation-warning.cpp
@@ -0,0 +1,35 @@
+// Warns users that this filename was deprecated, and provides a link for more information.
+
+#include <cstdio>
+#include <string>
+#include <unordered_map>
+
+// Main
+int main(int argc, char** argv) {
+    std::string filename = "main";
+    if (argc >= 1) {
+        filename = argv[0];
+    }
+
+    // Get only the program name from the full path
+    auto pos = filename.find_last_of('/');
+    if (pos != std::string::npos) {
+        filename = filename.substr(pos+1);
+    }
+
+    // Append "llama-" to the beginning of filename to get the replacemnt filename
+    auto replacement_filename = "llama-" + filename;
+
+    // The exception is if the filename is "main", then our replacement filename is "llama-cli"
+    if (filename == "main") {
+        replacement_filename = "llama-cli";
+    }
+
+    fprintf(stdout, "\n");
+    fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
+    fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
+    fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
+    fprintf(stdout, "\n");
+
+    return EXIT_FAILURE;
+}
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -884,7 +884,8 @@ struct server_context {

    bool launch_slot_with_task(server_slot & slot, const server_task & task) {
        slot_params default_params;
-        llama_sampling_params default_sparams;
+        // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
+        llama_sampling_params default_sparams = params.sparams;
        auto & data = task.data;

        if (data.count("__oaicompat") != 0) {
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -3658,6 +3658,10 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
    use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
 #endif // SYCL_USE_XMX

+    // mmvq path is faster in the CUDA backend.
+    if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda)
+        use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+
    if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
        // KQ single-batch
        ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst);
--- a/gguf-py/gguf/lazy.py
+++ b/gguf-py/gguf/lazy.py
@@ -6,7 +6,6 @@ from typing import Any, Callable
 from collections import deque

 import numpy as np
-from numpy._typing import _Shape
 from numpy.typing import DTypeLike


@@ -219,7 +218,7 @@ class LazyNumpyTensor(LazyBase):
    _tensor_type = np.ndarray

    @classmethod
-    def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: _Shape) -> np.ndarray[Any, Any]:
+    def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
        # The initial idea was to use np.nan as the fill value,
        # but non-float types like np.int16 can't use that.
        # So zero it is.
--- a/grammars/README.md
+++ b/grammars/README.md
@@ -4,7 +4,7 @@ GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.

 ## Background

-[Bakus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.
+[Backus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.

 ## Basics
Author	SHA1	Message	Date
Clint Herron	a59f8fdc85	Server: Enable setting default sampling parameters via command-line (#8402 ) * Load server sampling parameters from the server context by default. * Wordsmithing comment	2024-07-09 18:26:40 -04:00
Andy Salerno	fd560fe680	Update README.md to fix broken link to docs (#8399 ) Update the "Performance troubleshooting" doc link to be correct - the file was moved into a dir called 'development'	2024-07-09 14:58:44 -04:00
Clint Herron	e500d6135a	Deprecation warning to assist with migration to new binary names (#8283 ) * Adding a simple program to provide a deprecation warning that can exist to help people notice the binary name change from #7809 and migrate to the new filenames. * Build legacy replacement binaries only if they already exist. Check for their existence every time so that they are not ignored.	2024-07-09 11:54:43 -04:00
Johannes Gäßler	a03e8dd99d	make/cmake: LLAMA_NO_CCACHE -> GGML_NO_CCACHE (#8392 )	2024-07-09 17:11:07 +02:00
Alberto Cabrera Pérez	5b0b8d8cfb	sycl : Reenabled mmvq path for the SYCL Nvidia Backend (#8372 ) * SYCL : Reenabled mmvq path for the SYCL Nvidia Backend * Reduced verbosity of comment	2024-07-09 22:03:15 +08:00
Borislav Stanimirov	9925ca4087	cmake : allow external ggml (#8370 )	2024-07-09 11:38:00 +03:00
daghanerdonmez	9beb2dda03	readme : fix typo [no ci] (#8389 ) Bakus-Naur --> Backus-Naur	2024-07-09 09:16:00 +03:00
compilade	7d0e23d72e	gguf-py : do not use internal numpy types (#7472 )	2024-07-09 01:04:49 -04:00