server : add --log-disable to disable logging to file (#4260 )

* * add --log-disable to disable logging to file in the server example * * typo fix
server : add single-client multi-prompt support (#4232 )
2026-05-05 16:44:11 +00:00 · 2023-12-01 00:25:49 +02:00 · 2023-12-01 00:25:04 +02:00 · 2023-12-01 00:23:44 +02:00 · 2023-12-01 00:23:08 +02:00 · 2023-11-30 23:11:14 +01:00
20 changed files with 307 additions and 129 deletions
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    ./quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
    ./main "$@"
+elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
+    ./finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
    echo "Converting PTH to GGML..."
    for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -34,6 +36,8 @@ else
    echo "              ex: --outtype f16 \"/models/7B/\" "
    echo "  --quantize (-q): Optimize with quantization process ggml"
    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
+    echo "  --finetune (-f): Run finetune command to create a lora finetune of the model"
+    echo "              See documentation for finetune for command-line parameters"
    echo "  --all-in-one (-a): Execute --convert & --quantize"
    echo "              ex: \"/models/\" 7B"
    echo "  --server (-s): Run a model on the server"
--- a/.gitignore
+++ b/.gitignore
@@ -88,15 +88,16 @@ poetry.lock
 poetry.toml

 # Test binaries
-tests/test-grammar-parser
-tests/test-llama-grammar
-tests/test-double-float
-tests/test-grad0
-tests/test-opt
-tests/test-quantize-fns
-tests/test-quantize-perf
-tests/test-sampling
-tests/test-tokenizer-0-llama
-tests/test-tokenizer-0-falcon
-tests/test-tokenizer-1-llama
-tests/test-tokenizer-1-bpe
+/tests/test-grammar-parser
+/tests/test-llama-grammar
+/tests/test-double-float
+/tests/test-grad0
+/tests/test-opt
+/tests/test-quantize-fns
+/tests/test-quantize-perf
+/tests/test-sampling
+/tests/test-tokenizer-0-llama
+/tests/test-tokenizer-0-falcon
+/tests/test-tokenizer-1-llama
+/tests/test-tokenizer-1-bpe
+/tests/test-rope
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,7 @@ else()
 endif()

 # general
+option(BUILD_SHARED_LIBS                "build shared libraries"                                OFF)
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
 option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
 option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)
@@ -100,6 +101,9 @@ option(LLAMA_BUILD_TESTS                "llama: build tests"    ${LLAMA_STANDALO
 option(LLAMA_BUILD_EXAMPLES             "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER               "llama: build server example"                           ON)

+# Required for relocatable CMake package
+include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
+
 #
 # Compile flags
 #
@@ -161,7 +165,7 @@ if (LLAMA_METAL)
    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")

    # copy ggml-metal.metal to bin directory
-    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
+    configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)

    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
        ${FOUNDATION_LIBRARY}
--- a/25
+++ b/25
@@ -8,7 +8,7 @@ BUILD_TARGETS = \
 TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
+	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -30,7 +30,7 @@ ifeq '' '$(findstring clang,$(shell $(CC) --version))'
 	CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
 else
 	CC_IS_CLANG=1
-	ifeq '' '$(findstring Apple LLVM,$(shell $(CC) --version))'
+	ifeq '' '$(findstring Apple,$(shell $(CC) --version))'
 		CC_IS_LLVM_CLANG=1
 	else
 		CC_IS_APPLE_CLANG=1
@@ -648,7 +648,7 @@ beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS)
 finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
@@ -701,28 +701,28 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

-tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
@@ -737,5 +737,8 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
 tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 tests/test-c.o: tests/test-c.c llama.h
 	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
--- a/README.md
+++ b/README.md
@@ -117,6 +117,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
 - [withcatai/catai](https://github.com/withcatai/catai)
 - [semperai/amica](https://github.com/semperai/amica)
+- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)

 ---

@@ -323,7 +324,7 @@ mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128

 ### BLAS Build

-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:

 - #### Accelerate Framework:

@@ -895,7 +896,7 @@ Additionally, there the following images, similar to the above:
 - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)

-The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the Gitlab Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
+The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).

 #### Usage

--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -11,7 +11,12 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
    if(NOT IS_DIRECTORY "${GIT_DIR}")
        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
-        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+        string(FIND "${REAL_GIT_DIR}" "/" SLASH_POS)
+        if (SLASH_POS EQUAL 0)
+            set(GIT_DIR "${REAL_GIT_DIR}")
+        else()
+            set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+        endif()
    endif()

    set(GIT_INDEX "${GIT_DIR}/index")
@@ -26,7 +31,7 @@ add_custom_command(
    COMMENT "Generating build details from Git"
    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
    VERBATIM
--- a/convert.py
+++ b/convert.py
@@ -267,7 +267,7 @@ class Params:
            n_ctx = 2048

        return Params(
-            n_vocab          = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
+            n_vocab          = model["tok_embeddings.weight"].shape[0],
            n_embd           = config["dim"],
            n_layer          = config["n_layers"],
            n_ctx            = n_ctx,
--- a/examples/batched.swift/README.md
+++ b/examples/batched.swift/README.md
@@ -1,4 +1,4 @@
 This is a swift clone of `examples/batched`.

 $ `make`
-$ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
+$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -5,7 +5,7 @@ import json
 import torch
 import numpy as np
 from gguf import *
-from transformers import CLIPModel, CLIPProcessor
+from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel

 TEXT = "clip.text"
 VISION = "clip.vision"
@@ -78,11 +78,19 @@ ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
 ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
+ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
+                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
+# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
+default_image_mean = [0.48145466, 0.4578275, 0.40821073]
+default_image_std = [0.26862954, 0.26130258, 0.27577711]
+ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
+ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)

+# with proper
 args = ap.parse_args()


@@ -96,15 +104,22 @@ if args.use_f32:
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir

-
-with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
-    vocab = json.load(f)
-    tokens = [key for key in vocab]
+if args.clip_model_is_vision:
+    vocab = None
+    tokens = None
+else:
+    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+        vocab = json.load(f)
+        tokens = [key for key in vocab]

 with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    config = json.load(f)
-    v_hparams = config["vision_config"]
-    t_hparams = config["text_config"]
+    if args.clip_model_is_vision:
+        v_hparams = config
+        t_hparams = None
+    else:
+        v_hparams = config["vision_config"]
+        t_hparams = config["text_config"]

 # possible data types
 #   ftype == 0 -> float32
@@ -117,9 +132,12 @@ ftype = 1
 if args.use_f32:
    ftype = 0

-
-model = CLIPModel.from_pretrained(dir_model)
-processor = CLIPProcessor.from_pretrained(dir_model)
+if args.clip_model_is_vision:
+    model = CLIPVisionModel.from_pretrained(dir_model)
+    processor = None
+else:
+    model = CLIPModel.from_pretrained(dir_model)
+    processor = CLIPProcessor.from_pretrained(dir_model)

 fname_middle = None
 has_text_encoder = True
@@ -128,13 +146,13 @@ has_llava_projector = False
 if args.text_only:
    fname_middle = "text-"
    has_vision_encoder = False
-elif args.vision_only:
-    fname_middle = "vision-"
-    has_text_encoder = False
 elif args.llava_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_llava_projector = True
+elif args.vision_only:
+    fname_middle = "vision-"
+    has_text_encoder = False
 else:
    fname_middle = ""

@@ -182,8 +200,12 @@ if has_vision_encoder:
    block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)

-    image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
-    image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
+    if processor is not None:
+        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
+        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
+    else:
+        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
+        image_std = args.image_std if args.image_std is not None else default_image_std
    fout.add_array("clip.vision.image_mean", image_mean)
    fout.add_array("clip.vision.image_std", image_std)

--- a/examples/lookahead/README.md
+++ b/examples/lookahead/README.md
@@ -0,0 +1,7 @@
+# llama.cpp/examples/lookahead
+
+Demonstartion of lookahead decoding technique:
+
+https://lmsys.org/blog/2023-11-21-lookahead-decoding/
+
+More info: https://github.com/ggerganov/llama.cpp/pull/4207
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -100,6 +100,12 @@ static void sigint_handler(int signo) {
 }
 #endif

+static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    LOG_TEE("%s", text);
+}
+
 int main(int argc, char ** argv) {
    gpt_params params;
    g_params = &params;
@@ -113,6 +119,7 @@ int main(int argc, char ** argv) {
    log_set_target(log_filename_generator("main", "log"));
    LOG_TEE("Log start\n");
    log_dump_cmdline(argc, argv);
+    llama_log_set(llama_log_callback_logTee, nullptr);
 #endif // LOG_DISABLE_LOGS

    // TODO: Dump params ?
--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@@ -11,10 +11,10 @@ app = Flask(__name__)
 slot_id = -1

 parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
-parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
-parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: '\\nUSER: ')", default="\\nUSER: ")
-parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: '\\nASSISTANT: ')", default="\\nASSISTANT: ")
-parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: '\\nASSISTANT's RULE: ')", default="\\nASSISTANT's RULE: ")
+parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')
+parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: 'USER: ')", default="USER: ")
+parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: 'ASSISTANT: ')", default="ASSISTANT: ")
+parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: 'ASSISTANT's RULE: ')", default="ASSISTANT's RULE: ")
 parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '</s>')", default="</s>")
 parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)", default='http://127.0.0.1:8080')
 parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="")
@@ -34,19 +34,19 @@ def is_present(json, key):

 #convert chat to prompt
 def convert_chat(messages):
-    prompt = "" + args.chat_prompt.replace("\\n", "\n")

-    system_n = args.system_name.replace("\\n", "\n")
-    user_n = args.user_name.replace("\\n", "\n")
-    ai_n = args.ai_name.replace("\\n", "\n")
-    stop = args.stop.replace("\\n", "\n")
+    system_n = args.system_name
+    user_n = args.user_name
+    ai_n = args.ai_name
+    stop = args.stop

+    prompt = "" + args.chat_prompt + stop

    for line in messages:
        if (line["role"] == "system"):
-            prompt += f"{system_n}{line['content']}"
+            prompt += f"{system_n}{line['content']}{stop}"
        if (line["role"] == "user"):
-            prompt += f"{user_n}{line['content']}"
+            prompt += f"{user_n}{line['content']}{stop}"
        if (line["role"] == "assistant"):
            prompt += f"{ai_n}{line['content']}{stop}"
    prompt += ai_n.rstrip()
@@ -130,7 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
            }
        ]
    }
-    slot_id = data["slot_id"]
+    slot_id = data.get("slot_id")
    if (chat):
        if (start):
            resData["choices"][0]["delta"] =  {
@@ -150,11 +150,13 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
    return resData


-@app.route('/chat/completions', methods=['POST'])
-@app.route('/v1/chat/completions', methods=['POST'])
+@app.route('/chat/completions', methods=['POST', 'OPTIONS'])
+@app.route('/v1/chat/completions', methods=['POST', 'OPTIONS'])
 def chat_completions():
    if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
        return Response(status=403)
+    if request.method == 'OPTIONS':
+        return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
    body = request.get_json()
    stream = False
    tokenize = False
@@ -177,20 +179,22 @@ def chat_completions():
            data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True)
            time_now = int(time.time())
            resData = make_resData_stream({}, chat=True, time_now=time_now, start=True)
-            yield 'data: {}\n'.format(json.dumps(resData))
+            yield 'data: {}\n\n'.format(json.dumps(resData))
            for line in data.iter_lines():
                if line:
                    decoded_line = line.decode('utf-8')
                    resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now)
-                    yield 'data: {}\n'.format(json.dumps(resData))
-        return Response(generate(), mimetype='text/event-stream')
+                    yield 'data: {}\n\n'.format(json.dumps(resData))
+        return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})


-@app.route('/completions', methods=['POST'])
-@app.route('/v1/completions', methods=['POST'])
+@app.route('/completions', methods=['POST', 'OPTIONS'])
+@app.route('/v1/completions', methods=['POST', 'OPTIONS'])
 def completion():
    if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
        return Response(status=403)
+    if request.method == 'OPTIONS':
+        return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
    body = request.get_json()
    stream = False
    tokenize = False
@@ -216,8 +220,8 @@ def completion():
                if line:
                    decoded_line = line.decode('utf-8')
                    resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now)
-                    yield 'data: {}\n'.format(json.dumps(resData))
-        return Response(generate(), mimetype='text/event-stream')
+                    yield 'data: {}\n\n'.format(json.dumps(resData))
+        return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})

 if __name__ == '__main__':
    app.run(args.host, port=args.port)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -155,15 +155,23 @@ struct task_server {
    json data;
    bool infill_mode = false;
    bool embedding_mode = false;
+    int multitask_id = -1;
 };

 struct task_result {
    int id;
+    int multitask_id = -1;
    bool stop;
    bool error;
    json result_json;
 };

+struct task_multi {
+    int id;
+    std::set<int> subtasks_remaining{};
+    std::vector<task_result> results{};
+};
+
 // TODO: can become bool if we can't find use of more states
 enum slot_state
 {
@@ -406,6 +414,9 @@ struct llama_client_slot
    double t_prompt_processing; // ms
    double t_token_generation; // ms

+    // multitasks
+    int multitask_id = -1;
+
    void reset() {
        num_prompt_tokens      = 0;
        generated_text         = "";
@@ -529,7 +540,8 @@ struct llama_server_context

    std::vector<task_server> queue_tasks;
    std::vector<task_result> queue_results;
-    std::mutex mutex_tasks;
+    std::vector<task_multi>  queue_multitasks;
+    std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
    std::mutex mutex_results;

    ~llama_server_context()
@@ -1112,17 +1124,40 @@ struct llama_server_context
        return slot.images.size() > 0;
    }

-    void send_error(int id, std::string error)
+    void send_error(task_server& task, std::string error)
    {
        std::lock_guard<std::mutex> lock(mutex_results);
        task_result res;
-        res.id = id;
+        res.id = task.id;
+        res.multitask_id = task.multitask_id;
        res.stop = false;
        res.error = true;
        res.result_json = { { "content", error } };
        queue_results.push_back(res);
    }

+    void add_multi_task(int id, std::vector<int>& sub_ids)
+    {
+        std::lock_guard<std::mutex> lock(mutex_tasks);
+        task_multi multi;
+        multi.id = id;
+        std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
+        queue_multitasks.push_back(multi);
+    }
+
+    void update_multi_task(int multitask_id, int subtask_id, task_result& result)
+    {
+        std::lock_guard<std::mutex> lock(mutex_tasks);
+        for (auto& multitask : queue_multitasks)
+        {
+            if (multitask.id == multitask_id)
+            {
+                multitask.subtasks_remaining.erase(subtask_id);
+                multitask.results.push_back(result);
+            }
+        }
+    }
+
    json get_model_props()
    {
        return get_formated_generation(slots[0]);
@@ -1167,6 +1202,7 @@ struct llama_server_context
        std::lock_guard<std::mutex> lock(mutex_results);
        task_result res;
        res.id = slot.task_id;
+        res.multitask_id = slot.multitask_id;
        res.error = false;
        res.stop = false;

@@ -1206,6 +1242,7 @@ struct llama_server_context
        std::lock_guard<std::mutex> lock(mutex_results);
        task_result res;
        res.id = slot.task_id;
+        res.multitask_id = slot.multitask_id;
        res.error = false;
        res.stop = true;

@@ -1251,6 +1288,12 @@ struct llama_server_context
            res.result_json["model"] = slot.oaicompat_model;
        }

+        // parent multitask, if any, needs to be updated
+        if (slot.multitask_id != -1)
+        {
+            update_multi_task(slot.multitask_id, slot.task_id, res);
+        }
+
        queue_results.push_back(res);
    }

@@ -1259,6 +1302,7 @@ struct llama_server_context
        std::lock_guard<std::mutex> lock(mutex_results);
        task_result res;
        res.id = slot.task_id;
+        res.multitask_id = slot.multitask_id;
        res.error = false;
        res.stop = true;

@@ -1285,9 +1329,9 @@ struct llama_server_context
        queue_results.push_back(res);
    }

-    int request_completion(json data, bool infill, bool embedding)
+    int request_completion(json data, bool infill, bool embedding, int multitask_id)
    {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
+        std::unique_lock<std::mutex> lock(mutex_tasks);
        task_server task;
        task.id = id_gen++;
        task.target_id = 0;
@@ -1295,6 +1339,16 @@ struct llama_server_context
        task.infill_mode = infill;
        task.embedding_mode = embedding;
        task.type = COMPLETION_TASK;
+        task.multitask_id = multitask_id;
+
+        // when a completion task's prompt array is not a singleton, we split it into multiple requests
+        if (task.data.at("prompt").size() > 1)
+        {
+            lock.unlock(); // entering new func scope
+            return split_multiprompt_task(task);
+        }
+
+        // otherwise, it's a single-prompt task, we actually queue it
        queue_tasks.push_back(task);
        return task.id;
    }
@@ -1313,8 +1367,17 @@ struct llama_server_context

            for (int i = 0; i < (int) queue_results.size(); i++)
            {
+                // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
+                if (queue_results[i].multitask_id == task_id)
+                {
+                    update_multi_task(task_id, queue_results[i].id, queue_results[i]);
+                    queue_results.erase(queue_results.begin() + i);
+                    continue;
+                }
+
                if (queue_results[i].id == task_id)
                {
+                    assert(queue_results[i].multitask_id == -1);
                    task_result res = queue_results[i];
                    queue_results.erase(queue_results.begin() + i);
                    return res;
@@ -1404,6 +1467,27 @@ struct llama_server_context
        queue_tasks.push_back(task);
    }

+    int split_multiprompt_task(task_server& multiprompt_task)
+    {
+        auto prompt_count = multiprompt_task.data.at("prompt").size();
+        assert(prompt_count > 1);
+
+        int multitask_id = id_gen++;
+        std::vector<int> subtask_ids(prompt_count);
+        for (int i = 0; i < prompt_count; i++)
+        {
+            json subtask_data = multiprompt_task.data;
+            subtask_data["prompt"] = subtask_data["prompt"][i];
+
+            // subtasks inherit everything else (infill mode, embedding mode, etc.)
+            subtask_ids[i] = request_completion(subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
+        }
+
+        // queue up the multitask so we can track its subtask progression
+        add_multi_task(multitask_id, subtask_ids);
+        return multitask_id;
+    }
+
    void process_tasks()
    {
        std::lock_guard<std::mutex> lock(mutex_tasks);
@@ -1419,7 +1503,7 @@ struct llama_server_context
                    {
                        LOG_TEE("slot unavailable\n");
                        // send error result
-                        send_error(task.id, "slot unavailable");
+                        send_error(task, "slot unavailable");
                        return;
                    }

@@ -1433,11 +1517,12 @@ struct llama_server_context
                    slot->infill = task.infill_mode;
                    slot->embedding = task.embedding_mode;
                    slot->task_id = task.id;
+                    slot->multitask_id = task.multitask_id;

                    if (!launch_slot_with_data(slot, task.data))
                    {
                        // send error result
-                        send_error(task.id, "internal_error");
+                        send_error(task, "internal_error");
                        break;
                    }
                } break;
@@ -1453,6 +1538,38 @@ struct llama_server_context
                } break;
            }
        }
+
+        // remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue
+        auto queue_iterator = queue_multitasks.begin();
+        while (queue_iterator != queue_multitasks.end())
+        {
+            if (queue_iterator->subtasks_remaining.empty())
+            {
+                // all subtasks done == multitask is done
+                task_result aggregate_result;
+                aggregate_result.id = queue_iterator->id;
+                aggregate_result.stop = true;
+                aggregate_result.error = false;
+
+                // collect json results into one json result
+                std::vector<json> result_jsons;
+                for (auto& subres : queue_iterator->results)
+                {
+                    result_jsons.push_back(subres.result_json);
+                    aggregate_result.error = aggregate_result.error && subres.error;
+                }
+                aggregate_result.result_json = json{ "results", result_jsons };
+
+                std::lock_guard<std::mutex> lock(mutex_results);
+                queue_results.push_back(aggregate_result);
+
+                queue_iterator = queue_multitasks.erase(queue_iterator);
+            }
+            else
+            {
+                ++queue_iterator;
+            }
+        }
    }

    bool update_slots() {
@@ -1844,6 +1961,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("    -spf FNAME, --system-prompt-file FNAME\n");
    printf("                        Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
    printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA.\n");
+    printf("  --log-disable         disables logging to a file.\n");
    printf("\n");
 }

@@ -2198,6 +2316,11 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.mmproj = argv[i];
        }
+        else if (arg == "--log-disable")
+        {
+            log_set_target(stdout);
+            LOG_INFO("logging to file is disabled.", {});
+        }
        else
        {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -2596,7 +2719,7 @@ int main(int argc, char **argv)
    svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, false, false);
+                const int task_id = llama.request_completion(data, false, false, -1);
                if (!json_value(data, "stream", false)) {
                    std::string completion_text;
                    task_result result = llama.next_result(task_id);
@@ -2685,7 +2808,7 @@ int main(int argc, char **argv)
            {
                json data = oaicompat_completion_params_parse(json::parse(req.body));

-                const int task_id = llama.request_completion(data, false, false);
+                const int task_id = llama.request_completion(data, false, false, -1);

                if (!json_value(data, "stream", false)) {
                    std::string completion_text;
@@ -2754,7 +2877,7 @@ int main(int argc, char **argv)
    svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, true, false);
+                const int task_id = llama.request_completion(data, true, false, -1);
                if (!json_value(data, "stream", false)) {
                    std::string completion_text;
                    task_result result = llama.next_result(task_id);
@@ -2858,7 +2981,7 @@ int main(int argc, char **argv)
                {
                    prompt = "";
                }
-                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true);
+                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
                task_result result = llama.next_result(task_id);
                return res.set_content(result.result_json.dump(), "application/json");
            });
--- a/examples/speculative/README.md
+++ b/examples/speculative/README.md
@@ -0,0 +1,8 @@
+# llama.cpp/examples/speculative
+
+Demonstartion of speculative decoding and tree-based speculative decoding techniques
+
+More info:
+
+- https://github.com/ggerganov/llama.cpp/pull/2926
+- https://github.com/ggerganov/llama.cpp/pull/3624
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -1,20 +1,18 @@
+#include "ggml.h"
 #include "ggml-opencl.h"

 #include <array>
 #include <atomic>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
 #include <sstream>
 #include <vector>
-#include <limits>

 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>

-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "ggml.h"
-
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
--- a/ggml.c
+++ b/ggml.c
@@ -9373,7 +9373,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
    // TODO: find the optimal values for these
    if (ggml_is_contiguous(src0) &&
        ggml_is_contiguous(src1) &&
-        src0->type == GGML_TYPE_F32 &&
+      //src0->type == GGML_TYPE_F32 &&
        src1->type == GGML_TYPE_F32 &&
        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {

--- a/ggml.h
+++ b/ggml.h
@@ -244,11 +244,10 @@
 #define GGML_ASSERT(x) \
    do { \
        if (!(x)) { \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            fflush(stderr); \
            fflush(stdout); \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
            ggml_print_backtrace(); \
-            exit(1); \
+            abort(); \
        } \
    } while (0)

--- a/llama.cpp
+++ b/llama.cpp
@@ -46,7 +46,6 @@
    #endif
    #include <windows.h>
    #include <io.h>
-    #include <stdio.h> // for _fseeki64
 #endif

 #include <algorithm>
@@ -2645,15 +2644,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    }

    // general kv
-    LLAMA_LOG_INFO("%s: general.name   = %s\n",    __func__, model.name.c_str());
+    LLAMA_LOG_INFO("%s: general.name     = %s\n",    __func__, model.name.c_str());

    // special tokens
-    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
-    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
-    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
-    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
-    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
-    if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token  = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() );    }
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() );    }
 }

 static void llm_load_tensors(
@@ -5550,18 +5549,8 @@ static int llama_decode_internal(
        n_threads = std::min(4, n_threads);
    }

-    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
-    const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA      ||
-        model.arch == LLM_ARCH_BAICHUAN   ||
-        model.arch == LLM_ARCH_FALCON     ||
-        model.arch == LLM_ARCH_REFACT     ||
-        model.arch == LLM_ARCH_MPT        ||
-        model.arch == LLM_ARCH_STARCODER  ||
-        model.arch == LLM_ARCH_STABLELM;
-
    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
-    if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+    if (ggml_cpu_has_cublas() && fully_offloaded) {
        n_threads = 1;
    }

@@ -7037,6 +7026,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
    // Replace the data in candidates with the new_candidates data
    std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
    candidates->size = new_candidates.size();
+    candidates->sorted = false;

    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
--- a/scripts/build-info.cmake
+++ b/scripts/build-info.cmake
@@ -1,5 +1,3 @@
-set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
-set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
 set(BUILD_NUMBER 0)
 set(BUILD_COMMIT "unknown")
 set(BUILD_COMPILER "unknown")
@@ -58,23 +56,3 @@ else()
    )
    set(BUILD_TARGET ${OUT})
 endif()
-
-# Only write the build info if it changed
-if(EXISTS ${OUTPUT_FILE})
-    file(READ ${OUTPUT_FILE} CONTENTS)
-    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_COMMIT ${CMAKE_MATCH_1})
-    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_COMPILER ${CMAKE_MATCH_1})
-    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_TARGET ${CMAKE_MATCH_1})
-    if (
-        NOT OLD_COMMIT   STREQUAL BUILD_COMMIT   OR
-        NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
-        NOT OLD_TARGET   STREQUAL BUILD_TARGET
-    )
-        configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
-    endif()
-else()
-    configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
-endif()
--- a/scripts/gen-build-info-cpp.cmake
+++ b/scripts/gen-build-info-cpp.cmake
@@ -0,0 +1,24 @@
+include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
+
+set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
+set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
+
+# Only write the build info if it changed
+if(EXISTS ${OUTPUT_FILE})
+    file(READ ${OUTPUT_FILE} CONTENTS)
+    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
+    set(OLD_COMMIT ${CMAKE_MATCH_1})
+    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
+    set(OLD_COMPILER ${CMAKE_MATCH_1})
+    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
+    set(OLD_TARGET ${CMAKE_MATCH_1})
+    if (
+        NOT OLD_COMMIT   STREQUAL BUILD_COMMIT   OR
+        NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
+        NOT OLD_TARGET   STREQUAL BUILD_TARGET
+    )
+        configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
+    endif()
+else()
+    configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
+endif()
Author	SHA1	Message	Date
Ziad Ben Hadj-Alouane	1d144112c0	server : add --log-disable to disable logging to file (#4260 ) * * add --log-disable to disable logging to file in the server example * * typo fix	2023-12-01 00:25:49 +02:00
Ziad Ben Hadj-Alouane	f43f09366d	server : add single-client multi-prompt support (#4232 ) * * add multiprompt support * * cleanup * * more cleanup * * remove atomicity of id_gen, and change lock_guard to unique_lock on completion requests * * remove all references to mutex_multitasks * Update examples/server/server.cpp Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com> * Update examples/server/server.cpp Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com> * Update examples/server/server.cpp Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com> * Update examples/server/server.cpp Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com> * * change to set --------- Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>	2023-12-01 00:25:04 +02:00
WillCorticesAI	d2809a3ba2	make : fix Apple clang determination bug (#4272 ) Co-authored-by: Will Findley <findley@gmail.com>	2023-12-01 00:23:44 +02:00
Jared Van Bortel	15f5d96037	build : fix build info generation and cleanup Makefile (#3920 ) * cmake : fix joining of REAL_GIT_DIR * fix includes with help from include-what-you-use * make : remove unneeded deps and add test-rope target * fix C includes in C++ source files * Revert "fix includes with help from include-what-you-use" This reverts commit `635e9fadfd`.	2023-12-01 00:23:08 +02:00
John	33c9892af5	llava : ShareGPT4V compatibility (vision encoder only loading) (#4172 ) * ShareGPT4 compatibility (vision encoder only loading) Load only a CLIP vision encoder (as supplied by ShareGPT finetunes) Corrects the argument parsing for --img_mean and --img_std (which were previously not parsed but attempted to access) Defines defaults for img_mean and img_std which are equal to the llava 1.5 CLIP encoder, so you do not have to provide them * Update convert-image-encoder-to-gguf.py	2023-11-30 23:11:14 +01:00
Andrew Godfrey	8efa0f6ebe	main : pass LOG_TEE callback to llama.cpp log (#4033 ) * main : Call llama_log_set to use LOG_TEE * tabs to spaces	2023-11-30 23:56:19 +02:00
vodkaslime	524907aa76	readme : fix (#4135 ) * fix: readme * chore: resolve comments * chore: resolve comments	2023-11-30 23:49:21 +02:00
Juraj Bednar	3bd2c7ce1b	docker : add finetune option (#4211 )	2023-11-30 23:46:01 +02:00
Miwa / Ensan	bde629bb53	batched.swift : update README.md (#4214 ) docs: update how to run	2023-11-30 23:45:17 +02:00
Li Tan	f7f9e06212	cmake : fix the metal file foder path (#4217 )	2023-11-30 23:44:11 +02:00
Dawid Wysocki	74daabae69	readme : fix typo (#4253 ) llama.cpp uses GitHub Actions, not Gitlab Actions.	2023-11-30 23:43:32 +02:00
Daniel Bevenius	b18c66ca6e	llama : fix alignment of general.name in print meta (#4254 ) * llama: fix alignment of general.name in print meta This commit fixes the alignment of the general.name field in the llm_load_print_meta function. Currently the output looks like this: ```console llm_load_print_meta: model ftype = mostly Q4_0 llm_load_print_meta: model params = 13.02 B llm_load_print_meta: model size = 6.86 GiB (4.53 BPW) llm_load_print_meta: general.name = LLaMA v2 ``` And with this commit it looks like this: ```console llm_load_print_meta: model ftype = mostly Q4_0 llm_load_print_meta: model params = 13.02 B llm_load_print_meta: model size = 6.86 GiB (4.53 BPW) llm_load_print_meta: general.name = LLaMA v2 ``` Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com> * llama: fix alignment of special tokens Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com> --------- Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>	2023-11-30 23:43:08 +02:00
slaren	f4d973cecb	convert.py : fix llama/llama2 conversion due to vocab_size=-1 (#4258 )	2023-11-30 23:42:23 +02:00
tarcey	954e22858c	llama : fix typical sampling (#4261 ) Typical sampling was broken because after copying new_candidates into canditates, the "sorted" bool is left at "true", but the new data is no longer sorted according to probability. Patch to set "sorted" to false. Test: Generating with temp=0.0001 (approx. argmax) should generate the same sequence at typical>=1.0 and typical=0.9999 (approx. disabled, but enters the typical sampling codepath).	2023-11-30 23:40:23 +02:00
rhjdvsgsgks	e2bd725f4b	py : fix oai proxy (#3972 ) * fix oai proxy fix generation not stoped while bot stop talking in chat mode fix possible `slot_id` not exist response for cors (and pre flight) * oai proxy: workaround for some client (such as Chatbox) * use stop as separator to replace hardcoded `\n`	2023-11-30 22:50:40 +02:00
Georgi Gerganov	1f5cd83275	examples : add readme files	2023-11-29 11:00:17 +02:00
Peter Sugihara	4fea3420ee	readme : add FreeChat (#4248 )	2023-11-29 09:16:34 +02:00
Jared Van Bortel	64e64aa255	ggml : restore abort() in GGML_ASSERT (#4242 )	2023-11-28 11:51:11 +02:00
Georgi Gerganov	8406b0924b	ggml : re-enable BLAS for CPU when src0 != F32 + remove redundant full offload checks in llama.cpp (#4240 ) * ggml : use blas even if src0 is not F32 * llama : use n_threads_batch only when n_tokens >= 32 ggml-ci * llama : revert n_threads_batch logic ggml-ci	2023-11-28 10:32:03 +02:00
bandoti	b38a16dfcf	cmake : fix issue with version info not getting baked into LlamaConfig.cmake (#3970 ) * Split CPP generation from build-info query * Remove blank lines * Add BUILD_SHARED_LIBS option	2023-11-27 21:25:42 +02:00