common : add --version option to show build info in CLI (#4433 )

readme : update hot topics
llama : add Mixtral support (#4406 )
2026-05-05 00:24:07 +00:00 · 2023-12-13 14:50:14 +02:00 · 2023-12-13 14:05:38 +02:00 · 2023-12-13 14:04:25 +02:00 · 2023-12-12 12:12:35 +02:00 · 2023-12-12 11:53:36 +02:00
58 changed files with 8189 additions and 1758 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -143,6 +143,9 @@ jobs:
          cd build
          ctest --verbose

+  # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+  #       how to debug it.
+  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
  macOS-latest-make:
    runs-on: macos-latest

@@ -160,14 +163,18 @@ jobs:
      - name: Build
        id: make_build
        run: |
-          make -j $(sysctl -n hw.logicalcpu)
+          LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: make_test
        run: |
-          make tests -j $(sysctl -n hw.logicalcpu)
-          make test -j $(sysctl -n hw.logicalcpu)
+          LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
+          LLAMA_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)

+  # TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+  #       how to debug it.
+  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
+  #       would be great if we fix these
  macOS-latest-cmake:
    runs-on: macos-latest

@@ -188,7 +195,7 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake ..
+          cmake -DLLAMA_METAL=OFF ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
--- a/.gitignore
+++ b/.gitignore
@@ -101,3 +101,4 @@ poetry.toml
 /tests/test-tokenizer-1-llama
 /tests/test-tokenizer-1-bpe
 /tests/test-rope
+/tests/test-backend-ops
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,9 +97,9 @@ option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)

-option(LLAMA_BUILD_TESTS                "llama: build tests"    ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES             "llama: build examples" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER               "llama: build server example"                           ON)
+option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)

 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
@@ -593,6 +593,11 @@ else()
    message(STATUS "Unknown architecture")
 endif()

+if (MINGW)
+    # Target Windows 8 for PrefetchVirtualMemory
+    add_compile_definitions(_WIN32_WINNT=0x602)
+endif()
+
 #
 # POSIX conformance
 #
@@ -662,11 +667,11 @@ add_library(ggml OBJECT
            ggml-backend.h
            ggml-quants.c
            ggml-quants.h
-            ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_CUDA}   ${GGML_HEADERS_CUDA}
            ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
-            ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
-            ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_METAL}  ${GGML_HEADERS_METAL}
+            ${GGML_SOURCES_MPI}    ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_EXTRA}  ${GGML_HEADERS_EXTRA}
            )

 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
--- a/28
+++ b/28
@@ -8,7 +8,8 @@ BUILD_TARGETS = \
 TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope
+	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
+	tests/test-backend-ops

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -305,12 +306,15 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	#MK_CXXFLAGS += -mssse3
 endif

-# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
-# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
-# https://github.com/ggerganov/llama.cpp/issues/2922
 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
+	# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
+	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
+	# https://github.com/ggerganov/llama.cpp/issues/2922
 	MK_CFLAGS   += -Xassembler -muse-unaligned-vector-move
 	MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
+
+	# Target Windows 8 for PrefetchVirtualMemory
+	MK_CPPFLAGS += -D_WIN32_WINNT=0x602
 endif

 ifneq ($(filter aarch64%,$(UNAME_M)),)
@@ -395,6 +399,11 @@ ifdef LLAMA_CUBLAS
 	MK_LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS         += ggml-cuda.o
 	NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
+
+ifdef LLAMA_DEBUG
+	NVCCFLAGS += -lineinfo
+endif
+
 ifdef LLAMA_CUDA_NVCC
 	NVCC = $(LLAMA_CUDA_NVCC)
 else
@@ -729,16 +738,16 @@ tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
 tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
@@ -746,3 +755,6 @@ tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)

 tests/test-c.o: tests/test-c.c llama.h
 	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
+
+tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
--- a/Package.swift
+++ b/Package.swift
@@ -2,33 +2,14 @@

 import PackageDescription

-#if arch(arm) || arch(arm64)
-let platforms: [SupportedPlatform]? = [
-    .macOS(.v12),
-    .iOS(.v14),
-    .watchOS(.v4),
-    .tvOS(.v14)
-]
-let exclude: [String] = []
-let resources: [Resource] = [
-    .process("ggml-metal.metal")
-]
-let additionalSources: [String] = ["ggml-metal.m"]
-let additionalSettings: [CSetting] = [
-    .unsafeFlags(["-fno-objc-arc"]),
-    .define("GGML_USE_METAL")
-]
-#else
-let platforms: [SupportedPlatform]? = nil
-let exclude: [String] = ["ggml-metal.metal"]
-let resources: [Resource] = []
-let additionalSources: [String] = []
-let additionalSettings: [CSetting] = []
-#endif
-
 let package = Package(
    name: "llama",
-    platforms: platforms,
+    platforms: [
+        .macOS(.v12),
+        .iOS(.v14),
+        .watchOS(.v4),
+        .tvOS(.v14)
+    ],
    products: [
        .library(name: "llama", targets: ["llama"]),
    ],
@@ -36,25 +17,30 @@ let package = Package(
        .target(
            name: "llama",
            path: ".",
-            exclude: exclude,
+            exclude: [],
            sources: [
                "ggml.c",
                "llama.cpp",
                "ggml-alloc.c",
                "ggml-backend.c",
                "ggml-quants.c",
-            ] + additionalSources,
-            resources: resources,
+                "ggml-metal.m",
+            ],
+            resources: [
+                .process("ggml-metal.metal")
+            ],
            publicHeadersPath: "spm-headers",
            cSettings: [
                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE")
+                .define("GGML_USE_ACCELERATE"),
+                .unsafeFlags(["-fno-objc-arc"]),
+                .define("GGML_USE_METAL"),
                // NOTE: NEW_LAPACK will required iOS version 16.4+
                // We should consider add this in the future when we drop support for iOS 14
                // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
                // .define("ACCELERATE_NEW_LAPACK"),
                // .define("ACCELERATE_LAPACK_ILP64")
-            ] + additionalSettings,
+            ],
            linkerSettings: [
                .linkedFramework("Accelerate")
            ]
--- a/README.md
+++ b/README.md
@@ -10,6 +10,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 ### Hot topics

+- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
+- **llama.h API change for handling KV cache offloading and data type: https://github.com/ggerganov/llama.cpp/pull/4309**
 - Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
 - Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -278,8 +278,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.yarn_beta_slow = std::stof(argv[i]);
-        } else if (arg == "--memory-f32") {
-            params.memory_f16 = false;
+        } else if (arg == "--samplers") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.samplers_sequence = parse_samplers_input(argv[i]);
+        } else if (arg == "--sampling-seq") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.samplers_sequence = argv[i];
        } else if (arg == "--top-p") {
            if (++i >= argc) {
                invalid_param = true;
@@ -498,6 +508,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            params.infill = true;
        } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
            params.dump_kv_cache = true;
+        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
+            params.no_kv_offload = true;
+        } else if (arg == "-ctk" || arg == "--cache-type-k") {
+            params.cache_type_k = argv[++i];
+        } else if (arg == "-ctv" || arg == "--cache-type-v") {
+            params.cache_type_v = argv[++i];
        } else if (arg == "--multiline-input") {
            params.multiline_input = true;
        } else if (arg == "--simple-io") {
@@ -640,6 +656,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        } else if (arg == "-h" || arg == "--help") {
            return false;

+        } else if (arg == "--version") {
+            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
        } else if (arg == "--in-prefix-bos") {
@@ -678,6 +698,47 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                std::istreambuf_iterator<char>(),
                std::back_inserter(sparams.grammar)
            );
+        } else if (arg == "--override-kv") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            char * sep = strchr(argv[i], '=');
+            if (sep == nullptr || sep - argv[i] >= 128) {
+                fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            struct llama_model_kv_override kvo;
+            std::strncpy(kvo.key, argv[i], sep - argv[i]);
+            kvo.key[sep - argv[i]] = 0;
+            sep++;
+            if (strncmp(sep, "int:", 4) == 0) {
+                sep += 4;
+                kvo.tag = LLAMA_KV_OVERRIDE_INT;
+                kvo.int_value = std::atol(sep);
+            } else if (strncmp(sep, "float:", 6) == 0) {
+                sep += 6;
+                kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
+                kvo.float_value = std::atof(sep);
+            } else if (strncmp(sep, "bool:", 5) == 0) {
+                sep += 5;
+                kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
+                if (std::strcmp(sep, "true") == 0) {
+                    kvo.bool_value = true;
+                } else if (std::strcmp(sep, "false") == 0) {
+                    kvo.bool_value = false;
+                } else {
+                    fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
+                    invalid_param = true;
+                    break;
+                }
+            } else {
+                fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            params.kv_overrides.push_back(kvo);
 #ifndef LOG_DISABLE_LOGS
        // Parse args for logging parameters
        } else if ( log_param_single_parse( argv[i] ) ) {
@@ -721,6 +782,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        }
    }

+    if (!params.kv_overrides.empty()) {
+        params.kv_overrides.emplace_back(llama_model_kv_override());
+        params.kv_overrides.back().key[0] = 0;
+    }
+
    return true;
 }

@@ -732,6 +798,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("\n");
    printf("options:\n");
    printf("  -h, --help            show this help message and exit\n");
+    printf("      --version         show version and build info\n");
    printf("  -i, --interactive     run in interactive mode\n");
    printf("  --interactive-first   run in interactive mode and wait for input right away\n");
    printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
@@ -761,6 +828,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n");
+    printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str());
    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
    printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
@@ -798,8 +867,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --yarn-beta-fast N    YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
    printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
    printf("  --no-penalize-nl      do not penalize newline token\n");
-    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
    printf("  --temp N              temperature (default: %.1f)\n", (double)sparams.temp);
    printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
    printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
@@ -840,6 +907,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --verbose-prompt      print prompt before generation\n");
    printf("  -dkvc, --dump-kv-cache\n");
    printf("                        verbose print of the KV cache\n");
+    printf("  -nkvo, --no-kv-offload\n");
+    printf("                        disable KV offload\n");
+    printf("  -ctk TYPE, --cache-type-k TYPE\n");
+    printf("                        KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
+    printf("  -ctv TYPE, --cache-type-v TYPE\n");
+    printf("                        KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
    printf("  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
    printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
    printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
@@ -850,6 +923,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        draft model for speculative decoding (default: %s)\n", params.model.c_str());
    printf("  -ld LOGDIR, --logdir LOGDIR\n");
    printf("                        path under which to save YAML logs (no logging if unset)\n");
+    printf("  --override-kv KEY=TYPE:VALUE\n");
+    printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
+    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
    printf("\n");
 #ifndef LOG_DISABLE_LOGS
    log_print_usage();
@@ -886,6 +962,48 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
    GGML_UNREACHABLE();
 }

+//
+// String parsing
+//
+
+std::string parse_samplers_input(std::string input) {
+    std::string output = "";
+    // since samplers names are written multiple ways
+    // make it ready for both system names and input names
+    std::unordered_map<std::string, char> samplers_symbols {
+        {"top_k",      'k'},
+        {"top-k",      'k'},
+        {"top_p",      'p'},
+        {"top-p",      'p'},
+        {"nucleus",    'p'},
+        {"typical_p",  'y'},
+        {"typical-p",  'y'},
+        {"typical",    'y'},
+        {"min_p",      'm'},
+        {"min-p",      'm'},
+        {"tfs_z",      'f'},
+        {"tfs-z",      'f'},
+        {"tfs",        'f'},
+        {"temp",       't'},
+        {"temperature",'t'}
+    };
+    // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
+    size_t separator = input.find(';');
+    while (separator != input.npos) {
+        std::string name = input.substr(0,separator);
+        input = input.substr(separator+1);
+        separator = input.find(';');
+
+        if (samplers_symbols.find(name) != samplers_symbols.end()) {
+            output += samplers_symbols[name];
+        }
+    }
+    if (samplers_symbols.find(input) != samplers_symbols.end()) {
+        output += samplers_symbols[input];
+    }
+    return output;
+}
+
 //
 // Model utils
 //
@@ -900,10 +1018,39 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
    mparams.use_mlock       = params.use_mlock;
+    if (params.kv_overrides.empty()) {
+        mparams.kv_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
+        mparams.kv_overrides = params.kv_overrides.data();
+    }

    return mparams;
 }

+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    if (s == "f16") {
+        return GGML_TYPE_F16;
+    }
+    if (s == "q8_0") {
+        return GGML_TYPE_Q8_0;
+    }
+    if (s == "q4_0") {
+        return GGML_TYPE_Q4_0;
+    }
+    if (s == "q4_1") {
+        return GGML_TYPE_Q4_1;
+    }
+    if (s == "q5_0") {
+        return GGML_TYPE_Q5_0;
+    }
+    if (s == "q5_1") {
+        return GGML_TYPE_Q5_1;
+    }
+
+    throw std::runtime_error("Invalid cache type: " + s);
+}
+
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
    auto cparams = llama_context_default_params();

@@ -913,7 +1060,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
    cparams.mul_mat_q         = params.mul_mat_q;
    cparams.seed              = params.seed;
-    cparams.f16_kv            = params.memory_f16;
    cparams.logits_all        = params.logits_all;
    cparams.embedding         = params.embedding;
    cparams.rope_scaling_type = params.rope_scaling_type;
@@ -924,6 +1070,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.yarn_beta_fast    = params.yarn_beta_fast;
    cparams.yarn_beta_slow    = params.yarn_beta_slow;
    cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
+    cparams.offload_kqv       = !params.no_kv_offload;
+
+    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
+    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);

    return cparams;
 }
@@ -1336,7 +1486,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    }
    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-    fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
--- a/common/common.h
+++ b/common/common.h
@@ -86,6 +86,8 @@ struct gpt_params {
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string logdir            = "";  // directory in which to save YAML log files

+    std::vector<llama_model_kv_override> kv_overrides;
+
    // TODO: avoid tuple, use struct
    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
    std::string lora_base  = "";                              // base model path for the lora adapter
@@ -98,7 +100,6 @@ struct gpt_params {
    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score

    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
-    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode
@@ -123,10 +124,14 @@ struct gpt_params {
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool infill            = false; // use infill mode
    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
+    bool no_kv_offload     = false; // disable KV offloading
+
+    std::string cache_type_k = "f16"; // KV cache data type for the K
+    std::string cache_type_v = "f16"; // KV cache data type for the V

    // multimodal models (see examples/llava)
    std::string mmproj = ""; // path to multimodal projector
-    std::string image = ""; // path to an image file
+    std::string image  = ""; // path to an image file
 };

 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
@@ -141,6 +146,12 @@ std::string gpt_random_prompt(std::mt19937 & rng);

 void process_escapes(std::string& input);

+//
+// String parsing
+//
+
+std::string parse_samplers_input(std::string input);
+
 //
 // Model utils
 //
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -190,7 +190,7 @@ namespace grammar_parser {
                pos = parse_space(pos + 1, is_nested);
            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
                if (last_sym_start == out_elements.size()) {
-                    throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
+                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
                }

                // apply transformation to previous symbol (last_sym_start to end) according to
--- a/common/log.h
+++ b/common/log.h
@@ -61,13 +61,13 @@
 //  #define LOG_TARGET stderr
 //  #include "log.h"
 //
-//  The log target can also be redirected to a diffrent function
+//  The log target can also be redirected to a different function
 //  like so:
 //
-//  #define LOG_TARGET log_handler_diffrent()
+//  #define LOG_TARGET log_handler_different()
 //  #include "log.h"
 //
-//  FILE* log_handler_diffrent()
+//  FILE* log_handler_different()
 //  {
 //      return stderr;
 //  }
@@ -421,7 +421,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS

 // Disables logs entirely at runtime.
 //  Makes LOG() and LOG_TEE() produce no output,
-//  untill enabled back.
+//  until enabled back.
 #define log_disable() log_disable_impl()

 // INTERNAL, DO NOT USE
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -99,6 +99,56 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
    return std::string(result);
 }

+std::string llama_sampling_order_print(const llama_sampling_params & params) {
+    std::string result = "CFG -> Penalties ";
+    if (params.mirostat == 0) {
+        for (auto s : params.samplers_sequence) {
+            switch (s) {
+                case 'k': result += "-> top_k "; break;
+                case 'f': result += "-> tfs_z "; break;
+                case 'y': result += "-> typical_p "; break;
+                case 'p': result += "-> top_p "; break;
+                case 'm': result += "-> min_p "; break;
+                case 't': result += "-> temp "; break;
+                default : break;
+            }
+        }
+    } else {
+        result += "-> mirostat ";
+    }
+
+    return result;
+}
+
+// no reasons to expose this function in header
+static void sampler_queue(
+                   struct llama_context * ctx_main,
+            const llama_sampling_params & params,
+                 llama_token_data_array & cur_p,
+                                 size_t & min_keep) {
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
+
+    const float         temp              = params.temp;
+    const int32_t       top_k             = params.top_k <= 0 ? n_vocab : params.top_k;
+    const float         top_p             = params.top_p;
+    const float         min_p             = params.min_p;
+    const float         tfs_z             = params.tfs_z;
+    const float         typical_p         = params.typical_p;
+    const std::string & samplers_sequence = params.samplers_sequence;
+
+    for (auto s : samplers_sequence) {
+        switch (s){
+            case 'k': llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
+            case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
+            case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
+            case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
+            case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
+            case 't': llama_sample_temp     (ctx_main, &cur_p, temp); break;
+            default : break;
+        }
+    }
+}
+
 llama_token llama_sampling_sample(
                  struct llama_sampling_context * ctx_sampling,
                  struct llama_context * ctx_main,
@@ -109,11 +159,6 @@ llama_token llama_sampling_sample(
    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));

    const float   temp            = params.temp;
-    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
-    const float   top_p           = params.top_p;
-    const float   min_p           = params.min_p;
-    const float   tfs_z           = params.tfs_z;
-    const float   typical_p       = params.typical_p;
    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
    const float   penalty_repeat  = params.penalty_repeat;
    const float   penalty_freq    = params.penalty_freq;
@@ -188,12 +233,7 @@ llama_token llama_sampling_sample(
            // temperature sampling
            size_t min_keep = std::max(1, params.n_probs);

-            llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep);
-            llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep);
-            llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep);
-            llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep);
-            llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep);
-            llama_sample_temp     (ctx_main, &cur_p, temp);
+            sampler_queue(ctx_main, params, cur_p, min_keep);

            id = llama_sample_token(ctx_main, &cur_p);

--- a/common/sampling.h
+++ b/common/sampling.h
@@ -10,22 +10,23 @@

 // sampling parameters
 typedef struct llama_sampling_params {
-    int32_t n_prev            = 64;    // number of previous tokens to remember
-    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t top_k             = 40;    // <= 0 to use vocab size
-    float   top_p             = 0.95f; // 1.0 = disabled
-    float   min_p             = 0.05f; // 0.0 = disabled
-    float   tfs_z             = 1.00f; // 1.0 = disabled
-    float   typical_p         = 1.00f; // 1.0 = disabled
-    float   temp              = 0.80f; // 1.0 = disabled
-    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat    = 1.10f; // 1.0 = disabled
-    float   penalty_freq      = 0.00f; // 0.0 = disabled
-    float   penalty_present   = 0.00f; // 0.0 = disabled
-    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau      = 5.00f; // target entropy
-    float   mirostat_eta      = 0.10f; // learning rate
-    bool    penalize_nl       = true;  // consider newlines as a repeatable token
+    int32_t     n_prev                = 64;       // number of previous tokens to remember
+    int32_t     n_probs               = 0;        // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t     top_k                 = 40;       // <= 0 to use vocab size
+    float       top_p                 = 0.95f;    // 1.0 = disabled
+    float       min_p                 = 0.05f;    // 0.0 = disabled
+    float       tfs_z                 = 1.00f;    // 1.0 = disabled
+    float       typical_p             = 1.00f;    // 1.0 = disabled
+    float       temp                  = 0.80f;    // 1.0 = disabled
+    int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float       penalty_repeat        = 1.10f;    // 1.0 = disabled
+    float       penalty_freq          = 0.00f;    // 0.0 = disabled
+    float       penalty_present       = 0.00f;    // 0.0 = disabled
+    int32_t     mirostat              = 0;        // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float       mirostat_tau          = 5.00f;    // target entropy
+    float       mirostat_eta          = 0.10f;    // learning rate
+    bool        penalize_nl           = true;     // consider newlines as a repeatable token
+    std::string samplers_sequence     = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp

    std::string grammar;  // optional BNF-like grammar to constrain sampling

@@ -80,6 +81,9 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama
 // Print sampling parameters into a string
 std::string llama_sampling_print(const llama_sampling_params & params);

+// Print sampling order into a string
+std::string llama_sampling_order_print(const llama_sampling_params & params);
+
 // this is a common sampling function used across the examples for convenience
 // it can serve as a starting point for implementing your own sampling function
 // Note: When using multiple sequences, it is the caller's responsibility to call
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -77,8 +77,18 @@ class Model:
            self.gguf_writer.add_embedding_length(n_embd)
        if (n_ff := self.hparams.get("intermediate_size")) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
-        if (n_head := self.hparams.get("num_attention_head")) is not None:
+        if (n_head := self.hparams.get("num_attention_heads")) is not None:
            self.gguf_writer.add_head_count(n_head)
+        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
+            self.gguf_writer.add_head_count_kv(n_head_kv)
+
+        if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
+            self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
+        if (n_experts := self.hparams.get("num_local_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)
+
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))

    def write_tensors(self):
@@ -170,6 +180,8 @@ class Model:
            return StableLMModel
        if model_architecture == "QWenLMHeadModel":
            return QwenModel
+        if model_architecture == "MixtralForCausalLM":
+            return MixtralModel
        return Model

    def _is_model_safetensors(self) -> bool:
@@ -207,6 +219,8 @@ class Model:
            return gguf.MODEL_ARCH.STABLELM
        if arch == "QWenLMHeadModel":
            return gguf.MODEL_ARCH.QWEN
+        if arch == "MixtralForCausalLM":
+            return gguf.MODEL_ARCH.LLAMA

        raise NotImplementedError(f'Architecture "{arch}" not supported!')

@@ -837,6 +851,11 @@ class StableLMModel(Model):
        self.gguf_writer.add_layer_norm_eps(1e-5)


+class MixtralModel(Model):
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+
 class QwenModel(Model):
    @staticmethod
    def token_bytes_to_string(b):
--- a/convert.py
+++ b/convert.py
@@ -42,6 +42,7 @@ NDArray: TypeAlias = 'np.ndarray[Any, Any]'
 ARCH = gguf.MODEL_ARCH.LLAMA

 DEFAULT_CONCURRENCY = 8
+
 #
 # data types
 #
@@ -62,10 +63,10 @@ class UnquantizedDataType(DataType):
    pass


-DT_F16  = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
-DT_F32  = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
-DT_I32  = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
-DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
+DT_F16  = UnquantizedDataType('F16',  dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
+DT_F32  = UnquantizedDataType('F32',  dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
+DT_I32  = UnquantizedDataType('I32',  dtype = np.dtype(np.int16),   valid_conversions = [])
+DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16),  valid_conversions = ['F32', 'F16', 'Q8_0'])


@dataclass(frozen=True)
@@ -151,14 +152,16 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {

@dataclass
 class Params:
-    n_vocab:    int
-    n_embd:     int
-    n_layer:    int
-    n_ctx:      int
-    n_ff:       int
-    n_head:     int
-    n_head_kv:  int
-    f_norm_eps: float
+    n_vocab:        int
+    n_embd:         int
+    n_layer:        int
+    n_ctx:          int
+    n_ff:           int
+    n_head:         int
+    n_head_kv:      int
+    n_experts:      int | None = None
+    n_experts_used: int | None = None
+    f_norm_eps:     float | None = None

    rope_scaling_type: gguf.RopeScalingType | None = None
    f_rope_freq_base: float | None = None
@@ -233,6 +236,13 @@ class Params:
            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")

+        n_experts      = None
+        n_experts_used = None
+
+        if "num_local_experts" in config:
+            n_experts = config["num_local_experts"]
+            n_experts_used = config["num_experts_per_tok"]
+
        return Params(
            n_vocab           = config["vocab_size"],
            n_embd            = config["hidden_size"],
@@ -241,6 +251,8 @@ class Params:
            n_ff              = config["intermediate_size"],
            n_head            = (n_head := config["num_attention_heads"]),
            n_head_kv         = config.get("num_key_value_heads", n_head),
+            n_experts         = n_experts,
+            n_experts_used    = n_experts_used,
            f_norm_eps        = config["rms_norm_eps"],
            f_rope_freq_base  = config.get("rope_theta"),
            rope_scaling_type = rope_scaling_type,
@@ -255,8 +267,15 @@ class Params:
    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
        config = json.load(open(config_path))

+        n_experts      = None
+        n_experts_used = None
+        f_rope_freq_base = None
+
        # hack to determine LLaMA v1 vs v2 vs CodeLlama
-        if config.get("rope_theta") == 1000000:
+        if config.get("moe"):
+            # Mixtral
+            n_ctx = 32768
+        elif config.get("rope_theta") == 1000000:
            # CodeLlama
            n_ctx = 16384
        elif config["norm_eps"] == 1e-05:
@@ -266,16 +285,27 @@ class Params:
            # LLaMA v1
            n_ctx = 2048

+        if "layers.0.feed_forward.w1.weight" in model:
+            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
+
+        if config.get("moe"):
+            n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
+            n_experts      = config["moe"]["num_experts"]
+            n_experts_used = config["moe"]["num_experts_per_tok"]
+            f_rope_freq_base = 1e6
+
        return Params(
            n_vocab          = model["tok_embeddings.weight"].shape[0],
            n_embd           = config["dim"],
            n_layer          = config["n_layers"],
            n_ctx            = n_ctx,
-            n_ff             = model["layers.0.feed_forward.w1.weight"].shape[0],
+            n_ff             = n_ff,
            n_head           = (n_head := config["n_heads"]),
            n_head_kv        = config.get("n_kv_heads", n_head),
+            n_experts        = n_experts,
+            n_experts_used   = n_experts_used,
            f_norm_eps       = config["norm_eps"],
-            f_rope_freq_base = config.get("rope_theta"),
+            f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
        )

    @staticmethod
@@ -585,7 +615,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:

    if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
        # Transformers models put different tensors in different files, but
-        # don't split indivdual tensors between files.
+        # don't split individual tensors between files.
        model: LazyModel = {}
        for mp in models_plus:
            model.update(mp.model)
@@ -678,7 +708,7 @@ class LazyUnpickler(pickle.Unpickler):
        return func(*args)

    CLASSES: dict[tuple[str, str], Any] = {
-        # getattr used here as a workaround for mypy not being smart enough to detrmine
+        # getattr used here as a workaround for mypy not being smart enough to determine
        # the staticmethods have a __func__ attribute.
        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
        ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
@@ -832,7 +862,17 @@ class OutputFile:
        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
        self.gguf.add_head_count          (params.n_head)
        self.gguf.add_head_count_kv       (params.n_head_kv)
-        self.gguf.add_layer_norm_rms_eps  (params.f_norm_eps)
+
+        if params.n_experts:
+            self.gguf.add_expert_count(params.n_experts)
+
+        if params.n_experts_used:
+            self.gguf.add_expert_used_count(params.n_experts_used)
+
+        if params.f_norm_eps:
+            self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
+        else:
+            raise ValueError('f_norm_eps is None')

        if params.f_rope_freq_base is not None:
            self.gguf.add_rope_freq_base(params.f_rope_freq_base)
@@ -956,7 +996,7 @@ class OutputFile:


 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
-    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type
+    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type

    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
        return GGMLFileType.AllF32
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -215,9 +215,10 @@ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end
 llama_print_timings(context)

 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-    let n_tokens = text.count + (add_bos ? 1 : 0)
+    let utf8Count = text.utf8.count
+    let n_tokens = utf8Count + (add_bos ? 1 : 0)
    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
+    let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
    var swiftTokens: [llama_token] = []
    for i in 0 ..< tokenCount {
        swiftTokens.append(tokens[Int(i)])
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -53,6 +53,13 @@ static std::vector<T> split(const std::string & str, char delim) {
    return values;
 }

+template<typename T, typename F>
+static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
+    std::vector<std::string> str_values;
+    std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
+    return str_values;
+}
+
 template<typename T>
 static T avg(const std::vector<T> & v) {
    if (v.empty()) {
@@ -126,7 +133,8 @@ struct cmd_params {
    std::vector<int> n_prompt;
    std::vector<int> n_gen;
    std::vector<int> n_batch;
-    std::vector<bool> f32_kv;
+    std::vector<ggml_type> type_k;
+    std::vector<ggml_type> type_v;
    std::vector<int> n_threads;
    std::vector<int> n_gpu_layers;
    std::vector<int> main_gpu;
@@ -142,7 +150,8 @@ static const cmd_params cmd_params_defaults = {
    /* n_prompt      */ {512},
    /* n_gen         */ {128},
    /* n_batch       */ {512},
-    /* f32_kv        */ {false},
+    /* type_k        */ {GGML_TYPE_F16},
+    /* type_v        */ {GGML_TYPE_F16},
    /* n_threads     */ {get_num_physical_cores()},
    /* n_gpu_layers  */ {99},
    /* main_gpu      */ {0},
@@ -162,7 +171,8 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -p, --n-prompt <n>                (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
    printf("  -n, --n-gen <n>                   (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
    printf("  -b, --batch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  --memory-f32 <0|1>                (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
+    printf("  -ctk <t>, --cache-type-k <t>      (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf("  -ctv <t>, --cache-type-v <t>      (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
    printf("  -t, --threads <n>                 (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
    printf("  -ngl, --n-gpu-layers <n>          (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
    printf("  -mg, --main-gpu <i>               (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
@@ -173,9 +183,32 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -v, --verbose                     (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
    printf("\n");
    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
-
 }

+static ggml_type ggml_type_from_name(const std::string & s) {
+    if (s == "f16") {
+        return GGML_TYPE_F16;
+    }
+    if (s == "q8_0") {
+        return GGML_TYPE_Q8_0;
+    }
+    if (s == "q4_0") {
+        return GGML_TYPE_Q4_0;
+    }
+    if (s == "q4_1") {
+        return GGML_TYPE_Q4_1;
+    }
+    if (s == "q5_0") {
+        return GGML_TYPE_Q5_0;
+    }
+    if (s == "q5_1") {
+        return GGML_TYPE_Q5_1;
+    }
+
+    return GGML_TYPE_COUNT;
+}
+
+
 static cmd_params parse_cmd_params(int argc, char ** argv) {
    cmd_params params;
    std::string arg;
@@ -224,13 +257,38 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
-        } else if (arg == "--memory-f32") {
+        } else if (arg == "-ctk" || arg == "--cache-type-k") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<int>(argv[i], split_delim);
-            params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
+            auto p = split<std::string>(argv[i], split_delim);
+            std::vector<ggml_type> types;
+            for (const auto & t : p) {
+                ggml_type gt = ggml_type_from_name(t);
+                if (gt == GGML_TYPE_COUNT) {
+                    invalid_param = true;
+                    break;
+                }
+                types.push_back(gt);
+            }
+            params.type_k.insert(params.type_k.end(), types.begin(), types.end());
+        } else if (arg == "-ctv" || arg == "--cache-type-v") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<std::string>(argv[i], split_delim);
+            std::vector<ggml_type> types;
+            for (const auto & t : p) {
+                ggml_type gt = ggml_type_from_name(t);
+                if (gt == GGML_TYPE_COUNT) {
+                    invalid_param = true;
+                    break;
+                }
+                types.push_back(gt);
+            }
+            params.type_v.insert(params.type_v.end(), types.begin(), types.end());
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
@@ -321,7 +379,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
    if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
    if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
-    if (params.f32_kv.empty())       { params.f32_kv = cmd_params_defaults.f32_kv; }
+    if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
+    if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
@@ -336,7 +395,8 @@ struct cmd_params_instance {
    int n_prompt;
    int n_gen;
    int n_batch;
-    bool f32_kv;
+    ggml_type type_k;
+    ggml_type type_v;
    int n_threads;
    int n_gpu_layers;
    int main_gpu;
@@ -365,7 +425,8 @@ struct cmd_params_instance {

        cparams.n_ctx = n_prompt + n_gen;
        cparams.n_batch = n_batch;
-        cparams.f16_kv = !f32_kv;
+        cparams.type_k = type_k;
+        cparams.type_v = type_v;
        cparams.mul_mat_q = mul_mat_q;

        return cparams;
@@ -380,7 +441,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
    for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
+    for (const auto & tk : params.type_k)
+    for (const auto & tv : params.type_v)
    for (const auto & mmq : params.mul_mat_q)
    for (const auto & nt : params.n_threads) {
        cmd_params_instance instance = {
@@ -388,7 +450,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
            /* .n_prompt     = */ n_prompt,
            /* .n_gen        = */ n_gen,
            /* .n_batch      = */ nb,
-            /* .f32_kv       = */ fk,
+            /* .type_k       = */ tk,
+            /* .type_v       = */ tv,
            /* .n_threads    = */ nt,
            /* .n_gpu_layers = */ nl,
            /* .main_gpu     = */ mg,
@@ -410,7 +473,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
    for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
+    for (const auto & tk : params.type_k)
+    for (const auto & tv : params.type_v)
    for (const auto & mmq : params.mul_mat_q)
    for (const auto & nt : params.n_threads) {
        for (const auto & n_prompt : params.n_prompt) {
@@ -422,7 +486,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_prompt     = */ n_prompt,
                /* .n_gen        = */ 0,
                /* .n_batch      = */ nb,
-                /* .f32_kv       = */ fk,
+                /* .type_k       = */ tk,
+                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .main_gpu     = */ mg,
@@ -441,7 +506,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_prompt     = */ 0,
                /* .n_gen        = */ n_gen,
                /* .n_batch      = */ nb,
-                /* .f32_kv       = */ fk,
+                /* .type_k       = */ tk,
+                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .main_gpu     = */ mg,
@@ -489,7 +555,8 @@ struct test {
    uint64_t model_n_params;
    int n_batch;
    int n_threads;
-    bool f32_kv;
+    ggml_type type_k;
+    ggml_type type_v;
    int n_gpu_layers;
    int main_gpu;
    bool mul_mat_q;
@@ -508,7 +575,8 @@ struct test {
        model_n_params = llama_model_n_params(lmodel);
        n_batch = inst.n_batch;
        n_threads = inst.n_threads;
-        f32_kv = inst.f32_kv;
+        type_k = inst.type_k;
+        type_v = inst.type_v;
        n_gpu_layers = inst.n_gpu_layers;
        main_gpu = inst.main_gpu;
        mul_mat_q = inst.mul_mat_q;
@@ -571,7 +639,7 @@ struct test {
            "cuda", "opencl", "metal", "gpu_blas", "blas",
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
-            "n_batch", "n_threads", "f16_kv",
+            "n_batch", "n_threads", "type_k", "type_v",
            "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
            "n_prompt", "n_gen", "test_time",
            "avg_ns", "stddev_ns",
@@ -621,7 +689,7 @@ struct test {
            std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
+            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),
@@ -805,8 +873,11 @@ struct markdown_printer : public printer {
        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
            fields.push_back("n_batch");
        }
-        if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
-            fields.push_back("f16_kv");
+        if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
+            fields.push_back("type_k");
+        }
+        if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
+            fields.push_back("type_v");
        }
        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
            fields.push_back("main_gpu");
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -11,6 +11,8 @@ actor LlamaContext {
    private var context: OpaquePointer
    private var batch: llama_batch
    private var tokens_list: [llama_token]
+    /// This variable is used to store temporarily invalid cchars
+    private var temporary_invalid_cchars: [CChar]

    var n_len: Int32 = 512
    var n_cur: Int32 = 0
@@ -21,6 +23,7 @@ actor LlamaContext {
        self.context = context
        self.tokens_list = []
        self.batch = llama_batch_init(512, 0, 1)
+        self.temporary_invalid_cchars = []
    }

    deinit {
@@ -61,6 +64,7 @@ actor LlamaContext {
        print("attempting to complete \"\(text)\"")

        tokens_list = tokenize(text: text, add_bos: true)
+        temporary_invalid_cchars = []

        let n_ctx = llama_n_ctx(context)
        let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
@@ -72,7 +76,7 @@ actor LlamaContext {
        }

        for id in tokens_list {
-            print(token_to_piece(token: id))
+            print(String(cString: token_to_piece(token: id) + [0]))
        }

        // batch = llama_batch_init(512, 0) // done in init()
@@ -115,10 +119,25 @@ actor LlamaContext {

        if new_token_id == llama_token_eos(context) || n_cur == n_len {
            print("\n")
-            return ""
+            let new_token_str = String(cString: temporary_invalid_cchars + [0])
+            temporary_invalid_cchars.removeAll()
+            return new_token_str
        }

-        let new_token_str = token_to_piece(token: new_token_id)
+        let new_token_cchars = token_to_piece(token: new_token_id)
+        temporary_invalid_cchars.append(contentsOf: new_token_cchars)
+        let new_token_str: String
+        if let string = String(validatingUTF8: temporary_invalid_cchars + [0]) {
+            temporary_invalid_cchars.removeAll()
+            new_token_str = string
+        } else if (0 ..< temporary_invalid_cchars.count).contains(where: {$0 != 0 && String(validatingUTF8: Array(temporary_invalid_cchars.suffix($0)) + [0]) != nil}) {
+            // in this case, at least the suffix of the temporary_invalid_cchars can be interpreted as UTF8 string
+            let string = String(cString: temporary_invalid_cchars + [0])
+            temporary_invalid_cchars.removeAll()
+            new_token_str = string
+        } else {
+            new_token_str = ""
+        }
        print(new_token_str)
        // tokens_list.append(new_token_id)

@@ -144,12 +163,14 @@ actor LlamaContext {

    func clear() {
        tokens_list.removeAll()
+        temporary_invalid_cchars.removeAll()
    }

    private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-        let n_tokens = text.count + (add_bos ? 1 : 0)
+        let utf8Count = text.utf8.count
+        let n_tokens = utf8Count + (add_bos ? 1 : 0)
        let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-        let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, false)
+        let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)

        var swiftTokens: [llama_token] = []
        for i in 0..<tokenCount {
@@ -161,7 +182,8 @@ actor LlamaContext {
        return swiftTokens
    }

-    private func token_to_piece(token: llama_token) -> String {
+    /// - note: The result does not contain null-terminator
+    private func token_to_piece(token: llama_token) -> [CChar] {
        let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
        result.initialize(repeating: Int8(0), count: 8)
        defer {
@@ -175,10 +197,12 @@ actor LlamaContext {
            defer {
                newResult.deallocate()
            }
-            _ = llama_token_to_piece(model, token, newResult, -nTokens)
-            return String(cString: newResult)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
+            let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
+            return Array(bufferPointer)
        } else {
-            return String(cString: result)
+            let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
+            return Array(bufferPointer)
        }
    }
 }
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -739,7 +739,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
        temp->ny = longer_side;
        temp->size = 3 * longer_side * longer_side;
        temp->data = new uint8_t[temp->size]();
-        uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA
+        uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA

        // fill with background color
        for (size_t i = 0; i < temp->size; i++) {
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -51,7 +51,7 @@ def bytes_to_unicode():
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
--- a/examples/lookahead/README.md
+++ b/examples/lookahead/README.md
@@ -1,6 +1,6 @@
 # llama.cpp/examples/lookahead

-Demonstartion of lookahead decoding technique:
+Demonstration of lookahead decoding technique:

 https://lmsys.org/blog/2023-11-21-lookahead-decoding/

--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -437,6 +437,7 @@ int main(int argc, char ** argv) {
        }
    }
    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
+    LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");

--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -321,7 +321,6 @@ int main(int argc, char ** argv) {
        auto cparams = llama_context_default_params();
        cparams.n_ctx      = 256;
        cparams.seed       = 1;
-        cparams.f16_kv     = false;

        ctx = llama_new_context_with_model(model, cparams);

--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -222,7 +222,7 @@ node index.js

    `content`: Set the text to process.

-    **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
+-   **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.

    *Options:*

--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@@ -70,6 +70,7 @@ def make_postData(body, chat=False, stream=False):
    if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"]
    if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"]
    if(is_present(body, "seed")): postData["seed"] = body["seed"]
+    if(is_present(body, "grammar")): postData["grammar"] = body["grammar"]
    if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()]
    if (args.stop != ""):
        postData["stop"] = [args.stop]
--- a/examples/server/json.hpp
+++ b/examples/server/json.hpp
@@ -11227,7 +11227,7 @@ class binary_reader
                }
                if (is_ndarray) // ndarray dimensional vector can only contain integers, and can not embed another array
                {
-                    return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimentional vector is not allowed", "size"), nullptr));
+                    return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimensional vector is not allowed", "size"), nullptr));
                }
                std::vector<size_t> dim;
                if (JSON_HEDLEY_UNLIKELY(!get_ubjson_ndarray_size(dim)))
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@@ -114,7 +114,7 @@ export async function* llama(prompt, params = {}, config = {}) {
  return content;
 }

-// Call llama, return an event target that you can subcribe to
+// Call llama, return an event target that you can subscribe to
 //
 // Example:
 //
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -223,7 +223,7 @@
      repeat_last_n: 256, // 0 = disable penalty, -1 = context size
      repeat_penalty: 1.18, // 1.0 = disabled
      top_k: 40, // <= 0 to use vocab size
-      top_p: 0.5, // 1.0 = disabled
+      top_p: 0.95, // 1.0 = disabled
      min_p: 0.05, // 0 = disabled
      tfs_z: 1.0, // 1.0 = disabled
      typical_p: 1.0, // 1.0 = disabled
@@ -238,7 +238,7 @@
      cache_prompt: true
    })

-    /* START: Support for storing prompt templates and parameters in borwser LocalStorage */
+    /* START: Support for storing prompt templates and parameters in browsers LocalStorage */

    const local_storage_storageKey = "llamacpp_server_local_storage";

@@ -282,7 +282,7 @@
    let importedTemplates = local_storage_getDataAsObject('user_templates')

    if (importedTemplates) {
-      // saved templates were successfuly imported.
+      // saved templates were successfully imported.

      console.log('Processing saved templates and updating default template')
      params.value = { ...params.value, image_data: [] };
@@ -303,7 +303,7 @@
    }

    function userTemplateResetToDefault() {
-      console.log('Reseting themplate to default')
+      console.log('Resetting template to default')
      selectedUserTemplate.value.name = 'default';
      selectedUserTemplate.value.data = savedUserTemplates.value['default'];
    }
@@ -762,7 +762,7 @@

          <fieldset class="two">
            ${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
-            ${FloatField({ label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
+            ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
            ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
            ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
            ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2108,10 +2108,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.yarn_beta_slow = std::stof(argv[i]);
        }
-        else if (arg == "--memory-f32" || arg == "--memory_f32")
-        {
-            params.memory_f16 = false;
-        }
        else if (arg == "--threads" || arg == "-t")
        {
            if (++i >= argc)
@@ -2386,7 +2382,9 @@ json oaicompat_completion_params_parse(
    llama_params["__oaicompat"] = true;

    // Map OpenAI parameters to llama.cpp parameters
+    llama_params["model"]             = json_value(body, "model", std::string("uknown"));
    llama_params["prompt"]            = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
+    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
    llama_params["temperature"]       = json_value(body, "temperature", 0.8);
    llama_params["top_k"]             = json_value(body, "top_k", 40);
    llama_params["top_p"]             = json_value(body, "top_p", 0.95);
@@ -2410,9 +2408,7 @@ json oaicompat_completion_params_parse(
    }

    // Handle 'stop' field
-    if (body["stop"].is_null()) {
-        llama_params["stop"] = json::array({});
-    } else if (body["stop"].is_string()) {
+    if (body.contains("stop") && body["stop"].is_string()) {
        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -75,7 +75,7 @@ int main(int argc, char ** argv) {
    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx) {
        LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_TEE("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
+        LOG_TEE("%s:        either reduce n_len or increase n_ctx\n", __func__);
        return 1;
    }

--- a/examples/speculative/README.md
+++ b/examples/speculative/README.md
@@ -1,6 +1,6 @@
 # llama.cpp/examples/speculative

-Demonstartion of speculative decoding and tree-based speculative decoding techniques
+Demonstration of speculative decoding and tree-based speculative decoding techniques

 More info:

--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -203,8 +203,9 @@ int main(int argc, char ** argv) {

            const std::string token_str = llama_token_to_piece(ctx_tgt, id);

-            printf("%s", token_str.c_str());
-            fflush(stdout);
+            if (!params.use_color) {
+                printf("%s", token_str.c_str());
+            }

            if (id == llama_token_eos(model_tgt)) {
                has_eos = true;
@@ -236,10 +237,18 @@ int main(int argc, char ** argv) {
                    ++n_past_tgt;
                    ++n_past_dft;
                    ++i_dft;
-
+                    if (params.use_color) {
+                        // Color token according to its origin sequence
+                        printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
+                        fflush(stdout);
+                    }
                    continue;
                }
            }
+            if (params.use_color) {
+                printf("%s", token_str.c_str());
+            }
+            fflush(stdout);

            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());

@@ -419,7 +428,7 @@ int main(int argc, char ** argv) {
            ++n_past_tgt;
        }

-        // the first token is always proposed by the traget model before the speculation loop so we erase it here
+        // the first token is always proposed by the target model before the speculation loop so we erase it here
        for (int s = 0; s < n_seq_dft; ++s) {
            if (!drafts[s].active) {
                continue;
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1295,10 +1295,6 @@ int main(int argc, char ** argv) {
        opt_cb_data.last_save_iter = opt->iter;
    }

-    if (alloc) {
-        ggml_allocr_free(alloc);
-    }
-
    ggml_free(opt->ctx);
    free_train_state(train);
    ggml_free(model.ctx);
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -168,10 +168,6 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
    size = aligned_offset(NULL, size, alloc->alignment);
    AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);

-    if (!alloc->measure) {
-        ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
-    }
-
 #ifdef GGML_ALLOCATOR_DEBUG
    remove_allocated_tensor(alloc, tensor);
 #endif
@@ -237,7 +233,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
 }

 ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
-    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
+    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);

    ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));

@@ -449,7 +445,6 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n
 static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
    ggml_tallocr_t alloc = node_tallocr(galloc, view);

-    //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
    GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
    if (update_backend) {
        view->backend = view->view_src->backend;
@@ -459,7 +454,7 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd

    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
-    assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
+    assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);

    if (!alloc->measure) {
        ggml_backend_buffer_init_tensor(alloc->buffer, view);
@@ -765,3 +760,43 @@ size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
 size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
    return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
 }
+
+// utils
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+    size_t alignment = ggml_backend_buft_get_alignment(buft);
+
+    size_t nbytes = 0;
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        if (t->data == NULL && t->view_src == NULL) {
+            nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+        }
+    }
+
+    if (nbytes == 0) {
+        fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+    ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
+
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        if (t->data == NULL) {
+            if (t->view_src == NULL) {
+                ggml_tallocr_alloc(tallocr, t);
+            } else {
+                ggml_backend_view_init(buffer, t);
+            }
+        }
+    }
+
+    ggml_tallocr_free(tallocr);
+
+    return buffer;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
+    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
+}
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@@ -8,6 +8,7 @@ extern "C" {

 struct ggml_backend;
 struct ggml_backend_buffer;
+struct ggml_backend_buffer_type;

 //
 // Legacy API
@@ -42,7 +43,7 @@ GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph
 // ggml-backend v2 API
 //

-// Seperate tensor and graph allocator objects
+// Separate tensor and graph allocator objects
 // This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
 // The original API is kept as a wrapper around the new API

@@ -80,6 +81,12 @@ GGML_API void   ggml_gallocr_alloc_graph_n(
                    struct ggml_hash_set hash_set,
                    ggml_tallocr_t * hash_node_talloc);

+
+// Utils
+// Create a buffer and allocate all the tensors in a ggml_context
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
+
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@@ -12,31 +12,50 @@ extern "C" {
    // Backend buffer
    //

+    // buffer type
+    typedef void * ggml_backend_buffer_type_context_t;
+
+    struct ggml_backend_buffer_type_i {
+        ggml_backend_buffer_t (*alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
+        size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+    };
+
+    struct ggml_backend_buffer_type {
+        struct ggml_backend_buffer_type_i  iface;
+        ggml_backend_buffer_type_context_t context;
+    };
+
+    // buffer
    typedef void * ggml_backend_buffer_context_t;

    struct ggml_backend_buffer_i {
-        void   (*free_buffer)   (ggml_backend_buffer_t buffer);
-        void * (*get_base)      (ggml_backend_buffer_t buffer); // get base pointer
-        size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
-        void   (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
-        void   (*free_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
+        void     (*free_buffer)(ggml_backend_buffer_t buffer);
+        //void     (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+        void *   (*get_base)   (ggml_backend_buffer_t buffer);
+        void     (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void     (*set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void     (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
+        void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
    };

    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i iface;
-
-        ggml_backend_t                backend;
+        struct ggml_backend_buffer_i  iface;
+        ggml_backend_buffer_type_t    buft;
        ggml_backend_buffer_context_t context;
-
        size_t size;
    };

-    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
-            struct ggml_backend                  * backend,
+    ggml_backend_buffer_t ggml_backend_buffer_init(
+                   ggml_backend_buffer_type_t      buft,
            struct ggml_backend_buffer_i           iface,
                   ggml_backend_buffer_context_t   context,
                   size_t                          size);

+
    //
    // Backend
    //
@@ -49,20 +68,17 @@ extern "C" {
        void (*free)(ggml_backend_t backend);

        // buffer allocation
-        ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
+        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);

-        // get buffer alignment
-        size_t (*get_alignment)(ggml_backend_t backend);
-
-        // tensor data access
-        // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
+        // (optional) asynchroneous tensor data access
        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        void (*synchronize)     (ggml_backend_t backend);

-        // (optional) copy tensor between different backends, allow for single-copy tranfers
-        void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+        // (optional) asynchroneous tensor copy
+        void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+        void (*synchronize)     (ggml_backend_t backend);

        // compute graph with a plan
        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
@@ -82,6 +98,15 @@ extern "C" {
        ggml_backend_context_t context;
    };

+
+    //
+    // Backend registry
+    //
+
+    typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
+
+    void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml-backend.c
+++ b/ggml-backend.c
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -7,41 +7,44 @@
 extern "C" {
 #endif

+    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+    typedef struct ggml_backend * ggml_backend_t;
+    typedef void * ggml_backend_graph_plan_t;
+
    //
    // Backend buffer
    //

-    struct ggml_backend_buffer;
-    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+    // buffer type
+    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
+    GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+    GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);

-    // backend buffer functions
+    // buffer
    GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
    GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
    GGML_API size_t ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);

    //
    // Backend
    //

-    struct ggml_backend;
-    typedef struct ggml_backend * ggml_backend_t;
-    typedef void * ggml_backend_graph_plan_t;
-
-    GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);

    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
    GGML_API void         ggml_backend_free(ggml_backend_t backend);

-    GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);

-    GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
-
-    GGML_API void ggml_backend_tensor_set_async(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
@@ -57,6 +60,7 @@ extern "C" {

    // tensor copy between different backends
    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy

    //
    // CPU backend
@@ -68,8 +72,23 @@ extern "C" {
    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);

    // Create a backend buffer from an existing pointer
-    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
+    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);

+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+
+    //
+    // Backend registry
+    //
+
+    // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
+
+    GGML_API size_t                     ggml_backend_reg_get_count(void);
+    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
+    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
+    GGML_API const char *               ggml_backend_reg_get_name(size_t i);
+    GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
+    GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
+    GGML_API ggml_backend_buffer_t      ggml_backend_reg_alloc_buffer(size_t i, size_t size);

    //
    // Backend scheduler
@@ -131,6 +150,32 @@ extern "C" {
            ggml_backend_sched_t sched,
            struct ggml_cgraph * graph);

+
+    //
+    // Utils
+    //
+
+    struct ggml_backend_graph_copy {
+        ggml_backend_buffer_t buffer;
+        struct ggml_context * ctx_allocated;
+        struct ggml_context * ctx_unallocated;
+        struct ggml_cgraph * graph;
+    };
+
+    // Copy a graph to a different backend
+    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
+    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
+
+    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+
+    // Compare the output of two backends
+    GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+
+    // Tensor initialization
+    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
+    GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+
+
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -49,7 +49,15 @@ GGML_API int    ggml_cuda_get_device_count(void);
 GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);

 // backend API
-GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
+GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
+
+GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
+GGML_API int  ggml_backend_cuda_get_device(ggml_backend_t backend);
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+
+// pinned host buffer for use with CPU backend for faster copies between CPU and GPU
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

 #ifdef  __cplusplus
 }
--- a/ggml-impl.h
+++ b/ggml-impl.h
@@ -232,7 +232,7 @@ bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml
 // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
 size_t ggml_hash_find          (const struct ggml_hash_set hash_set, struct ggml_tensor * key);

-// returns GGML_HAHSHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
+// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
 size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml_tensor * key);

 // return index, asserts if table is full
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -99,6 +99,12 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+
+// helper to check if the device supports a specific family
+// ideally, the user code should be doing these checks
+// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);

 #ifdef __cplusplus
 }
--- a/ggml-metal.m
+++ b/ggml-metal.m
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -3114,7 +3114,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri

    size_t vl = __riscv_vsetvl_e8m1(qk/2);

-    // These tempory registers are for masking and shift operations
+    // These temporary registers are for masking and shift operations
    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
    vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);

@@ -4757,7 +4757,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri

            vl = 16;

-            // retreive lane to multiply with scale
+            // retrieve lane to multiply with scale
            vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
            vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
            vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@@ -215,9 +215,9 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this

 #define GGML_MAX_DIMS           4
-#define GGML_MAX_PARAMS         1024
+#define GGML_MAX_PARAMS         2048
 #define GGML_MAX_CONTEXTS       64
-#define GGML_MAX_SRC            6
+#define GGML_MAX_SRC            10
 #define GGML_MAX_NAME           64
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
@@ -283,6 +283,20 @@
    const type prefix##3 = (pointer)->array[3]; \
    GGML_UNUSED(prefix##3);

+#define GGML_TENSOR_UNARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
+#define GGML_TENSOR_BINARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -381,6 +395,7 @@ extern "C" {
        GGML_OP_GROUP_NORM,

        GGML_OP_MUL_MAT,
+        GGML_OP_MUL_MAT_ID,
        GGML_OP_OUT_PROD,

        GGML_OP_SCALE,
@@ -407,8 +422,8 @@ extern "C" {
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
-
        GGML_OP_UPSCALE, // nearest interpolate
+        GGML_OP_ARGSORT,

        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_FF,
@@ -448,7 +463,9 @@ extern "C" {
        GGML_UNARY_OP_GELU,
        GGML_UNARY_OP_GELU_QUICK,
        GGML_UNARY_OP_SILU,
-        GGML_UNARY_OP_LEAKY
+        GGML_UNARY_OP_LEAKY,
+
+        GGML_UNARY_OP_COUNT,
    };

    enum ggml_object_type {
@@ -631,6 +648,9 @@ extern "C" {
    GGML_API const char * ggml_op_name  (enum ggml_op   op);
    GGML_API const char * ggml_op_symbol(enum ggml_op   op);

+    GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
+    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+
    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);

    GGML_API bool    ggml_is_quantized(enum ggml_type type);
@@ -1027,6 +1047,16 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // indirect matrix multiplication
+    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
+    GGML_API struct ggml_tensor * ggml_mul_mat_id(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * const as[],
+            int                   n_as,
+            struct ggml_tensor  * ids,
+            int                   id,
+            struct ggml_tensor  * b);
+
    // A: m columns, n rows,
    // B: p columns, n rows,
    // result is m columns, p rows
@@ -1234,6 +1264,7 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // supports 3D: a->ne[2] == b->ne[1]
    GGML_API struct ggml_tensor * ggml_get_rows(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -1520,6 +1551,23 @@ extern "C" {
            struct ggml_tensor  * a,
            int                   scale_factor);

+    // sort rows
+    enum ggml_sort_order {
+        GGML_SORT_ASC,
+        GGML_SORT_DESC,
+    };
+
+    GGML_API struct ggml_tensor * ggml_argsort(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_sort_order  order);
+
+    // top k elements per row
+    GGML_API struct ggml_tensor * ggml_top_k(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   k);
+
    GGML_API struct ggml_tensor * ggml_flash_attn(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
@@ -1581,7 +1629,6 @@ extern "C" {
            int                   kh);

    // used in sam
-
    GGML_API struct ggml_tensor * ggml_add_rel_pos(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -1756,7 +1803,7 @@ extern "C" {
    GGML_API struct ggml_cgraph * ggml_new_graph         (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
    GGML_API struct ggml_cgraph * ggml_new_graph_custom  (struct ggml_context * ctx, size_t size, bool grads);
    GGML_API struct ggml_cgraph * ggml_graph_dup         (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API struct ggml_cgraph * ggml_graph_view        (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
+    GGML_API struct ggml_cgraph   ggml_graph_view        (struct ggml_cgraph * cgraph, int i0, int i1);
    GGML_API void                 ggml_graph_cpy         (struct ggml_cgraph * src, struct ggml_cgraph * dst);
    GGML_API void                 ggml_graph_reset       (struct ggml_cgraph * cgraph);  // zero grads
    GGML_API void                 ggml_graph_clear       (struct ggml_cgraph * cgraph);
--- a/gguf-py/README.md
+++ b/gguf-py/README.md
@@ -61,7 +61,7 @@ If you want to publish the package manually for any reason, you need to have `tw
 pip install build twine
 ```

-Then, folow these steps to release a new version:
+Then, follow these steps to release a new version:

 1. Bump the version in `pyproject.toml`.
 2. Build the package:
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -38,6 +38,8 @@ class Keys:
        FEED_FORWARD_LENGTH   = "{arch}.feed_forward_length"
        USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
        TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"
+        EXPERT_COUNT          = "{arch}.expert_count"
+        EXPERT_USED_COUNT     = "{arch}.expert_used_count"

    class Attention:
        HEAD_COUNT        = "{arch}.attention.head_count"
@@ -111,10 +113,14 @@ class MODEL_TENSOR(IntEnum):
    ATTN_NORM       = auto()
    ATTN_NORM_2     = auto()
    ATTN_ROT_EMBD   = auto()
+    FFN_GATE_INP    = auto()
+    FFN_NORM        = auto()
    FFN_GATE        = auto()
    FFN_DOWN        = auto()
    FFN_UP          = auto()
-    FFN_NORM        = auto()
+    FFN_GATE_EXP    = auto()
+    FFN_DOWN_EXP    = auto()
+    FFN_UP_EXP      = auto()
    ATTN_Q_NORM     = auto()
    ATTN_K_NORM     = auto()

@@ -154,10 +160,14 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.ATTN_ROT_EMBD:   "blk.{bid}.attn_rot_embd",
    MODEL_TENSOR.ATTN_Q_NORM:     "blk.{bid}.attn_q_norm",
    MODEL_TENSOR.ATTN_K_NORM:     "blk.{bid}.attn_k_norm",
+    MODEL_TENSOR.FFN_GATE_INP:    "blk.{bid}.ffn_gate_inp",
    MODEL_TENSOR.FFN_NORM:        "blk.{bid}.ffn_norm",
    MODEL_TENSOR.FFN_GATE:        "blk.{bid}.ffn_gate",
    MODEL_TENSOR.FFN_DOWN:        "blk.{bid}.ffn_down",
    MODEL_TENSOR.FFN_UP:          "blk.{bid}.ffn_up",
+    MODEL_TENSOR.FFN_GATE_EXP:    "blk.{bid}.ffn_gate.{xid}",
+    MODEL_TENSOR.FFN_DOWN_EXP:    "blk.{bid}.ffn_down.{xid}",
+    MODEL_TENSOR.FFN_UP_EXP:      "blk.{bid}.ffn_up.{xid}",
 }

 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -172,10 +182,14 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
    ],
    MODEL_ARCH.GPTNEOX: [
        MODEL_TENSOR.TOKEN_EMBD,
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -339,6 +339,12 @@ class GGUFWriter:
    def add_clamp_kqv(self, value: float) -> None:
        self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)

+    def add_expert_count(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
+
+    def add_expert_used_count(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count)
+
    def add_layer_norm_eps(self, value: float) -> None:
        self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)

--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -149,6 +149,11 @@ class TensorNameMap:
            "model.layers.{bid}.ln2",                                        # yi
        ),

+        MODEL_TENSOR.FFN_GATE_INP: (
+            "layers.{bid}.feed_forward.gate",           # mixtral
+            "model.layers.{bid}.block_sparse_moe.gate", # mixtral
+        ),
+
        # Feed-forward up
        MODEL_TENSOR.FFN_UP: (
            "gpt_neox.layers.{bid}.mlp.dense_h_to_4h",                # gptneox
@@ -164,11 +169,21 @@ class TensorNameMap:
            "transformer.h.{bid}.mlp.w1",                             # qwen
        ),

+        MODEL_TENSOR.FFN_UP_EXP: (
+            "layers.{bid}.feed_forward.experts.{xid}.w3",           # mixtral
+            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
+        ),
+
        # Feed-forward gate
        MODEL_TENSOR.FFN_GATE: (
-            "model.layers.{bid}.mlp.gate_proj",  # llama-hf refact
-            "layers.{bid}.feed_forward.w1",      # llama-pth
-            "transformer.h.{bid}.mlp.w2",        # qwen
+            "model.layers.{bid}.mlp.gate_proj",           # llama-hf refact
+            "layers.{bid}.feed_forward.w1",               # llama-pth
+            "transformer.h.{bid}.mlp.w2",                 # qwen
+        ),
+
+        MODEL_TENSOR.FFN_GATE_EXP: (
+            "layers.{bid}.feed_forward.experts.{xid}.w1",           # mixtral
+            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
        ),

        # Feed-forward down
@@ -185,6 +200,11 @@ class TensorNameMap:
            "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
        ),

+        MODEL_TENSOR.FFN_DOWN_EXP: (
+            "layers.{bid}.feed_forward.experts.{xid}.w2",           # mixtral
+            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
+        ),
+
        MODEL_TENSOR.ATTN_Q_NORM: (
            "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
        ),
@@ -213,11 +233,14 @@ class TensorNameMap:
            for tensor, keys in self.block_mappings_cfg.items():
                if tensor not in MODEL_TENSORS[arch]:
                    continue
-                tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
-                self.mapping[tensor_name] = (tensor, tensor_name)
-                for key in keys:
-                    key = key.format(bid = bid)
-                    self.mapping[key] = (tensor, tensor_name)
+                # TODO: make this configurable
+                n_experts = 8
+                for xid in range(n_experts):
+                    tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
+                    self.mapping[tensor_name] = (tensor, tensor_name)
+                    for key in keys:
+                        key = key.format(bid = bid, xid = xid)
+                        self.mapping[key] = (tensor, tensor_name)

    def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
        result = self.mapping.get(key)
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.6.0"
+version = "0.7.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
--- a/llama.cpp
+++ b/llama.cpp
--- a/llama.h
+++ b/llama.h
@@ -42,7 +42,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 2
+#define LLAMA_SESSION_VERSION 3

 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
@@ -158,6 +158,22 @@ extern "C" {
        llama_seq_id all_seq_id; // used if seq_id == NULL
    } llama_batch;

+    enum llama_model_kv_override_type {
+        LLAMA_KV_OVERRIDE_INT,
+        LLAMA_KV_OVERRIDE_FLOAT,
+        LLAMA_KV_OVERRIDE_BOOL,
+    };
+
+    struct llama_model_kv_override {
+        char key[128];
+        enum llama_model_kv_override_type tag;
+        union {
+            int64_t int_value;
+            double float_value;
+            bool bool_value;
+        };
+    };
+
    struct llama_model_params {
        int32_t n_gpu_layers; // number of layers to store in VRAM
        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
@@ -165,9 +181,13 @@ extern "C" {

        // called with a progress value between 0 and 1, pass NULL to disable
        llama_progress_callback progress_callback;
+
        // context pointer passed to the progress callback
        void * progress_callback_user_data;

+        // override key-value pairs of the model meta data
+        const struct llama_model_kv_override * kv_overrides;
+
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool vocab_only; // only load the vocabulary, no weights
        bool use_mmap;   // use mmap if possible
@@ -191,11 +211,14 @@ extern "C" {
        float    yarn_beta_slow;   // YaRN high correction dim
        uint32_t yarn_orig_ctx;    // YaRN original context size

+        enum ggml_type type_k; // data type for K cache
+        enum ggml_type type_v; // data type for V cache
+
        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
-        bool embedding;  // embedding mode only
+        bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+        bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        bool embedding;   // embedding mode only
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
    };

    // model quantization parameters
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -20,5 +20,6 @@ cp -rpv ../ggml/include/ggml/ggml.h         ./ggml.h
 cp -rpv ../ggml/include/ggml/ggml-alloc.h   ./ggml-alloc.h
 cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h

-cp -rpv ../ggml/tests/test-opt.cpp    ./tests/test-opt.cpp
-cp -rpv ../ggml/tests/test-grad0.cpp  ./tests/test-grad0.cpp
+cp -rpv ../ggml/tests/test-opt.cpp         ./tests/test-opt.cpp
+cp -rpv ../ggml/tests/test-grad0.cpp       ./tests/test-grad0.cpp
+cp -rpv ../ggml/tests/test-backend-ops.cpp ./tests/test-backend-ops.cpp
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -22,26 +22,32 @@ endfunction()
 llama_build_and_test_executable(test-quantize-fns.cpp)
 llama_build_and_test_executable(test-quantize-perf.cpp)
 llama_build_and_test_executable(test-sampling.cpp)
+
 llama_build_executable(test-tokenizer-0-llama.cpp)
 llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+
 llama_build_executable(test-tokenizer-0-falcon.cpp)
 llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+
 llama_build_executable(test-tokenizer-1-llama.cpp)
-llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_test_executable(test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+llama_test_executable (test-tokenizer-1-llama    test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+llama_test_executable (test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+
 llama_build_executable(test-tokenizer-1-bpe.cpp)
-llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-llama_test_executable(test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
-llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
-llama_test_executable(test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-llama_test_executable(test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-# llama_test_executable(test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
+llama_test_executable (test-tokenizer-1-falcon           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test_executable (test-tokenizer-1-aquila           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+llama_test_executable (test-tokenizer-1-mpt              test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+llama_test_executable (test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
+llama_test_executable (test-tokenizer-1-gpt-neox         test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+llama_test_executable (test-tokenizer-1-refact           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+llama_test_executable (test-tokenizer-1-starcoder        test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+# llama_test_executable (test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
+
 llama_build_and_test_executable(test-grammar-parser.cpp)
 llama_build_and_test_executable(test-llama-grammar.cpp)
-llama_build_and_test_executable(test-grad0.cpp) # SLOW
+llama_build_and_test_executable(test-grad0.cpp)
 # llama_build_and_test_executable(test-opt.cpp) # SLOW
+llama_build_and_test_executable(test-backend-ops.cpp)

 llama_build_and_test_executable(test-rope.cpp)

--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
@@ -1,4 +1,4 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
 #include "ggml.h"

 #include <cmath>
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -117,7 +117,7 @@ static void usage(char * argv[]) {
    printf("  --size SIZE           set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
    printf("  -3                    use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
    printf("  -4                    use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
-    printf("  --op OP               set test opration as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
+    printf("  --op OP               set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
    printf("                        quantize_row_q_dot, vec_dot_q (all)\n");
    printf("  --type TYPE           set test type as");
    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
@@ -202,7 +202,7 @@ int main(int argc, char * argv[]) {
            }
            int alignment = std::stoi(argv[i]);
            if (alignment < 0 || alignment > MAX_ALIGNMENT) {
-            fprintf(stderr, "error: aligment-offset must be less than %d\n", MAX_ALIGNMENT);
+            fprintf(stderr, "error: alignment-offset must be less than %d\n", MAX_ALIGNMENT);
                invalid_param = true;
                break;
            }
Author	SHA1	Message	Date
Siwen Yu	9fb13f9584	common : add `--version` option to show build info in CLI (#4433 )	2023-12-13 14:50:14 +02:00
Georgi Gerganov	113f9942fc	readme : update hot topics	2023-12-13 14:05:38 +02:00
slaren	799a1cb13b	llama : add Mixtral support (#4406 ) * convert : support Mixtral as LLAMA arch * convert : fix n_ff typo * llama : model loading * ggml : sync latest ggml_mul_mat_id * llama : update graph to support MoE * llama : fix cur -> cur_expert * llama : first working version * llama : fix expert weighting in the FFN * ggml : ggml_get_rows support 2D indexing [n_tokens, n_experts] (cpu only) * ggml : add n_as argument to ggml_mul_mat_id * ggml : fix ggml_get_rows to take into account ne02 / ne11 * metal : add more general support for ggml_get_rows + tests * llama : add basic support for offloading moe with CUDA * metal : add/mul/div use general kernel when src1 not cont * metal : reduce the kernel launches for ggml_mul_mat_id * ggml : get_rows : support non-contiguos tensors with gaps, generalize up to 3D * ggml : update get_rows f16 and q * cuda : support non-contiguous src1 in get_rows * llama : offload missing ffn_moe_silu * metal : fix ggml_get_rows to work with non-cont src1 * metal : add indirect mat-vec kernels for all quantization types * llama : do not quantize expert gating tensors * llama : add n_expert and n_expert_used to hparams + change quants * test-backend-ops : add moe test * cuda : fix get_rows when ncols is odd * convert : determine n_ctx correctly * metal : fix ggml_mul_mat_id for F32 * test-backend-ops : make experts more evenly probable (test_moe) * test-backend-ops : cleanup, add moe test for batches * test-backend-ops : add cpy from f32 -> all types test * test-backend-ops : fix dequantize block offset * llama : fix hard-coded number of experts * test-backend-ops : simplify and disable slow tests to avoid CI timeout * test-backend-ops : disable MOE test with thread sanitizer * cuda : fix mul_mat_id with multi gpu * convert : use 1e6 rope_freq_base for mixtral * convert : fix style * convert : support safetensors format * gguf-py : bump version * metal : add cpy f16 -> f32 kernel * metal : fix binary ops for ne10 % 4 != 0 * test-backend-ops : add one more sum_rows test * ggml : do not use BLAS with ggml_mul_mat_id * convert-hf : support for mixtral-instruct (#4428) * convert : typo fix, add additional hyperparameters, use LLaMA arch for Mixtral-instruct * convert : use sentencepiece tokenizer for Mixtral-instruct * convert : make flake8 happy * metal : fix soft_max kernels ref: `1914017863` * metal : limit kernels to not use more than the allowed threads --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Radek Pilar <github@mrkva.eu>	2023-12-13 14:04:25 +02:00
kalomaze	fecac45658	server : tweak default sampling parameters (#4367 ) * Set a more typical Top P setting as the default * Update temp max	2023-12-12 12:12:35 +02:00
Richard Kiss	9494d7c477	english : use `typos` to fix comments and logs (#4354 )	2023-12-12 11:53:36 +02:00
Jared Van Bortel	6138963fb2	build : target Windows 8 for standard mingw-w64 (#4405 ) * build : target Windows 8 for standard mingw-w64 * make : fix missing console.o deps This was causing a link error with `make all` on Windows.	2023-12-12 11:27:26 +02:00
crasm	6391817cd1	llama : document logits_all deprecation (#4418 ) llama_context_params.logits_all is a parameter for controlling llama_eval. This documents that logits_all should not be used with llama_decode and llama_batch.	2023-12-12 11:25:57 +02:00
Vladimir Zorin	d9d4cfef64	server : fix local model name in server (#4420 )	2023-12-12 11:25:29 +02:00
Taikono-Himazin	41a11aaf99	ggml : increased GGML_MAX_PARAMS to allow finetuning of 70b models (#4424 )	2023-12-12 11:24:32 +02:00
Yueh-Po Peng	8a7b2fa528	Update README.md (#4388 ) Fix small typo.	2023-12-10 23:27:38 +01:00
Xiang (Kevin) Li	e18f7345a3	grammar : revert the replacement of llama_token_to_piece with id_to_token (#4396 )	2023-12-09 23:29:27 +02:00
Georgi Gerganov	fe680e3d10	sync : ggml (new ops, tests, backend, etc.) (#4359 ) * sync : ggml (part 1) * sync : ggml (part 2, CUDA) * sync : ggml (part 3, Metal) * ggml : build fixes ggml-ci * cuda : restore lost changes * cuda : restore lost changes (StableLM rope) * cmake : enable separable compilation for CUDA ggml-ci * ggml-cuda : remove device side dequantize * Revert "cmake : enable separable compilation for CUDA" This reverts commit `09e35d04b1`. * cuda : remove assert for rope * tests : add test-backend-ops * ggml : fix bug in ggml_concat * ggml : restore `ggml_get_n_tasks()` logic in `ggml_graph_plan()` * ci : try to fix macOS * ggml-backend : remove backend self-registration * ci : disable Metal for macOS cmake build ggml-ci * metal : fix "supports family" call * metal : fix assert * metal : print resource path ggml-ci --------- Co-authored-by: slaren <slarengh@gmail.com>	2023-12-07 22:26:54 +02:00
Georgi Gerganov	bcc0eb4591	llama : per-layer KV cache + quantum K cache (#4309 ) * per-layer KV * remove unnecessary copies * less code duplication, offload k and v separately * llama : offload KV cache per-layer * llama : offload K shift tensors * llama : offload for rest of the model arches * llama : enable offload debug temporarily * llama : keep the KV related layers on the device * llama : remove mirrors, perform Device -> Host when partial offload * common : add command-line arg to disable KV cache offloading * llama : update session save/load * llama : support quantum K cache (#4312) * llama : support quantum K cache (wip) * metal : add F32 -> Q8_0 copy kernel * cuda : add F32 -> Q8_0 copy kernel ggml-ci * cuda : use mmv kernel for quantum cache ops * llama : pass KV cache type through API * llama : fix build ggml-ci * metal : add F32 -> Q4_0 copy kernel * metal : add F32 -> Q4_1 copy kernel * cuda : wip * cuda : add F32 -> Q4_0 and F32 -> Q4_1 copy kernels * llama-bench : support type_k/type_v * metal : use mm kernel only for quantum KV cache * cuda : add comment * llama : remove memory_f16 and kv_f16 flags --------- Co-authored-by: slaren <slarengh@gmail.com> * readme : add API change notice --------- Co-authored-by: slaren <slarengh@gmail.com>	2023-12-07 13:03:17 +02:00
Hongyu Ouyang	81bc9214a3	train : fix #4227 (double free in examples/train-text-from-scratch/train-text-from-scratch.cpp) (#4351 ) On commit b1108 (`44c117f4`) xaedes added ggml_allocr * alloc = NULL; ... (many lines in between) if (alloc) { ggml_allocr_free(alloc); } Which is correct, but it's easy to lose context after many lines in between. On commit b1287 (`0e76a899`) xaedes made a big change. From here on, alloc is freed eagerly. alloc = ggml_allocr_new(...) ... (short lines of code) ggml_allocr_free(alloc) This happens a few times, but alloc is never set to NULL, and many lines below, we still have if (alloc) { ggml_allocr_free(alloc); } which causes a double-free.	2023-12-07 12:25:22 +02:00
Georgi Gerganov	05cd6e5036	server : recognize cache_prompt parameter in OAI API (#4347 )	2023-12-06 20:21:59 +02:00
Georgi Gerganov	caa9249217	common : fix compile warning	2023-12-06 10:41:03 +02:00
stduhpf	da5eaef1f3	speculative : support `--color` (#4343 ) * speculative: add some colors * minor : add braces --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-12-06 10:08:17 +02:00
Marcus Dunn	5f6e0c0dff	grammar : pre-computed pieces + reserve mem + less string copies (#4330 ) * reserve space for codepoints * improvement for the appended 0 * used precomputed token text for grammar sample * reserve canidates_decoded * reserve canidates_grammar * remove candidates_decoded * Revert "remove candidates_decoded" This reverts commit `3773328080`. * changed decode_utf8 to take src by ref	2023-12-05 22:55:12 +02:00
Kerfuffle	5aa365d88f	llama : allow overriding GGUF metadata when loading model (#4092 ) * feat: Allow overriding GGUF metadata when loading model * Fix the one time GCC is stricter than clang about something * Step1 * Refactor... basically everything! * Nuke obsolete GetArrayLen struct * simplify std::string specialization * Various cleanups Add informational output when overrides are applied Warn user when an override with the wrong type is specified * Fix broken logic for parsing bool KV overrides Fix issue where overrides didn't apply when key missing in GGUF metadata Resolve merge changes * llama : rearrange model params * Update new GET_KEY call Add note that metadata KV overrides aren't reflected in initial metadata KV info dump --------- Co-authored-by: cebtenzzre <cebtenzzre@gmail.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-12-05 19:19:18 +02:00
MaggotHATE	52c8bc3cf3	sampling : custom samplers order (#4285 ) * Samplers sequence order w parameter * Cleaned commented code * Fixed formatting * Rewrote with unordered_map * Revert and rewrite, too many problems and safeguards would be needed * Fixed code style * Code style fixes according to review * More readable samplers input string, fixed help * Style fix in sampler_queue * Formatting fixes * Fixing whitespaces	2023-12-05 12:05:51 +02:00
kchro3	e4b76bbe31	swift : revert compiler checks for swift package (#4332 )	2023-12-05 09:29:46 +02:00
Daniel Bevenius	23b5e12eb5	simple : update error message for KV cache check (#4324 ) This commit updates the error message that is printed when the KV cache is not big enough to hold all the prompt and generated tokens. Specifically it removes the reference to n_parallel and replaces it with n_len. Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>	2023-12-04 18:04:21 +02:00
Miwa / Ensan	d208995c6d	swift : fix concatenation method to avoid invalid UTF8 stringfication (#4325 )	2023-12-04 18:03:49 +02:00
Miwa / Ensan	5c9f90cba1	swift : fix prompt tokenization logic (#4321 )	2023-12-04 15:43:45 +02:00
Ikko Eltociear Ashimine	4fa44e84ad	grammar-parser : fix typo (#4318 ) preceeding -> preceding	2023-12-04 09:57:35 +02:00
Georgi Gerganov	fbbc42827b	ggml : reuse ggml_get_n_tasks() in ggml_graph_plan() (#4308 ) * ggml : fix soft max out-of-bounds access ggml-ci * ggml : reuse ggml_get_n_tasks() in ggml_graph_plan() ggml-ci	2023-12-03 15:56:35 +02:00
Georgi Gerganov	adf3de4f69	ggml : fix soft max out-of-bounds access (#4307 ) ggml-ci	2023-12-03 15:56:22 +02:00
Ed Lee	33e171d1e9	server : fix OpenAI API `stop` field to be optional (#4299 ) (cherry picked from commit Mozilla-Ocho/llamafile@e8c92bcb84)	2023-12-03 11:10:43 +02:00
Rickard Edén	6949b50df5	py : add grammar to oai like api (#4294 )	2023-12-03 11:03:25 +02:00
Georgi Gerganov	d7b800b8bc	llama : pad KV cache size (#4280 ) * llama : pad KV cache size to 32 * metal : try to improve batched decoding	2023-12-03 10:58:16 +02:00