metal : remove unused n_buffers and buffers (#5129 )

gguf : fix "general.alignment" type in gguf_reader.py (#5136 )
readme : update hot topics
2026-05-07 17:44:09 +00:00 · 2024-01-26 14:16:07 +02:00 · 2024-01-26 11:10:28 +02:00 · 2024-01-26 10:52:33 +02:00 · 2024-01-26 09:14:39 +02:00 · 2024-01-25 22:14:32 +02:00
45 changed files with 2863 additions and 1142 deletions
--- a/.devops/main-intel.Dockerfile
+++ b/.devops/main-intel.Dockerfile
@@ -0,0 +1,26 @@
+ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
+ARG UBUNTU_VERSION=22.04
+
+FROM intel/hpckit:$ONEAPI_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y git
+
+WORKDIR /app
+
+COPY . .
+
+# for some reason, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" gives worse performance
+RUN mkdir build && \
+    cd build && \
+    cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
+    cmake --build . --config Release --target main server
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+COPY --from=build /app/build/bin/main /main
+COPY --from=build /app/build/bin/server /server
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/main" ]
--- a/.devops/nix/nixpkgs-instances.nix
+++ b/.devops/nix/nixpkgs-instances.nix
@@ -7,6 +7,18 @@
    { system, ... }:
    {
      _module.args = {
+        # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
+        # again, the below creates several nixpkgs instances which the
+        # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
+        #
+        # This is currently "slow" and "expensive", on a certain scale.
+        # This also isn't "right" in that this hinders dependency injection at
+        # the level of flake inputs. This might get removed in the foreseeable
+        # future.
+        #
+        # Note that you can use these expressions without flakes
+        # (`pkgs.callPackage ./.devops/nix/scope.nix { }` is the entry point).
+
        pkgsCuda = import inputs.nixpkgs {
          inherit system;
          # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -73,6 +73,7 @@ let
    ps: [
      ps.numpy
      ps.sentencepiece
+      ps.tiktoken
      ps.torchWithoutCuda
      ps.transformers
    ]
@@ -114,14 +115,22 @@ effectiveStdenv.mkDerivation (
    pname = "llama-cpp${pnameSuffix}";
    version = llamaVersion;

+    # Note: none of the files discarded here are visible in the sandbox or
+    # affect the output hash. This also means they can be modified without
+    # triggering a rebuild.
    src = lib.cleanSourceWith {
      filter =
        name: type:
-        !(builtins.any (_: _) [
+        let
+          noneOf = builtins.all (x: !x);
+          baseName = baseNameOf name;
+        in
+        noneOf [
          (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
-          (name == "README.md") # Ignore *.md changes whe computing outPaths
-          (lib.hasPrefix "." name) # Skip hidden files and directories
-        ]);
+          (lib.hasSuffix ".md" name) # Ignore *.md changes when computing outPaths
+          (lib.hasPrefix "." baseName) # Skip hidden files and directories
+          (baseName == "flake.lock")
+        ];
      src = lib.cleanSource ../../.;
    };

@@ -159,7 +168,7 @@ effectiveStdenv.mkDerivation (

    cmakeFlags =
      [
-        (cmakeBool "LLAMA_NATIVE" true)
+        (cmakeBool "LLAMA_NATIVE" false)
        (cmakeBool "LLAMA_BUILD_SERVER" true)
        (cmakeBool "BUILD_SHARED_LIBS" true)
        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
@@ -216,6 +225,9 @@ effectiveStdenv.mkDerivation (
        description = "contains numpy and sentencepiece";
        buildInputs = [ llama-python ];
        inputsFrom = [ finalAttrs.finalPackage ];
+        shellHook = ''
+          addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
+        '';
      };

      shell-extra = mkShell {
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@@ -4,6 +4,10 @@
  llamaVersion ? "0.0.0",
 }:

+# We're using `makeScope` instead of just writing out an attrset
+# because it allows users to apply overlays later using `overrideScope'`.
+# Cf. https://noogle.dev/f/lib/makeScope
+
 lib.makeScope newScope (
  self: {
    inherit llamaVersion;
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -295,7 +295,7 @@ jobs:
      OPENBLAS_VERSION: 0.3.23
      OPENCL_VERSION: 2023.04.17
      CLBLAST_VERSION: 1.6.0
-      SDE_VERSION: 9.21.1-2023-04-24
+      SDE_VERSION: 9.33.0-2024-01-07

    strategy:
      matrix:
@@ -400,7 +400,7 @@ jobs:
        id: cmake_test_sde
        if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
        run: |
-          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
+          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
          # for some weird reason windows tar doesn't like sde tar.xz
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -35,6 +35,7 @@ jobs:
          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v3
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -2,13 +2,20 @@ name: Nix aarch64 builds

 on:
  workflow_dispatch: # allows manual triggering
+  schedule:
+    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
+    # 1.5h instead of minutes with the cold cache).
+    #
+    # randint(0, 59), randint(0, 23)
+    - cron: '26 12 * * *'
+  # But also rebuild if we touched any of the Nix expressions:
  push:
    branches:
      - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
+    paths: ['**/*.nix', 'flake.lock']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
+    paths: ['**/*.nix', 'flake.lock']

 jobs:
  nix-build-aarch64:
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -5,10 +5,8 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']

 jobs:
  nix-eval:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,6 +47,7 @@ option(BUILD_SHARED_LIBS                "build shared libraries"
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
 option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
 option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)
+option(LLAMA_CCACHE                     "llama: use ccache if available"                        ON)

 # debug
 option(LLAMA_ALL_WARNINGS               "llama: enable all compiler warnings"                   ON)
@@ -107,6 +108,13 @@ option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STA
 option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)

+
+# add perf arguments
+option(LLAMA_PERF                            "llama: enable perf"                               OFF)
+if (LLAMA_PERF)
+    add_definitions(-DGGML_PERF)
+endif()
+
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)

@@ -470,6 +478,11 @@ function(get_flags CCID CCVER)
        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
            set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
        endif()
+    elseif (CCID MATCHES "Intel")
+        # enable max optimization level when using Intel compiler
+        set(C_FLAGS   -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
+        set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
+        add_link_options(-fuse-ld=lld -static-intel)
    endif()

    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
@@ -561,6 +574,17 @@ if (LLAMA_LTO)
    endif()
 endif()

+if (LLAMA_CCACHE)
+    find_program(LLAMA_CCACHE_FOUND ccache)
+    if (LLAMA_CCACHE_FOUND)
+        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
+        set(ENV{CCACHE_SLOPPINESS} time_macros)
+        message(STATUS "Using ccache")
+    else()
+        message(STATUS "Warning: ccache not found - consider installing it or use LLAMA_CCACHE=OFF")
+    endif ()
+endif()
+
 # this version of Apple ld64 is buggy
 execute_process(
    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
--- a/README.md
+++ b/README.md
@@ -10,11 +10,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 ### Hot topics

+- ⚠️ Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
 - New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
 - Collecting Apple Silicon performance stats:
  - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
  - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216

 ----
@@ -112,6 +112,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
 - [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
 - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
+- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)


 **Bindings:**
@@ -128,6 +129,7 @@ as the main playground for developing new features for the [ggml](https://github
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
+- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)

 **UI:**

--- a/common/common.cpp
+++ b/common/common.cpp
@@ -203,6 +203,23 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            params.prompt_cache_all = true;
        } else if (arg == "--prompt-cache-ro") {
            params.prompt_cache_ro = true;
+        } else if (arg == "-bf" || arg == "--binary-file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i], std::ios::binary);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            // store the external file name in params
+            params.prompt_file = argv[i];
+            std::ostringstream ss;
+            ss << file.rdbuf();
+            params.prompt = ss.str();
+            fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
        } else if (arg == "-f" || arg == "--file") {
            if (++i >= argc) {
                invalid_param = true;
@@ -653,6 +670,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            if (params.logdir.back() != DIRECTORY_SEPARATOR) {
                params.logdir += DIRECTORY_SEPARATOR;
            }
+        } else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.logits_file = argv[i];
        } else if (arg == "--perplexity" || arg == "--all-logits") {
            params.logits_all = true;
        } else if (arg == "--ppl-stride") {
@@ -689,6 +712,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.winogrande_tasks = std::stoi(argv[i]);
+        } else if (arg == "--multiple-choice") {
+            params.multiple_choice = true;
+        } else if (arg == "--multiple-choice-tasks") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.multiple_choice_tasks = std::stoi(argv[i]);
+        } else if (arg == "--kl-divergence") {
+            params.kl_divergence = true;
        } else if (arg == "--ignore-eos") {
            params.ignore_eos = true;
        } else if (arg == "--no-penalize-nl") {
@@ -888,6 +921,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
    printf("  -f FNAME, --file FNAME\n");
    printf("                        prompt file to start generation.\n");
+    printf("  -bf FNAME, --binary-file FNAME\n");
+    printf("                        binary file containing multiple choice tasks.\n");
    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -936,6 +971,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
    printf("  --winogrande          compute Winogrande score over random tasks from datafile supplied with -f\n");
    printf("  --winogrande-tasks N  number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
+    printf("  --multiple-choice     compute multiple choice score over random tasks from datafile supplied with -f\n");
+    printf("  --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.multiple_choice_tasks);
+    printf("  --kl-divergence       computes KL-divergence to logits provided via --kl-divergence-base\n");
    printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
    printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
    printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
--- a/common/common.h
+++ b/common/common.h
@@ -91,6 +91,7 @@ struct gpt_params {
    std::string input_suffix      = "";  // string to suffix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string logdir            = "";  // directory in which to save YAML log files
+    std::string logits_file       = "";  // file for saving *all* logits

    std::vector<llama_model_kv_override> kv_overrides;

@@ -108,6 +109,11 @@ struct gpt_params {
    bool   winogrande      = false; // compute Winogrande score over random tasks from datafile supplied in prompt
    size_t winogrande_tasks= 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

+    bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+
+    bool   kl_divergence   = false; // compute KL-divergence
+
    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -129,6 +129,8 @@ static void sampler_queue(
    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));

    const float         temp              = params.temp;
+    const float         dynatemp_range    = params.dynatemp_range;
+    const float         dynatemp_exponent = params.dynatemp_exponent;
    const int32_t       top_k             = params.top_k <= 0 ? n_vocab : params.top_k;
    const float         top_p             = params.top_p;
    const float         min_p             = params.min_p;
@@ -143,7 +145,15 @@ static void sampler_queue(
            case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
            case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
            case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
-            case 't': llama_sample_temp     (ctx_main, &cur_p, temp); break;
+            case 't':
+                if (dynatemp_range > 0) {
+                    float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
+                    float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
+                    llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
+                } else {
+                    llama_sample_temp(ctx_main, &cur_p, temp);
+                }
+                break;
            default : break;
        }
    }
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -18,6 +18,8 @@ typedef struct llama_sampling_params {
    float       tfs_z                 = 1.00f;    // 1.0 = disabled
    float       typical_p             = 1.00f;    // 1.0 = disabled
    float       temp                  = 0.80f;    // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float       dynatemp_range        = 0.00f;    // 0.0 = disabled
+    float       dynatemp_exponent     = 1.00f;    // controls how entropy maps to temperature in dynamic temperature sampler
    int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float       penalty_repeat        = 1.10f;    // 1.0 = disabled
    float       penalty_freq          = 0.00f;    // 0.0 = disabled
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -10,7 +10,7 @@ import re
 import sys
 from enum import IntEnum
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional
+from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast

 import numpy as np
 import torch
@@ -289,6 +289,58 @@ class Model:
        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)

+    def _set_vocab_qwen(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[QwenModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) == 2
+            merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
+        added_vocab = tokenizer.special_tokens
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                pad_token = f"[PAD{i}]".encode("utf-8")
+                tokens.append(bytearray(pad_token))
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.CONTROL)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        # only add special tokens when they were not already loaded from config.json
+        if len(special_vocab.special_token_ids) == 0:
+            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
+            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
    def _set_vocab_sentencepiece(self):
        from sentencepiece import SentencePieceProcessor

@@ -487,7 +539,8 @@ class MPTModel(Model):
            # map tensor names
            if "scales" in name:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
-                new_name = new_name.replace("scales", "act.scales")
+                if new_name is not None:
+                    new_name = new_name.replace("scales", "act.scales")
            else:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
@@ -876,6 +929,13 @@ class PersimmonModel(Model):


 class StableLMModel(Model):
+    def set_vocab(self):
+        if (self.dir_model / "tokenizer.json").is_file():
+            self._set_vocab_gpt2()
+        else:
+            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
+            self._set_vocab_qwen()
+
    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]
@@ -904,7 +964,7 @@ class QwenModel(Model):
        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])

    @staticmethod
-    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
        parts = [bytes([b]) for b in token]
        while True:
            min_idx = None
@@ -921,52 +981,7 @@ class QwenModel(Model):
        return parts

    def set_vocab(self):
-        dir_model = self.dir_model
-        hparams = self.hparams
-        tokens: list[bytearray] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams["vocab_size"]
-        assert max(tokenizer.get_vocab().values()) < vocab_size
-
-        merges = []
-        vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
-        for token, rank in mergeable_ranks.items():
-            vocab[self.token_bytes_to_string(token)] = rank
-            if len(token) == 1:
-                continue
-            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
-            assert len(merged) == 2
-            merges.append(' '.join(map(self.token_bytes_to_string, merged)))
-
-        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.special_tokens
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode("utf-8")
-                tokens.append(bytearray(pad_token))
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.CONTROL)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.merges = merges
-        special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
-        special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
-        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
-        special_vocab.add_to_gguf(self.gguf_writer)
+        self._set_vocab_qwen()

    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Qwen")
@@ -1285,7 +1300,7 @@ def main() -> None:

    if args.awq_path:
        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
-        from awq.apply_awq import add_scale_weights
+        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
        tmp_model_path = args.model / "weighted_model"
        dir_model = tmp_model_path
        if tmp_model_path.is_dir():
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@@ -2,6 +2,7 @@
 from __future__ import annotations

 import argparse
+import os
 import struct
 import sys
 from enum import IntEnum
@@ -9,7 +10,6 @@ from pathlib import Path

 import numpy as np

-import os
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
@@ -371,15 +371,11 @@ def handle_metadata(cfg, hp):
        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
    else:
        raise ValueError('Unable to load metadata')
-    vocab = convert.load_vocab(
-        cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
-        cfg.vocabtype)
-    # FIXME: Respect cfg.vocab_dir?
-    svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
-                               load_merges = cfg.vocabtype == 'bpe',
-                               n_vocab = vocab.vocab_size)
+    vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
+    vocab_factory = convert.VocabFactory(vocab_path)
+    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
    convert.check_vocab_size(params, vocab)
-    return (params, vocab, svocab)
+    return params, vocab, special_vocab


 def handle_args():
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -5,17 +5,16 @@ import json
 import os
 import struct
 import sys
+from pathlib import Path
 from typing import Any, BinaryIO, Sequence

 import numpy as np
 import torch

-from pathlib import Path
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf

-
 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}


@@ -60,7 +59,14 @@ if __name__ == '__main__':
    input_model = os.path.join(sys.argv[1], "adapter_model.bin")
    output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

-    model = torch.load(input_model, map_location="cpu")
+    if os.path.exists(input_model):
+        model = torch.load(input_model, map_location="cpu")
+    else:
+        input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
+        # lazy import load_file only if lora is in safetensors format.
+        from safetensors.torch import load_file
+        model = load_file(input_model, device="cpu")
+
    arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"

    if arch_name not in gguf.MODEL_ARCH_NAMES.values():
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@@ -1,11 +1,13 @@
 #!/usr/bin/env python3
-import torch
-import os
-from pprint import pprint
-import sys
 import argparse
+import os
+import sys
 from pathlib import Path
+from pprint import pprint
+
+import torch
 from sentencepiece import SentencePieceProcessor
+
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
@@ -69,7 +71,7 @@ def main():
    persimmon_model = torch.load(args.ckpt_path)
    hparams = persimmon_model['args']
    pprint(hparams)
-    tensors = {}
+    tensors: dict[str, torch.Tensor] = {}
    _flatten_dict(persimmon_model['model'], tensors, None)

    arch = gguf.MODEL_ARCH.PERSIMMON
--- a/convert.py
+++ b/convert.py
@@ -17,58 +17,28 @@ import signal
 import struct
 import sys
 import time
-import warnings
 import zipfile
 from abc import ABCMeta, abstractmethod
-from argparse import ArgumentParser
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import (
-    IO,
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Iterable,
-    Literal,
-    Optional,
-    Tuple,
-    TypeVar,
-)
+from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar

 import numpy as np
 from sentencepiece import SentencePieceProcessor

-try:
-    from transformers import AutoTokenizer
-except ModuleNotFoundError as e:
-    warnings.warn(f"Could not import AutoTokenizer from transformers: {e}")
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+import gguf

-# If NO_LOCAL_GGUF is not set, try to import gguf from the local gguf-py directory
-if "NO_LOCAL_GGUF" not in os.environ:
-    # Use absolute path to the gguf-py directory
-    gguf_py_dir = str(Path(__file__).resolve().parent / "gguf-py")
-    print(gguf_py_dir)  # NOTE: Remove this once path is verified after changes are completed
-    if gguf_py_dir not in sys.path:
-        sys.path.insert(1, gguf_py_dir)
+if TYPE_CHECKING:
+    from typing import TypeAlias

-# Import gguf module
-try:
-    import gguf
-except ModuleNotFoundError as e:
-    print(f"Could not import gguf: {e}")
-    sys.exit(1)
-
-if TYPE_CHECKING:  # NOTE: This isn't necessary.
-    from typing import TypeAlias  # This can technically be omitted.
-
-if hasattr(faulthandler, "register") and hasattr(signal, "SIGUSR1"):
+if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
    faulthandler.register(signal.SIGUSR1)

-# NOTE: n-dimensional arrays should be directly referenced
-NDArray: TypeAlias = "np.ndarray[Any, Any]"
+NDArray: TypeAlias = 'np.ndarray[Any, Any]'

-# Why is this here? LLAMA and GPT are technically the only compatible ARCHs.
 ARCH = gguf.MODEL_ARCH.LLAMA

 DEFAULT_CONCURRENCY = 8
@@ -78,7 +48,6 @@ DEFAULT_CONCURRENCY = 8
 #


-# TODO: Clean up and refactor data types
@dataclass(frozen=True)
 class DataType:
    name: str
@@ -183,85 +152,65 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {

@dataclass
 class Params:
-    n_vocab: int
-    n_embd: int
-    n_layer: int
-    n_ctx: int
-    n_ff: int
-    n_head: int
-    n_head_kv: int
-    f_norm_eps: Optional[float] = None
-    n_experts: Optional[int] = None
-    n_experts_used: Optional[int] = None
+    n_vocab:        int
+    n_embd:         int
+    n_layer:        int
+    n_ctx:          int
+    n_ff:           int
+    n_head:         int
+    n_head_kv:      int
+    n_experts:      int | None = None
+    n_experts_used: int | None = None
+    f_norm_eps:     float | None = None

-    rope_scaling_type: Optional[gguf.RopeScalingType] = None
-    f_rope_freq_base: Optional[float] = None
-    f_rope_scale: Optional[float] = None
-    n_orig_ctx: Optional[int] = None
-    rope_finetuned: Optional[bool] = None
+    rope_scaling_type: gguf.RopeScalingType | None = None
+    f_rope_freq_base: float | None = None
+    f_rope_scale: float | None = None
+    n_orig_ctx: int | None = None
+    rope_finetuned: bool | None = None

-    ftype: Optional[GGMLFileType] = None
+    ftype: GGMLFileType | None = None

    # path to the directory containing the model files
-    path_model: Optional[Path] = None
+    path_model: Path | None = None

    @staticmethod
-    def guessed(model: LazyModel) -> "Params":
+    def guessed(model: LazyModel) -> Params:
        # try transformer naming first
-        n_vocab, n_embd = (
-            model["model.embed_tokens.weight"].shape
-            if "model.embed_tokens.weight" in model
-            else model["tok_embeddings.weight"].shape
-        )
+        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape

        # try transformer naming first
        if "model.layers.0.self_attn.q_proj.weight" in model:
-            n_layer = next(
-                i
-                for i in itertools.count()
-                if f"model.layers.{i}.self_attn.q_proj.weight" not in model
-            )
-        elif (
-            "model.layers.0.self_attn.W_pack.weight" in model
-        ):  # next: try baichuan naming
-            n_layer = next(
-                i
-                for i in itertools.count()
-                if f"model.layers.{i}.self_attn.W_pack.weight" not in model
-            )
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
        else:
-            n_layer = next(
-                i
-                for i in itertools.count()
-                if f"layers.{i}.attention.wq.weight" not in model
-            )
+            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

        if n_layer < 1:
-            raise Exception(
-                "failed to guess 'n_layer'. This model is unknown or unsupported.\n"
-                "Suggestion: provide 'config.json' of the model in the same directory containing model files."
-            )
+            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")

-        n_head = n_embd // 128  # guessed
-        n_mult = 256  # guessed
+        n_head = n_embd // 128 # guessed
+        n_mult = 256           # guessed

        # TODO: verify this
        n_ff = int(2 * (4 * n_embd) / 3)
        n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)

        return Params(
-            n_vocab=n_vocab,
-            n_embd=n_embd,
-            n_layer=n_layer,
-            n_ctx=-1,
-            n_ff=n_ff,
-            n_head=n_head,
-            n_head_kv=n_head,
-            f_norm_eps=1e-5,
+            n_vocab    = n_vocab,
+            n_embd     = n_embd,
+            n_layer    = n_layer,
+            n_ctx      = -1,
+            n_ff       = n_ff,
+            n_head     = n_head,
+            n_head_kv  = n_head,
+            f_norm_eps = 1e-5,
        )

    @staticmethod
-    def load_transformers_config(model: LazyModel, config_path: Path) -> "Params":
+    def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
        config = json.load(open(config_path))

        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
@@ -274,22 +223,20 @@ class Params:
                rope_scaling_type = gguf.RopeScalingType.LINEAR
            elif typ == "yarn":
                rope_scaling_type = gguf.RopeScalingType.YARN
-                n_orig_ctx = rope_scaling["original_max_position_embeddings"]
-                rope_finetuned = rope_scaling["finetuned"]
+                n_orig_ctx = rope_scaling['original_max_position_embeddings']
+                rope_finetuned = rope_scaling['finetuned']
            else:
-                raise NotImplementedError(f"Unknown rope scaling type: {typ}")
+                raise NotImplementedError(f'Unknown rope scaling type: {typ}')

        if "max_sequence_length" in config:
            n_ctx = config["max_sequence_length"]
        elif "max_position_embeddings" in config:
            n_ctx = config["max_position_embeddings"]
        else:
-            raise Exception(
-                "failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
-                "Suggestion: provide 'config.json' of the model in the same directory containing model files."
-            )
+            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")

-        n_experts = None
+        n_experts      = None
        n_experts_used = None

        if "num_local_experts" in config:
@@ -297,30 +244,30 @@ class Params:
            n_experts_used = config["num_experts_per_tok"]

        return Params(
-            n_vocab=config["vocab_size"],
-            n_embd=config["hidden_size"],
-            n_layer=config["num_hidden_layers"],
-            n_ctx=n_ctx,
-            n_ff=config["intermediate_size"],
-            n_head=(n_head := config["num_attention_heads"]),
-            n_head_kv=config.get("num_key_value_heads", n_head),
-            n_experts=n_experts,
-            n_experts_used=n_experts_used,
-            f_norm_eps=config["rms_norm_eps"],
-            f_rope_freq_base=config.get("rope_theta"),
-            rope_scaling_type=rope_scaling_type,
-            f_rope_scale=f_rope_scale,
-            n_orig_ctx=n_orig_ctx,
-            rope_finetuned=rope_finetuned,
+            n_vocab           = config["vocab_size"],
+            n_embd            = config["hidden_size"],
+            n_layer           = config["num_hidden_layers"],
+            n_ctx             = n_ctx,
+            n_ff              = config["intermediate_size"],
+            n_head            = (n_head := config["num_attention_heads"]),
+            n_head_kv         = config.get("num_key_value_heads", n_head),
+            n_experts         = n_experts,
+            n_experts_used    = n_experts_used,
+            f_norm_eps        = config["rms_norm_eps"],
+            f_rope_freq_base  = config.get("rope_theta"),
+            rope_scaling_type = rope_scaling_type,
+            f_rope_scale      = f_rope_scale,
+            n_orig_ctx        = n_orig_ctx,
+            rope_finetuned    = rope_finetuned,
        )

    # LLaMA v2 70B params.json
    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
    @staticmethod
-    def load_torch_params(model: LazyModel, config_path: Path) -> "Params":
+    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
        config = json.load(open(config_path))

-        n_experts = None
+        n_experts      = None
        n_experts_used = None
        f_rope_freq_base = None

@@ -343,50 +290,50 @@ class Params:

        if config.get("moe"):
            n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
-            n_experts = config["moe"]["num_experts"]
+            n_experts      = config["moe"]["num_experts"]
            n_experts_used = config["moe"]["num_experts_per_tok"]
            f_rope_freq_base = 1e6

        return Params(
-            n_vocab=model["tok_embeddings.weight"].shape[0],
-            n_embd=config["dim"],
-            n_layer=config["n_layers"],
-            n_ctx=n_ctx,
-            n_ff=n_ff,
-            n_head=(n_head := config["n_heads"]),
-            n_head_kv=config.get("n_kv_heads", n_head),
-            n_experts=n_experts,
-            n_experts_used=n_experts_used,
-            f_norm_eps=config["norm_eps"],
-            f_rope_freq_base=config.get("rope_theta", f_rope_freq_base),
+            n_vocab          = model["tok_embeddings.weight"].shape[0],
+            n_embd           = config["dim"],
+            n_layer          = config["n_layers"],
+            n_ctx            = n_ctx,
+            n_ff             = n_ff,
+            n_head           = (n_head := config["n_heads"]),
+            n_head_kv        = config.get("n_kv_heads", n_head),
+            n_experts        = n_experts,
+            n_experts_used   = n_experts_used,
+            f_norm_eps       = config["norm_eps"],
+            f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
        )

    @staticmethod
-    def load(model_plus: ModelPlus) -> "Params":
-        hf_config_path = model_plus.paths[0].parent / "config.json"
+    def load(model_plus: ModelPlus) -> Params:
+        hf_config_path   = model_plus.paths[0].parent / "config.json"
        orig_config_path = model_plus.paths[0].parent / "params.json"

        if hf_config_path.exists():
-            params = Params.load_transformers_config(model_plus.model, hf_config_path)
+            params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
        elif orig_config_path.exists():
-            params = Params.load_torch_params(model_plus.model, orig_config_path)
-        elif model_plus.format != "none":
+            params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
+        elif model_plus.format != 'none':
            params = Params.guessed(model_plus.model)
        else:
-            raise ValueError("Cannot guess params when model format is none")
+            raise ValueError('Cannot guess params when model format is none')

        params.path_model = model_plus.paths[0].parent

        return params


-class BpeVocab:  # GPT
-    def __init__(
-        self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
-    ) -> None:
-        self.bpe_tokenizer = json.loads(
-            open(str(fname_tokenizer), encoding="utf-8").read()
-        )
+#
+# vocab
+#
+
+class BpeVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
+        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
        self.vocab = self.bpe_tokenizer["model"]["vocab"]
        added_tokens: dict[str, int]
        if fname_added_tokens is not None:
@@ -394,34 +341,31 @@ class BpeVocab:  # GPT
            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
        else:
            # Fall back to trying to find the added tokens in tokenizer.json
-            tokenizer_json_file = fname_tokenizer.parent / "tokenizer.json"
+            tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
            if not tokenizer_json_file.is_file():
                added_tokens = {}
            else:
                tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
                added_tokens = dict(
-                    (item["content"], item["id"])
-                    for item in tokenizer_json.get("added_tokens", [])
+                    (item['content'], item['id'])
+                    for item in tokenizer_json.get('added_tokens', [])
                    # Added tokens here can be duplicates of the main vocabulary.
-                    if item["content"] not in self.bpe_tokenizer
-                )
+                    if item['content'] not in self.bpe_tokenizer)

        vocab_size: int = len(self.vocab)
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
+        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids      = sorted(added_tokens.values())
        if expected_ids != actual_ids:
            expected_end_id = vocab_size + len(actual_ids) - 1
-            raise Exception(
-                f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}"
-            )
+            raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")

        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_dict = added_tokens
-        self.added_tokens_list = [text for (text, idx) in items]
+        self.added_tokens_dict    = added_tokens
+        self.added_tokens_list    = [text for (text, idx) in items]
        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens
+        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer      = fname_tokenizer
+        self.fname_added_tokens   = fname_added_tokens

    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
@@ -442,10 +386,8 @@ class BpeVocab:  # GPT
        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


-class SentencePieceVocab:  # LlaMa
-    def __init__(
-        self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
-    ) -> None:
+class SentencePieceVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
        added_tokens: dict[str, int]
        if fname_added_tokens is not None:
@@ -455,23 +397,19 @@ class SentencePieceVocab:  # LlaMa

        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()

-        new_tokens = {
-            id: piece for piece, id in added_tokens.items() if id >= vocab_size
-        }
+        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
-        actual_new_ids = sorted(new_tokens.keys())
+        actual_new_ids   = sorted(new_tokens.keys())

        if expected_new_ids != actual_new_ids:
-            raise ValueError(
-                f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}"
-            )
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

        # Token pieces that were added to the base vocabulary.
        self.added_tokens_dict = added_tokens
-        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
-        self.vocab_size_base = vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
+        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base    = vocab_size
+        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer    = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens

    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@@ -512,11 +450,15 @@ class SentencePieceVocab:  # LlaMa


 class HfVocab:
-    def __init__(
-        self,
-        fname_tokenizer: Path,
-        fname_added_tokens: Optional[Path] = None,
-    ) -> None:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
+        try:
+            from transformers import AutoTokenizer
+        except ImportError as e:
+            raise ImportError(
+                "To use HfVocab, please install the `transformers` package. "
+                "You can install it with `pip install transformers`."
+            ) from e
+
        print("fname_tokenizer:", fname_tokenizer)
        # Allow the tokenizer to default to slow or fast versions.
        # Explicitly set tokenizer to use local paths.
@@ -529,7 +471,7 @@ class HfVocab:
        # Initialize lists and dictionaries for added tokens
        self.added_tokens_list = []
        self.added_tokens_dict = dict()
-        self.added_tokens_ids = set()
+        self.added_tokens_ids  = set()

        # Process added tokens
        for tok, tokidx in sorted(
@@ -550,12 +492,12 @@ class HfVocab:

        # Set vocabulary sizes
        self.vocab_size_base = self.tokenizer.vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
+        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)

-        self.fname_tokenizer = fname_tokenizer
+        self.fname_tokenizer    = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens

-    def hf_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
+    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {
            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
        }
@@ -573,11 +515,9 @@ class HfVocab:
                token_id, self.special_ids  # Reuse already stored special IDs
            )

-    def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType:
+    def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
        # Determine token type based on whether it's a special token
-        return (
-            gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
-        )
+        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL

    def get_token_score(self, token_id: int) -> float:
        # Placeholder for actual logic to determine the token's score
@@ -589,7 +529,6 @@ class HfVocab:
            if text in self.specials:
                toktype = self.get_token_type(self.specials[text], self.special_ids)
                score = self.get_token_score(self.specials[text])
-
            else:
                toktype = gguf.TokenType.USER_DEFINED
                score = -1000.0
@@ -783,7 +722,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
    else:
        model = merge_sharded([mp.model for mp in models_plus])

-    return ModelPlus(model, paths, format, vocab)
+    return ModelPlus(model, paths, format, vocab)  # pytype: disable=wrong-arg-types


 def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@@ -871,17 +810,13 @@ class LazyUnpickler(pickle.Unpickler):
    CLASSES: dict[tuple[str, str], Any] = {
        # getattr used here as a workaround for mypy not being smart enough to determine
        # the staticmethods have a __func__ attribute.
-        ("torch._tensor", "_rebuild_from_type_v2"): getattr(
-            rebuild_from_type_v2, "__func__"
-        ),
-        ("torch._utils", "_rebuild_tensor_v2"): getattr(
-            lazy_rebuild_tensor_v2, "__func__"
-        ),
-        ("torch", "BFloat16Storage"): LazyStorageKind(DT_BF16),
-        ("torch", "HalfStorage"): LazyStorageKind(DT_F16),
-        ("torch", "FloatStorage"): LazyStorageKind(DT_F32),
-        ("torch", "IntStorage"): LazyStorageKind(DT_I32),
-        ("torch", "Tensor"): LazyTensor,
+        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
+        ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
+        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
+        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
+        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
+        ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
+        ('torch', 'Tensor'): LazyTensor,
    }

    def find_class(self, module: str, name: str) -> Any:
@@ -968,7 +903,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
        executor_class = ProcessPoolExecutor
    else:
        executor_class = ThreadPoolExecutor
-    with executor_class(max_workers = max_workers) as executor:
+    with executor_class(max_workers=max_workers) as executor:
        futures: list[concurrent.futures.Future[Out]] = []
        done = False
        for _ in range(concurrency):
@@ -1022,12 +957,8 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N


 class OutputFile:
-    def __init__(
-        self, fname_out: Path, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE
-    ) -> None:
-        self.gguf = gguf.GGUFWriter(
-            fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess
-        )
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

    def add_meta_arch(self, params: Params) -> None:
        name = "LLaMA"
@@ -1036,21 +967,16 @@ class OutputFile:
        if params.n_ctx == 4096:
            name = "LLaMA v2"
        elif params.path_model is not None:
-            name = str(params.path_model.parent).split("/")[-1]
+            name = str(params.path_model.parent).split('/')[-1]

-        self.gguf.add_name(name)
-        self.gguf.add_context_length(params.n_ctx)
-        self.gguf.add_embedding_length(params.n_embd)
-        self.gguf.add_block_count(params.n_layer)
-        self.gguf.add_feed_forward_length(params.n_ff)
+        self.gguf.add_name                (name)
+        self.gguf.add_context_length      (params.n_ctx)
+        self.gguf.add_embedding_length    (params.n_embd)
+        self.gguf.add_block_count         (params.n_layer)
+        self.gguf.add_feed_forward_length (params.n_ff)
        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
-        self.gguf.add_head_count(params.n_head)
-        self.gguf.add_head_count_kv(params.n_head_kv)
-
-        if params.f_norm_eps is None:
-            raise ValueError("f_norm_eps is None")
-
-        self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
+        self.gguf.add_head_count          (params.n_head)
+        self.gguf.add_head_count_kv       (params.n_head_kv)

        if params.n_experts:
            self.gguf.add_expert_count(params.n_experts)
@@ -1058,6 +984,11 @@ class OutputFile:
        if params.n_experts_used:
            self.gguf.add_expert_used_count(params.n_experts_used)

+        if params.f_norm_eps:
+            self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
+        else:
+            raise ValueError('f_norm_eps is None')
+
        if params.f_rope_freq_base is not None:
            self.gguf.add_rope_freq_base(params.f_rope_freq_base)

@@ -1089,7 +1020,7 @@ class OutputFile:

        return tokenizer_model

-    def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]:
+    def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
        tokens = []
        scores = []
        toktypes = []
@@ -1124,14 +1055,10 @@ class OutputFile:

    def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
        n_elements = int(np.prod(tensor.shape))
-        raw_dtype = getattr(tensor.data_type, "ggml_type", None)
-        data_type = (
-            getattr(tensor.data_type, "quantized_type", None) or tensor.data_type.dtype
-        )
+        raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
+        data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
        data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
-        self.gguf.add_tensor_info(
-            name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype
-        )
+        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)

    def write_meta(self) -> None:
        self.gguf.write_header_to_file()
@@ -1145,14 +1072,10 @@ class OutputFile:

    @staticmethod
    def write_vocab_only(
-        fname_out: Path,
-        params: Params,
-        vocab: Vocab,
-        svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
-        pad_vocab: bool = False,
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
    ) -> None:
-        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)

        of = OutputFile(fname_out, endianess=endianess)

@@ -1180,14 +1103,8 @@ class OutputFile:

    @staticmethod
    def write_all(
-        fname_out: Path,
-        ftype: GGMLFileType,
-        params: Params,
-        model: LazyModel,
-        vocab: Vocab,
-        svocab: gguf.SpecialVocab,
-        concurrency: int = DEFAULT_CONCURRENCY,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@@ -1207,26 +1124,19 @@ class OutputFile:
        of.write_tensor_info()

        # tensor data
-        ndarrays_inner = bounded_parallel_map(
-            OutputFile.do_item, model.items(), concurrency=concurrency
-        )
+        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
        if ftype == GGMLFileType.MostlyQ8_0:
            ndarrays = bounded_parallel_map(
-                OutputFile.maybe_do_quantize,
-                ndarrays_inner,
-                concurrency=concurrency,
-                max_workers=concurrency,
+                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
                use_processpool_executor=True,
            )
        else:
            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)

        start = time.time()
-        for i, ((name, lazy_tensor), ndarray) in enumerate(
-            zip(model.items(), ndarrays)
-        ):
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
            elapsed = time.time() - start
-            size = " x ".join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
            padi = len(str(len(model)))
            print(
                f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
@@ -1363,7 +1273,7 @@ def load_some_model(path: Path) -> ModelPlus:
 class VocabFactory:
    def __init__(self, path: Path):
        self.path = path
-        self.files = {
+        self.files: dict[str, Path | None] = {
            "tokenizer.model": None,
            "vocab.json": None,
            "tokenizer.json": None,
@@ -1380,24 +1290,18 @@ class VocabFactory:
                self.files[file] = parent_file_path
        print(f"Found vocab files: {self.files}")

-    def _select_file(self, vocabtype: Optional[str]) -> Path:
+    def _select_file(self, vocabtype: str | None) -> Path:
        if vocabtype in ["spm", "bpe"]:
            for file_key in self.files.keys():
-                if self.files[file_key]:
-                    return self.files[file_key]
+                if (file := self.files[file_key]) is not None:
+                    return file
            raise FileNotFoundError(f"{vocabtype} vocab not found.")
-        elif vocabtype == "hfft":
+        if vocabtype == "hfft":
            # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
            return self.path
-        else:
-            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+        raise ValueError(f"Unsupported vocabulary type {vocabtype}")

-    def _create_special_vocab(
-        self,
-        vocab: Vocab,
-        vocabtype: str,
-        model_parent_path: Path,
-    ) -> gguf.SpecialVocab:
+    def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
        load_merges = vocabtype == "bpe"
        n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
        return gguf.SpecialVocab(
@@ -1407,13 +1311,12 @@ class VocabFactory:
            n_vocab=n_vocab,
        )

-    def load_vocab(
-        self, vocabtype: str, model_parent_path: Path
-    ) -> Tuple[Vocab, gguf.SpecialVocab]:
+    def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
        path = self._select_file(vocabtype)
        print(f"Loading vocab file '{path}', type '{vocabtype}'")

        added_tokens_path = path.parent / "added_tokens.json"
+        vocab: Vocab
        if vocabtype == "bpe":
            vocab = BpeVocab(
                path, added_tokens_path if added_tokens_path.exists() else None
@@ -1428,6 +1331,7 @@ class VocabFactory:
            )
        else:
            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+        # FIXME: Respect --vocab-dir?
        special_vocab = self._create_special_vocab(
            vocab,
            vocabtype,
@@ -1436,18 +1340,17 @@ class VocabFactory:
        return vocab, special_vocab


-def default_output_file(model_paths: list[Path], file_type: GGMLFileType) -> Path:
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
    namestr = {
-        GGMLFileType.AllF32: "f32",
+        GGMLFileType.AllF32:    "f32",
        GGMLFileType.MostlyF16: "f16",
-        GGMLFileType.MostlyQ8_0: "q8_0",
+        GGMLFileType.MostlyQ8_0:"q8_0",
    }[file_type]
    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
    if ret in model_paths:
        sys.stderr.write(
            f"Error: Default output path ({ret}) would overwrite the input. "
-            "Please explicitly specify a path using --outfile.\n"
-        )
+            "Please explicitly specify a path using --outfile.\n")
        sys.exit(1)
    return ret

@@ -1457,111 +1360,34 @@ def do_dump_model(model_plus: ModelPlus) -> None:
    print(f"model_plus.format = {model_plus.format!r}")
    print(f"model_plus.vocab = {model_plus.vocab!r}")
    for name, lazy_tensor in model_plus.model.items():
-        print(
-            f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}"
-        )
+        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")


-def get_argument_parser() -> ArgumentParser:
+def main(args_in: list[str] | None = None) -> None:
    output_choices = ["f32", "f16"]
    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
        # We currently only support Q8_0 output on little endian systems.
        output_choices.append("q8_0")
+    vocab_types = ["spm", "bpe", "hfft"]
+    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
+    parser.add_argument("--awq-path",    type=Path,              help="Path to scale awq cache file", default=None)
+    parser.add_argument("--dump",        action="store_true",    help="don't convert, just show what's in the model")
+    parser.add_argument("--dump-single", action="store_true",    help="don't convert, just show what's in a single model file")
+    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
+    parser.add_argument("--outtype",     choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
+    parser.add_argument("--vocab-dir",   type=Path,              help="directory containing tokenizer.model, if separate from model file")
+    parser.add_argument("--vocab-type",  choices=vocab_types,    help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
+    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
+    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
+    parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
+    parser.add_argument("--big-endian",  action="store_true",    help="model is executed on big endian machine")
+    parser.add_argument("--pad-vocab",   action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides")

-    parser = argparse.ArgumentParser(
-        description="Convert a LLaMa model to a GGML compatible file"
-    )
-
-    parser.add_argument(
-        "model",
-        type=Path,
-        help="Directory containing the model file or the model file itself (*.pth, *.pt, *.bin)",
-    )
-
-    parser.add_argument(
-        "--awq-path",
-        type=Path,
-        help="Path to the Activation-aware Weight Quantization cache file",
-        default=None,
-    )
-
-    parser.add_argument(
-        "--dump",
-        action="store_true",
-        help="Display the model content without converting it",
-    )
-
-    parser.add_argument(
-        "--dump-single",
-        action="store_true",
-        help="Display the content of a single model file without conversion",
-    )
-
-    parser.add_argument(
-        "--vocab-only",
-        action="store_true",
-        help="Extract and output only the vocabulary",
-    )
-
-    parser.add_argument(
-        "--outtype",
-        choices=output_choices,
-        help="Output format - note: q8_0 may be very slow (default: f16 or f32 based on input)",
-    )
-
-    parser.add_argument(
-        "--vocab-dir",
-        type=Path,
-        help="Directory containing the tokenizer.model, if separate from the model file",
-    )
-
-    parser.add_argument(
-        "--vocab-type",
-        choices=["spm", "bpe", "hfft"],  # hfft: Hugging Face Fast Tokenizer
-        default="spm",
-        help="The vocabulary format used to define the tokenizer model (default: spm)",
-    )
-
-    parser.add_argument(
-        "--pad-vocab",
-        action="store_true",
-        help="Add padding tokens when the model's vocabulary size exceeds the tokenizer metadata",
-    )
-
-    parser.add_argument(
-        "--outfile",
-        type=Path,
-        help="Specify the path for the output file (default is based on input)",
-    )
-
-    parser.add_argument(
-        "--ctx", type=int, help="Model training context (default is based on input)"
-    )
-
-    parser.add_argument(
-        "--concurrency",
-        type=int,
-        help=f"Concurrency used for conversion (default: {DEFAULT_CONCURRENCY})",
-        default=DEFAULT_CONCURRENCY,
-    )
-
-    parser.add_argument(
-        "--big-endian",
-        action="store_true",
-        help="Indicate that the model is executed on a big-endian machine",
-    )
-
-    return parser
-
-
-def main(argv: Optional[list[str]] = None) -> None:
-    parser = get_argument_parser()
-    args = parser.parse_args(argv)
-
+    args = parser.parse_args(args_in)
    if args.awq_path:
-        sys.path.insert(1, str(Path(__file__).resolve().parent / "awq-py"))
-        from awq.apply_awq import add_scale_weights
-
+        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
+        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
        tmp_model_path = args.model / "weighted_model"
        if tmp_model_path.is_dir():
            print(f"{tmp_model_path} exists as a weighted model.")
@@ -1580,14 +1406,11 @@ def main(argv: Optional[list[str]] = None) -> None:
    if not args.vocab_only:
        model_plus = load_some_model(args.model)
    else:
-        model_plus = ModelPlus(
-            model={}, paths=[args.model / "dummy"], format="none", vocab=None
-        )
+        model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)

    if args.dump:
        do_dump_model(model_plus)
        return
-
    endianess = gguf.GGUFEndian.LITTLE
    if args.big_endian:
        endianess = gguf.GGUFEndian.BIG
@@ -1595,12 +1418,10 @@ def main(argv: Optional[list[str]] = None) -> None:
    params = Params.load(model_plus)
    if params.n_ctx == -1:
        if args.ctx is None:
-            raise Exception(
-                "The model doesn't have a context size, and you didn't specify one with --ctx\n"
-                "Please specify one with --ctx:\n"
-                " - LLaMA v1: --ctx 2048\n"
-                " - LLaMA v2: --ctx 4096\n"
-            )
+            raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
+                            "Please specify one with --ctx:\n"
+                            " - LLaMA v1: --ctx 2048\n"
+                            " - LLaMA v2: --ctx 4096\n")
        params.n_ctx = args.ctx

    if args.outtype:
@@ -1621,42 +1442,30 @@ def main(argv: Optional[list[str]] = None) -> None:
        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        outfile = args.outfile
-        OutputFile.write_vocab_only(
-            outfile,
-            params,
-            vocab,
-            special_vocab,
-            endianess=endianess,
-            pad_vocab=args.pad_vocab,
-        )
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
+                                    endianess=endianess, pad_vocab=args.pad_vocab)
        print(f"Wrote {outfile}")
        return

    if model_plus.vocab is not None and args.vocab_dir is None:
        vocab = model_plus.vocab

-    model = model_plus.model
-    model = convert_model_names(model, params)
-    ftype = pick_output_type(model, args.outtype)
-    model = convert_to_output_type(model, ftype)
-    outfile = args.outfile or default_output_file(model_plus.paths, ftype)
+    print(f"Vocab info: {vocab}")
+    print(f"Special vocab info: {special_vocab}")
+
+    model   = model_plus.model
+    model   = convert_model_names(model, params)
+    ftype   = pick_output_type(model, args.outtype)
+    model   = convert_to_output_type(model, ftype)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype)

    params.ftype = ftype
    print(f"Writing {outfile}, format {ftype}")

-    OutputFile.write_all(
-        outfile,
-        ftype,
-        params,
-        model,
-        vocab,
-        special_vocab,
-        concurrency=args.concurrency,
-        endianess=endianess,
-        pad_vocab=args.pad_vocab,
-    )
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
+                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
    print(f"Wrote {outfile}")


-if __name__ == "__main__":
-    main(sys.argv[1:])  # Exclude the first element (script name) from sys.argv
+if __name__ == '__main__':
+    main()
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -1800,6 +1800,8 @@ int main(int argc, char ** argv) {
    std::vector<size_t> train_samples_begin;
    std::vector<size_t> train_samples_size;
    printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
+    printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str());
+    printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false");
    tokenize_file(lctx,
            params.common.fn_train_data,
            params.common.sample_start,
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -26,6 +26,7 @@ struct StatParams {
    std::string ofile = "imatrix.dat";
    int         n_output_frequency = 10;
    int         verbosity = 1;
+    int         keep_every = 0;
    bool        collect_output_weight = false;
 };

@@ -42,6 +43,9 @@ private:
    int                                    m_last_call = 0;
    std::vector<float>                     m_src1_data;
    std::vector<int>                       m_ids; // the expert ids from ggml_mul_mat_id
+                                                  //
+    void save_imatrix(const char * file_name) const;
+    void keep_imatrix(int ncall) const;
 };

 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@@ -117,6 +121,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                if (m_last_call % m_params.n_output_frequency == 0) {
                    save_imatrix();
                }
+                if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
+                    keep_imatrix(m_last_call);
+                }
            }
        }
    } else {
@@ -143,6 +150,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
            if (m_last_call % m_params.n_output_frequency == 0) {
                save_imatrix();
            }
+            if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
+                keep_imatrix(m_last_call);
+            }
        }
    }

@@ -150,7 +160,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 }

 void IMatrixCollector::save_imatrix() const {
-    const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str();
+    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
+}
+
+void IMatrixCollector::keep_imatrix(int ncall) const {
+    // Snapshot name is "<output file>.at_<ncall>"; "imatrix.dat" stands in when no output file is configured.
+    const std::string base = m_params.ofile.empty() ? std::string("imatrix.dat") : m_params.ofile;
+    const std::string snapshot = base + ".at_" + std::to_string(ncall);
+
+    save_imatrix(snapshot.c_str());
+}
+
+void IMatrixCollector::save_imatrix(const char * fname) const {
    std::ofstream out(fname, std::ios::binary);
    int n_entries = m_stats.size();
    out.write((const char*)&n_entries, sizeof(n_entries));
@@ -248,7 +269,7 @@ static void process_logits(
    }
 }

-static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
+static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) {

    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
    const int n_ctx = llama_n_ctx(ctx);
@@ -269,10 +290,12 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
    }

    std::vector<float> logit_history;
-    logit_history.resize(tokens.size());
-
    std::vector<float> prob_history;
-    prob_history.resize(tokens.size());
+
+    if (compute_ppl) {
+        logit_history.resize(tokens.size());
+        prob_history.resize(tokens.size());
+    }

    const int n_chunk_max = tokens.size() / n_ctx;

@@ -288,12 +311,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

+    const int num_batches = (n_ctx + n_batch - 1) / n_batch;
+
+    std::vector<float> logits;
+    if (compute_ppl && num_batches > 1) {
+        logits.reserve((size_t)n_ctx * n_vocab);
+    }
+
    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * n_ctx;
        const int end   = start + n_ctx;

-        const int num_batches = (n_ctx + n_batch - 1) / n_batch;
-
        std::vector<float> logits;

        const auto t_start = std::chrono::high_resolution_clock::now();
@@ -321,8 +349,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
            // restore the original token in case it was set to BOS
            tokens[batch_start] = token_org;

-            const auto * batch_logits = llama_get_logits(ctx);
-            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+            if (compute_ppl && num_batches > 1) {
+                const auto * batch_logits = llama_get_logits(ctx);
+                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+            }
        }

        const auto t_end = std::chrono::high_resolution_clock::now();
@@ -338,25 +368,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }

-        const int first = n_ctx/2;
-        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                       workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
-        count += n_ctx - first - 1;
+        if (compute_ppl) {
+            const int first = n_ctx/2;
+            const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
+            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
+            count += n_ctx - first - 1;

-        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
-        fflush(stdout);
+            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            fflush(stdout);
+
+            logits.clear();
+        }
    }
    printf("\n");

-    nll2 /= count;
-    nll /= count;
-    const double ppl = exp(nll);
-    nll2 -= nll * nll;
-    if (nll2 > 0) {
-        nll2 = sqrt(nll2/(count-1));
-        printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
-    } else {
-        printf("Unexpected negative standard deviation of log(prob)\n");
+    if (compute_ppl) {
+        nll2 /= count;
+        nll /= count;
+        const double ppl = exp(nll);
+        nll2 -= nll * nll;
+        if (nll2 > 0) {
+            nll2 = sqrt(nll2/(count-1));
+            printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+        } else {
+            printf("Unexpected negative standard deviation of log(prob)\n");
+        }
    }

    return true;
@@ -365,6 +402,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 int main(int argc, char ** argv) {

    StatParams sparams;
+    bool compute_ppl = true;
    std::vector<char*> args;
    args.push_back(argv[0]);
    int iarg = 1;
@@ -381,12 +419,21 @@ int main(int argc, char ** argv) {
        }
        else if (arg == "--verbosity") {
            sparams.verbosity = std::stoi(argv[++iarg]);
+        } else if (arg == "--no-ppl") {
+            compute_ppl = false;
+        } else if (arg == "--keep-imatrix") {
+            sparams.keep_every = std::stoi(argv[++iarg]);
        } else {
            args.push_back(argv[iarg]);
        }
    }
    if (iarg < argc) {
-        args.push_back(argv[iarg]);
+        std::string arg{argv[iarg]};
+        if (arg == "--no-ppl") {
+            compute_ppl = false;
+        } else {
+            args.push_back(argv[iarg]);
+        }
    }

    gpt_params params;
@@ -448,7 +495,7 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

-    bool OK = compute_imatrix(ctx, params);
+    bool OK = compute_imatrix(ctx, params, compute_ppl);
    if (!OK) {
        return 1;
    }
--- a/examples/llama.android/app/build.gradle.kts
+++ b/examples/llama.android/app/build.gradle.kts
@@ -30,6 +30,7 @@ android {
        }
        externalNativeBuild {
            cmake {
+                arguments += "-DCMAKE_BUILD_TYPE=Release"
                cppFlags += listOf()
                arguments += listOf()
            }
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -6,7 +6,7 @@
 " Similarly, you could add an insert mode keybind with
 " inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
 "
-" g:llama_api_url and g:llama_overrides can be configured in your .vimrc
+" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
 " let g:llama_api_url = "192.168.1.10:8080"
 " llama_overrides can also be set through buffer/window scopes. For instance
 " autocmd filetype python let b:llama_overrides = {"temp": 0.2}
@@ -82,6 +82,9 @@ func llama#doLlamaGen()
   endif
   let l:querydata.prompt = join(l:buflines, "\n")
   let l:curlcommand = copy(s:curlcommand)
+   if exists("g:llama_api_key")
+       call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
+   endif
   let l:curlcommand[2] = json_encode(l:querydata)
   let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
 endfunction
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -0,0 +1,131 @@
+# MobileVLM
+
+Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
+
+For more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)
+
+The implementation is based on llava, and is compatible with both llava and MobileVLM. The usage is basically the same as for llava.
+
+## Usage
+Build with cmake or run `make llava-cli` to build it.
+
+After building, run: `./llava-cli` to see the usage. For example:
+
+```sh
+./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
+    --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
+    --image path/to/an/image.jpg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
+```
+
+## Model conversion
+
+1. Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
+
+```sh
+git clone https://huggingface.co/mtgv/MobileVLM-1.7B
+
+git clone https://huggingface.co/openai/clip-vit-large-patch14-336
+```
+
+2. Use `llava-surgery.py` to split the LLaVA model into LLaMA and multimodal projector constituents:
+
+```sh
+python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
+```
+
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
+
+```sh
+python ./examples/llava/convert-image-encoder-to-gguf.py \
+    -m path/to/clip-vit-large-patch14-336 \
+    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
+    --output-dir path/to/MobileVLM-1.7B \
+    --projector-type ldp
+```
+
+4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+
+```sh
+python ./convert.py path/to/MobileVLM-1.7B
+```
+
+5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
+```sh
+./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+```
+
+Now both the LLaMA part and the image encoder are in the `MobileVLM-1.7B` directory.
+
+## Android compile and run
+### compile
+refer to `examples/llava/android/build_64.sh`
+```sh
+mkdir examples/llava/android/build_64
+cd examples/llava/android/build_64
+../build_64.sh
+```
+### run on Android
+Refer to `examples/llava/android/adb_run.sh`, and modify the resources' `name` and `path` to match your setup.
+
+## Some results on Android with the `Snapdragon 888` chip
+### case 1
+**input**
+```sh
+/data/local/tmp/llava-cli \
+    -m /data/local/tmp/ggml-model-q4_k.gguf \
+    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
+    -t 4 \
+    --image /data/local/tmp/demo.jpg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
+```
+**output**
+```sh
+encode_image_with_clip: image encoded in 21148.71 ms by CLIP (  146.87 ms per image patch)
+ Susan Wise Bauer
+llama_print_timings:        load time =   23574.72 ms
+llama_print_timings:      sample time =       1.24 ms /     6 runs   (    0.21 ms per token,  4850.44 tokens per second)
+llama_print_timings: prompt eval time =   12460.15 ms /   246 tokens (   50.65 ms per token,    19.74 tokens per second)
+llama_print_timings:        eval time =     424.86 ms /     6 runs   (   70.81 ms per token,    14.12 tokens per second)
+llama_print_timings:       total time =   34731.93 ms
+```
+### case 2
+**input**
+```sh
+/data/local/tmp/llava-cli \
+    -m /data/local/tmp/ggml-model-q4_k.gguf \
+    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
+    -t 4 \
+    --image /data/local/tmp/cat.jpeg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
+```
+
+**output**
+```sh
+encode_image_with_clip: image encoded in 21149.51 ms by CLIP (  146.87 ms per image patch)
+ The image depicts a cat sitting in the grass near some tall green plants.
+llama_print_timings:        load time =   23257.32 ms
+llama_print_timings:      sample time =       5.25 ms /    18 runs   (    0.29 ms per token,  3430.53 tokens per second)
+llama_print_timings: prompt eval time =   11900.73 ms /   232 tokens (   51.30 ms per token,    19.49 tokens per second)
+llama_print_timings:        eval time =    1279.03 ms /    18 runs   (   71.06 ms per token,    14.07 tokens per second)
+llama_print_timings:       total time =   34570.79 ms
+```
+
+## Minor shortcomings
+The `n_patch` of the output in `ldp` is 1/4 of the input. To implement this quickly, we uniformly scaled the `clip_n_patches` function down to a quarter. As a consequence, when measuring time consumption, the calculated time will be 4 times larger than the real cost.
+
+## TODO
+
+- [ ] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
+- [ ] Optimize LDP projector performance
+
+      - Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`;
+      - Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc.
+- [ ] run MobileVLM on `Jetson Orin`
+- [ ] Support more model variants, such as `MobileVLM-3B`.
+
+
+## contributor
+```sh
+zhangjidong05, yangyang260, huyiming03, chenxiaotao03
+```
--- a/examples/llava/android/adb_run.sh
+++ b/examples/llava/android/adb_run.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed"  # host directory holding the two .gguf files; only used by the commented-out pushes in android_run
+projector_name="mmproj-model-f16.gguf"  # file name passed to --mmproj on the device
+llama_name="ggml-model-q4_k.gguf"  # file name passed to -m on the device
+img_dir="/Users/cxt/model/llm"  # host directory the test image is pushed from
+img_name="demo.jpg"
+prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
+# img_name="cat.jpeg"
+# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
+
+program_dir="build_64/bin"  # host directory the binary is pushed from (relative to the current directory)
+binName="llava-cli"
+n_threads=4  # value passed to -t
+
+
+deviceDir="/data/local/tmp"  # working directory on the device
+saveDir="output"  # host directory the result log is pulled into; created below if missing
+if [ ! -d ${saveDir} ]; then
+    mkdir ${saveDir}
+fi
+
+
+function android_run() {  # push the image and binary, run llava-cli on the device, then pull the log into ${saveDir}
+    # # copy resource into device
+    # adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
+    # adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
+    adb push "${img_dir}/${img_name}" "${deviceDir}/${img_name}"
+    # copy program into device
+    adb push "${program_dir}/${binName}" "${deviceDir}/${binName}"
+    adb shell "chmod 0777 ${deviceDir}/${binName}"
+
+    # run: first write the command line to the log (>), then append the program's stdout and stderr (>> ... 2>&1)
+    adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \
+                                                 -m ${deviceDir}/${llama_name} \
+                                                 --mmproj ${deviceDir}/${projector_name} \
+                                                 -t ${n_threads} \
+                                                 --image ${deviceDir}/${img_name} \
+                                                 -p \"${prompt}\" \
+                                                 > ${deviceDir}/${llama_name}_${projector_name}_${n_threads}_${img_name}.txt"
+    adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \
+                                                 -m ${deviceDir}/${llama_name} \
+                                                 --mmproj ${deviceDir}/${projector_name} \
+                                                 -t ${n_threads} \
+                                                 --image ${deviceDir}/${img_name} \
+                                                 -p \"${prompt}\" \
+                                                 >> ${deviceDir}/${llama_name}_${projector_name}_${n_threads}_${img_name}.txt 2>&1"
+    adb pull "${deviceDir}/${llama_name}_${projector_name}_${n_threads}_${img_name}.txt" "${saveDir}"
+}
+
+android_run
+
+echo "android_run is Done!"
--- a/examples/llava/android/build_64.sh
+++ b/examples/llava/android/build_64.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Configure and build for arm64-v8a Android; the cmake source directory is four levels up. $1 is passed on to cmake.
+set -e  # do not run make when the cmake configure step fails
+cmake ../../../../ \
+-DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" \
+-DCMAKE_BUILD_TYPE=Release -DANDROID_ABI="arm64-v8a" \
+-DANDROID_PLATFORM=android-23 $1
+make -j4
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2,17 +2,6 @@
 // so there might be still unnecessary artifacts hanging around
 // I'll gradually clean and extend it

-#include <cassert>
-#include <cmath>
-#include <cstdlib>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <map>
-#include <regex>
-#include <stdexcept>
-#include <vector>
-
 #include "clip.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -29,6 +18,19 @@
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"

+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <regex>
+#include <stdexcept>
+#include <vector>
+#include <sstream>
+#include <cinttypes>
+
 static std::string format(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
@@ -67,6 +69,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_PATCH_SIZE "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN "clip.vision.image_mean"
 #define KEY_IMAGE_STD "clip.vision.image_std"
+#define KEY_PROJ_TYPE "clip.projector_type"

 //
 // tensor name constants
@@ -89,6 +92,21 @@ static std::string format(const char * fmt, ...) {
 #define TN_TEXT_PROJ "text_projection.weight"
 #define TN_VIS_PROJ "visual_projection.weight"
 #define TN_LLAVA_PROJ "mm.%d.%s"
+#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
+#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
+
+
+enum projector_type { // projector kinds this file distinguishes
+    PROJECTOR_TYPE_MLP,
+    PROJECTOR_TYPE_LDP,
+    PROJECTOR_TYPE_UNKNOWN, // has no entry in PROJECTOR_TYPE_NAMES below
+};
+
+static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = { // enum value -> textual name
+    { PROJECTOR_TYPE_MLP,           "mlp"     },
+    { PROJECTOR_TYPE_LDP,          "ldp"    },
+};
+

 //
 // utilities to get data from a gguf file
@@ -129,6 +147,91 @@ static std::string get_ftype(int ftype) {
    return ggml_type_name(static_cast<ggml_type>(ftype));
 }

+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { // render element i of a typed array as text
+    switch (type) { // each case reinterprets `data` as an array of the matching C type and indexes it with i
+        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
+        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
+        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
+        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
+        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
+        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
+        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
+        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
+        case GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
+        case GGUF_TYPE_FLOAT64: return std::to_string(((const double   *)data)[i]);
+        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
+        default:                return format("unknown type %d", type); // any type not listed above
+    }
+}
+
+
+// Replace every non-overlapping occurrence of `search` in `s` with `replace`, in place; an empty `search` leaves `s` unchanged.
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // an empty needle matches at every position without advancing, so the scan would never terminate
+    }
+    std::string result;
+    size_t pos = 0; // start of the not-yet-copied tail of s
+    for (size_t hit; (hit = s.find(search, pos)) != std::string::npos; pos = hit + search.length()) {
+        result.append(s, pos, hit - pos).append(replace); // text before the match, then the replacement
+    }
+    result.append(s, pos, std::string::npos); // whatever follows the last match
+    s = std::move(result);
+}
+
+static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { // render the value of key/value entry i as text
+    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+    switch (type) {
+        case GGUF_TYPE_STRING:
+            return gguf_get_val_str(ctx_gguf, i); // strings are returned as-is, without quotes or escaping
+        case GGUF_TYPE_ARRAY:
+            {
+                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+                int arr_n = gguf_get_arr_n(ctx_gguf, i);
+                const void * data = gguf_get_arr_data(ctx_gguf, i);
+                std::stringstream ss;
+                ss << "["; // arrays are rendered as "[elem, elem, ...]"
+                for (int j = 0; j < arr_n; j++) {
+                    if (arr_type == GGUF_TYPE_STRING) {
+                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                        // escape quotes
+                        replace_all(val, "\\", "\\\\"); // backslashes first, so the ones added for quotes are not doubled
+                        replace_all(val, "\"", "\\\"");
+                        ss << '"' << val << '"';
+                    } else if (arr_type == GGUF_TYPE_ARRAY) {
+                        ss << "???"; // nested arrays are not expanded
+                    } else {
+                        ss << gguf_data_to_str(arr_type, data, j);
+                    }
+                    if (j < arr_n - 1) {
+                        ss << ", "; // separator between elements, none after the last
+                    }
+                }
+                ss << "]";
+                return ss.str();
+            }
+        default:
+            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); // scalar: format the single value at index 0
+    }
+}
+
+static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") {
+    // One-line summary on stdout: dimension count, name, byte size, the four extents and the element type name.
+    const int64_t * ne = tensor->ne;
+    printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
+            prefix, ggml_n_dims(tensor), tensor->name, ggml_nbytes(tensor), ne[0], ne[1], ne[2], ne[3], ggml_type_name(tensor->type));
+}
+
+static projector_type clip_projector_type_from_string(const std::string & name) {
+    // Reverse lookup in PROJECTOR_TYPE_NAMES: find the enum value whose textual name equals `name`.
+    projector_type found = PROJECTOR_TYPE_UNKNOWN;
+    for (auto it = PROJECTOR_TYPE_NAMES.begin(); it != PROJECTOR_TYPE_NAMES.end(); ++it) { // NOLINT
+        if (it->second == name) { found = it->first; break; }
+    }
+    return found; // still PROJECTOR_TYPE_UNKNOWN when no entry matched
+}
+
 //
 // image data
 //
@@ -205,6 +308,32 @@ struct clip_vision_model {
    struct ggml_tensor * mm_0_b;
    struct ggml_tensor * mm_2_w;
    struct ggml_tensor * mm_2_b;
+
+    // MobileVLM projection
+    struct ggml_tensor * mm_model_mlp_1_w;
+    struct ggml_tensor * mm_model_mlp_1_b;
+    struct ggml_tensor * mm_model_mlp_3_w;
+    struct ggml_tensor * mm_model_mlp_3_b;
+    struct ggml_tensor * mm_model_block_1_block_0_0_w;
+    struct ggml_tensor * mm_model_block_1_block_0_1_w;
+    struct ggml_tensor * mm_model_block_1_block_0_1_b;
+    struct ggml_tensor * mm_model_block_1_block_1_fc1_w;
+    struct ggml_tensor * mm_model_block_1_block_1_fc1_b;
+    struct ggml_tensor * mm_model_block_1_block_1_fc2_w;
+    struct ggml_tensor * mm_model_block_1_block_1_fc2_b;
+    struct ggml_tensor * mm_model_block_1_block_2_0_w;
+    struct ggml_tensor * mm_model_block_1_block_2_1_w;
+    struct ggml_tensor * mm_model_block_1_block_2_1_b;
+    struct ggml_tensor * mm_model_block_2_block_0_0_w;
+    struct ggml_tensor * mm_model_block_2_block_0_1_w;
+    struct ggml_tensor * mm_model_block_2_block_0_1_b;
+    struct ggml_tensor * mm_model_block_2_block_1_fc1_w;
+    struct ggml_tensor * mm_model_block_2_block_1_fc1_b;
+    struct ggml_tensor * mm_model_block_2_block_1_fc2_w;
+    struct ggml_tensor * mm_model_block_2_block_1_fc2_b;
+    struct ggml_tensor * mm_model_block_2_block_2_0_w;
+    struct ggml_tensor * mm_model_block_2_block_2_1_w;
+    struct ggml_tensor * mm_model_block_2_block_2_1_b;
 };

 struct clip_ctx {
@@ -213,6 +342,7 @@ struct clip_ctx {
    bool has_llava_projector = false;

    struct clip_vision_model vision_model;
+    projector_type proj_type = PROJECTOR_TYPE_MLP;

    float image_mean[3];
    float image_std[3];
@@ -430,16 +560,135 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            free(patches_data);
        }

+        // shape [1, 576, 1024]
+        // ne is whcn, ne = [1024, 576, 1, 1]
        embeddings = ggml_get_rows(ctx0, embeddings, patches);

-        // mm projection 0
-        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+        // print_tensor_info(embeddings, "embeddings");

-        embeddings = ggml_gelu(ctx0, embeddings);
+        // llava projector
+        if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
+            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);

-        embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+            embeddings = ggml_gelu(ctx0, embeddings);
+
+            embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+        }
+        else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+            // MobileVLM projector
+            int n_patch = 24;
+            struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
+            mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
+            mlp_1 = ggml_gelu(ctx0, mlp_1);
+            struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
+            mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
+            // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
+
+            // block 1
+            struct ggml_tensor * block_1 = nullptr;
+            {
+                // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
+                mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
+                mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
+                // stride = 1, padding = 1, bias is nullptr
+                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+
+                // layer norm
+                // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+
+                // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                // hardswish
+                struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                // pointwise conv
+                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
+                block_1 = ggml_relu(ctx0, block_1);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
+                block_1 = ggml_hardsigmoid(ctx0, block_1);
+                // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
+                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+                int w = block_1->ne[0], h = block_1->ne[1];
+                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+
+                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
+                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+                // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                // residual
+                block_1 = ggml_add(ctx0, mlp_3, block_1);
+            }
+
+            // block_2
+            {
+                // stride = 2
+                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+
+                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+                // layer norm
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+                // hardswish
+                struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+                // not sure the parameters is right for globalAvgPooling
+                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                // pointwise conv
+                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
+                block_1 = ggml_relu(ctx0, block_1);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
+                block_1 = ggml_hardsigmoid(ctx0, block_1);
+
+                // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+                int w = block_1->ne[0], h = block_1->ne[1];
+                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+                // block_1 shape = [1, 12*12, 2048], ne = [2048, 12*12, 1]
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
+                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+
+                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
+                block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
+                // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
+            }
+            embeddings = block_1;
+        }
+        else {
+            GGML_ASSERT(false);
+        }
    }

    // build the graph
@@ -485,16 +734,47 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        printf("\n");
    }
    const int n_tensors = gguf_get_n_tensors(ctx);
+
    // kv
-    if (verbosity >= 3) {
-        const int n_kv = gguf_get_n_kv(ctx);
+    const int n_kv = gguf_get_n_kv(ctx);
+    printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+        __func__, n_kv, n_tensors, fname);
+    {
+        std::map<enum ggml_type, uint32_t> n_type;

-        for (int i = 0; i < n_kv; ++i) {
-            const char * key = gguf_get_key(ctx, i);
+        for (int i = 0; i < n_tensors; i++) {
+            enum ggml_type type = gguf_get_tensor_type(ctx, i);

-            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
+            n_type[type]++;
+        }
+
+        printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+        for (int i = 0; i < n_kv; i++) {
+            const char * name           = gguf_get_key(ctx, i);
+            const enum gguf_type type   = gguf_get_kv_type(ctx, i);
+            const std::string type_name =
+                type == GGUF_TYPE_ARRAY
+                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
+                : gguf_type_name(type);
+
+            std::string value          = gguf_kv_to_str(ctx, i);
+            const size_t MAX_VALUE_LEN = 40;
+            if (value.size() > MAX_VALUE_LEN) {
+                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+            }
+            replace_all(value, "\n", "\\n");
+
+            printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+        }
+
+        // print type counts
+        for (auto & kv : n_type) {
+            if (kv.second == 0) {
+                continue;
+            }
+
+            printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
        }
-        printf("\n");
    }

    // data
@@ -503,12 +783,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        for (int i = 0; i < n_tensors; ++i) {
            const char * name = gguf_get_tensor_name(ctx, i);
            const size_t offset = gguf_get_tensor_offset(ctx, i);
+            enum ggml_type type = gguf_get_tensor_type(ctx, i);
            struct ggml_tensor * cur = ggml_get_tensor(meta, name);
            size_t tensor_size = ggml_nbytes(cur);
            buffer_size += tensor_size;
            if (verbosity >= 3) {
-                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i,
-                       ggml_n_dims(cur), cur->name, tensor_size, offset);
+                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+                       __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
            }
        }
    }
@@ -517,6 +798,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

    clip_ctx * new_clip = new clip_ctx;

+    // update projector type
+    {
+        int idx = gguf_find_key(ctx, KEY_PROJ_TYPE);
+        if (idx != -1) {
+            const std::string proj_type = gguf_get_val_str(ctx, idx);
+            new_clip->proj_type = clip_projector_type_from_string(proj_type);
+        }
+        else {
+            new_clip->proj_type = PROJECTOR_TYPE_MLP;
+        }
+    }
+
 #ifdef GGML_USE_CUBLAS
    new_clip->backend = ggml_backend_cuda_init(0);
    printf("%s: CLIP using CUDA backend\n", __func__);
@@ -661,10 +954,45 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
        vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
        vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
-        vision_model.mm_0_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
-        vision_model.mm_0_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
-        vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
-        vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+
+        // LLaVA projection
+        if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+            vision_model.mm_0_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
+            vision_model.mm_0_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
+            vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+            vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+        }
+        else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
+            // MobileVLM projection
+            vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
+            vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
+            vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
+            vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
+            vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
+            vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
+            vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
+            vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
+            vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
+            vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
+            vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
+            vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
+            vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
+            vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
+            vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
+            vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
+            vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
+            vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
+            vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
+            vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
+            vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
+            vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
+            vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
+            vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
+        }
+        else {
+            std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
+            throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
+        }

        vision_model.layers.resize(hparams.n_layer);
        for (int il = 0; il < hparams.n_layer; ++il) {
@@ -1100,13 +1428,25 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
 }

 int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
-    return ctx->vision_model.mm_2_b->ne[0];
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+        return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
+    }
+    else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
+        return ctx->vision_model.mm_2_b->ne[0];
+    }
+    else {
+        std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
+        throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
+    }
 }

 int clip_n_patches(const struct clip_ctx * ctx) {
    auto & params = ctx->vision_model.hparams;
-
-    return (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
+    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+        n_patches /= 4;
+    }
+    return n_patches;
 }

 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -81,6 +81,7 @@ ap.add_argument("--vision-only", action="store_true", required=False,
 ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
@@ -174,6 +175,8 @@ elif args.vision_only and not has_llava_projector:
    fout.add_description("vision-only CLIP model")
 elif has_llava_projector:
    fout.add_description("image encoder for LLaVA")
+    # add projector type
+    fout.add_string("clip.projector_type", args.projector_type)
 else:
    fout.add_description("two-tower CLIP model")

@@ -218,7 +221,8 @@ if has_llava_projector:
    projector = torch.load(args.llava_projector)
    for name, data in projector.items():
        name = get_tensor_name(name)
-        if data.ndim == 2:
+        # pw and dw conv ndim==4
+        if data.ndim == 2 or data.ndim == 4:
            data = data.squeeze().numpy().astype(np.float16)
        else:
            data = data.squeeze().numpy().astype(np.float32)
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -112,6 +112,43 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
    return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
 }

+// Rounds `fval` to the nearest integer without a float->int conversion: adding
+// 12582912.f (1.5 * 2^23) leaves the rounded value in the low 23 mantissa bits,
+// offset by 0x00400000, which is then read back from the bit pattern.
+static inline int nearest_int(float fval) {
+    //assert(fval <= 4194303.f);
+    const float shifted = fval + 12582912.f;
+    int bits;
+    memcpy(&bits, &shifted, sizeof(int));
+    const int mantissa = bits & 0x007fffff;
+    return mantissa - 0x00400000;
+}
+
+// Computes the softmax statistics for one token position and stores a 16-bit quantized
+// copy of the distribution in `log_prob`.
+//
+// Layout written to `log_prob`: two floats (scale, min_log_prob) occupying the first four
+// uint16_t slots, followed by n_vocab quantized values q such that
+// scale*q + min_log_prob approximates the log-probability of each token. Logits more
+// than 16 below the maximum are clamped to q = 0.
+//
+// Returns the negative log-likelihood of token `tok`.
+static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob, int tok) {
+    float max_logit = logits[0];
+    float min_logit = logits[0];
+    for (int i = 1; i < n_vocab; ++i) {
+        max_logit = std::max(max_logit, logits[i]);
+        min_logit = std::min(min_logit, logits[i]);
+    }
+    // restrict the quantized range to at most 16 below the maximum logit
+    min_logit = std::max(min_logit, max_logit - 16);
+    double sum_exp = 0.0;
+    for (int i = 0; i < n_vocab; ++i) {
+        sum_exp += expf(logits[i] - max_logit);
+    }
+    const float log_sum_exp = log(sum_exp);
+    const float min_log_prob = min_logit - max_logit - log_sum_exp;
+    const float scale = (max_logit - min_logit)/65535.f;
+    // Store the header with memcpy: writing through a float pointer that aliases uint16_t
+    // storage is undefined behavior under the strict aliasing rule.
+    const float header[2] = { scale, min_log_prob };
+    static_assert(sizeof(header) == 4*sizeof(uint16_t), "header must occupy exactly 4 uint16_t slots");
+    memcpy(log_prob, header, sizeof(header));
+    log_prob += 4;
+    if (scale) {
+        const float inv_scale = 1/scale;
+        for (int i = 0; i < n_vocab; ++i) {
+            log_prob[i] = logits[i] > min_logit ? nearest_int(inv_scale*(logits[i] - min_logit)) : 0;
+        }
+    } else {
+        // all logits are equal: every quantized value is zero
+        std::memset(log_prob, 0, n_vocab*sizeof(uint16_t));
+    }
+    return max_logit + log_sum_exp - logits[tok];
+}
+
 static void process_logits(
    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
    double & nll, double & nll2, float * logit_history, float * prob_history
@@ -147,6 +184,130 @@ static void process_logits(
    }
 }

+// Runs log_softmax over `n_token` positions using `workers` plus the calling thread,
+// adds the per-token results (and their squares) to `nll` / `nll2`, then writes the
+// quantized log-probabilities gathered in `log_probs` to `out`.
+static void process_logits(std::ostream& out, int n_vocab, const float * logits, const int * tokens, int n_token,
+        std::vector<std::thread> & workers, std::vector<uint16_t> & log_probs, double & nll, double & nll2) {
+    // uint16_t slots per token: n_vocab rounded up to an even count, plus 4 header slots
+    const int nv = 2*((n_vocab + 1)/2) + 4;
+    std::mutex mutex;
+    int counter = 0;
+    auto compute = [&mutex, &counter, &log_probs, &nll, &nll2, n_vocab, logits, tokens, n_token, nv] () {
+        double partial_nll  = 0;
+        double partial_nll2 = 0;
+        for (;;) {
+            int idx;
+            {
+                // the mutex guards both the work counter and the shared accumulators
+                std::lock_guard<std::mutex> guard(mutex);
+                idx = counter++;
+                if (idx >= n_token) {
+                    nll  += partial_nll;
+                    nll2 += partial_nll2;
+                    return;
+                }
+            }
+            const double v = log_softmax(n_vocab, logits + idx*n_vocab, log_probs.data() + idx*nv, tokens[idx+1]);
+            partial_nll  += v;
+            partial_nll2 += v*v;
+        }
+    };
+    for (std::thread & worker : workers) {
+        worker = std::thread(compute);
+    }
+    // the calling thread takes part in the work as well
+    compute();
+    for (std::thread & worker : workers) {
+        worker.join();
+    }
+    out.write((const char *)log_probs.data(), n_token*nv*sizeof(uint16_t));
+}
+
+// Running totals collected while comparing logits against stored base log-probabilities.
+struct kl_divergence_result {
+    // negative log-likelihood: sum and sum of squares
+    double sum_nll  = 0.0;
+    double sum_nll2 = 0.0;
+    // per-token KL divergence: sum and sum of squares
+    double sum_kld  = 0.0;
+    double sum_kld2 = 0.0;
+    // negative log-likelihood plus the base log-probability of the token: sum and sum of squares
+    double sum_nll_diff  = 0.0;
+    double sum_nll_diff2 = 0.0;
+    // number of tokens where both distributions have the same most likely token
+    size_t n_same_top = 0;
+    // number of tokens processed
+    size_t count = 0;
+};
+
+// Compares the distribution given by `logits` with a stored base distribution for one
+// token position and accumulates the statistics into `kld`.
+//
+// `base_log_prob` holds two floats (scale, min_log_prob) in its first four uint16_t slots,
+// followed by n_vocab quantized values q; the base log-probability of token i is
+// scale*q[i] + min_log_prob.
+//
+// Returns the sum over tokens of p_base * (log p_base - log p) for this position; tokens
+// whose base log-probability is not above -16 are left out of the sum.
+static double log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
+    float max_logit = logits[0];
+    int imax = 0;
+    for (int i = 1; i < n_vocab; ++i) {
+        if (logits[i] > max_logit) {
+            max_logit = logits[i];
+            imax = i;
+        }
+    }
+    double sum_exp = 0.0;
+    for (int i = 0; i < n_vocab; ++i) {
+        sum_exp += expf(logits[i] - max_logit);
+    }
+    const float log_sum_exp = log(sum_exp);
+    // Load the header with memcpy: reading uint16_t storage through a float pointer is
+    // undefined behavior under the strict aliasing rule.
+    float header[2];
+    static_assert(sizeof(header) == 4*sizeof(uint16_t), "header must occupy exactly 4 uint16_t slots");
+    memcpy(header, base_log_prob, sizeof(header));
+    const float scale = header[0];
+    const float min_log_prob = header[1];
+    base_log_prob += 4;
+    float nll = max_logit + log_sum_exp - logits[tok];
+    kld.sum_nll  += nll;
+    kld.sum_nll2 += nll*nll;
+    // from here on nll holds the negative log-likelihood plus the base log-probability of tok
+    nll += (scale*base_log_prob[tok] + min_log_prob);
+    kld.sum_nll_diff  += nll;
+    kld.sum_nll_diff2 += nll*nll;
+    // from here on logits[i] - max_logit is the log-probability of token i
+    max_logit += log_sum_exp;
+    double sum = 0;
+    int imax_base = -1;
+    float p_log_base_max = 0;
+    for (int i = 0; i < n_vocab; ++i) {
+        const float p_log_base = scale*base_log_prob[i] + min_log_prob;
+        if (i == 0 || p_log_base > p_log_base_max) {
+            p_log_base_max = p_log_base;
+            imax_base = i;
+        }
+        if (p_log_base > -16.f) {
+            const float p_base = expf(p_log_base);
+            sum += p_base * (p_log_base - logits[i] + max_logit);
+        }
+    }
+    kld.sum_kld  += sum;
+    kld.sum_kld2 += sum*sum;
+    ++kld.count;
+    if (imax == imax_base) ++kld.n_same_top;
+    return sum;
+}
+
+// Runs the KL-divergence log_softmax over `n_token` positions using `workers` plus the
+// calling thread. Each thread gathers statistics locally and merges them into `kld` once
+// it runs out of work; the per-token return values are stored in `kld_values`.
+static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
+        std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld,
+        float * kld_values) {
+    // uint16_t slots per token: n_vocab rounded up to an even count, plus 4 header slots
+    const int nv = 2*((n_vocab + 1)/2) + 4;
+    std::mutex mutex;
+    int counter = 0;
+    auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values] () {
+        kl_divergence_result partial;
+        for (;;) {
+            int idx;
+            {
+                // the mutex guards both the work counter and the shared result
+                std::lock_guard<std::mutex> guard(mutex);
+                idx = counter++;
+                if (idx >= n_token) {
+                    kld.sum_nll       += partial.sum_nll;
+                    kld.sum_nll2      += partial.sum_nll2;
+                    kld.sum_kld       += partial.sum_kld;
+                    kld.sum_kld2      += partial.sum_kld2;
+                    kld.sum_nll_diff  += partial.sum_nll_diff;
+                    kld.sum_nll_diff2 += partial.sum_nll_diff2;
+                    kld.n_same_top    += partial.n_same_top;
+                    kld.count         += partial.count;
+                    return;
+                }
+            }
+            const double v = log_softmax(n_vocab, logits + idx*n_vocab, base_log_probs.data() + idx*nv, tokens[idx+1], partial);
+            kld_values[idx] = (float)v;
+        }
+    };
+    for (std::thread & worker : workers) {
+        worker = std::thread(compute);
+    }
+    // the calling thread takes part in the work as well
+    compute();
+    for (std::thread & worker : workers) {
+        worker.join();
+    }
+}
+
 static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
@@ -294,6 +455,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
    const int n_ctx = llama_n_ctx(ctx);

+    std::ofstream logits_stream;
+    if (!params.logits_file.empty()) {
+        // The file receives raw binary data, so it has to be opened in binary mode: in text
+        // mode some platforms (e.g. Windows) translate '\n' bytes and corrupt the contents.
+        logits_stream.open(params.logits_file.c_str(), std::ios::binary);
+        if (!logits_stream.is_open()) {
+            fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
+            return {};
+        }
+        fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
+        // file header: 8-byte magic followed by the context size
+        logits_stream.write("_logits_", 8);
+        logits_stream.write((const char *)&n_ctx, sizeof(n_ctx));
+    }
+
+
    auto tim1 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

@@ -336,6 +509,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

+    std::vector<uint16_t> log_probs;
+    if (!params.logits_file.empty()) {
+        logits_stream.write((const char *)&n_vocab, sizeof(n_vocab));
+        logits_stream.write((const char *)&n_chunk, sizeof(n_chunk));
+        logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0]));
+        const int nv = 2*((n_vocab + 1)/2) + 4;
+        log_probs.resize(n_ctx * nv);
+    }
+
    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * n_ctx;
        const int end   = start + n_ctx;
@@ -398,8 +580,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        // process the entire prompt.
        const int first = n_ctx/2;
        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                       workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
+        if (!params.logits_file.empty()) {
+            process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+                    workers, log_probs, nll, nll2);
+        } else {
+            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
+        }
        count += n_ctx - first - 1;

        // perplexity is e^(average negative log-likelihood)
@@ -458,23 +645,24 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
    return true;
 }

+#define K_TOKEN_CHUNK 4
+
 static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers,
        const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) {
-    constexpr int k_token_chunk = 4;
    if (eval_results.size() != eval_pairs.size()) {
        eval_results.resize(eval_pairs.size());
    }
    if (eval_pairs.empty()) return;

-    size_t max_threads = std::min((eval_pairs.size() + k_token_chunk - 1)/k_token_chunk, workers.size());
+    size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());

    std::atomic<int> counter(0);
    auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
-        float local_logprobs[k_token_chunk];
+        float local_logprobs[K_TOKEN_CHUNK];
        while (true) {
-            size_t first = counter.fetch_add(k_token_chunk, std::memory_order_relaxed);
+            size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
            if (first >= eval_results.size()) break;
-            size_t last = std::min(first + k_token_chunk, eval_results.size());
+            size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
            for (size_t i = first; i < last; ++i) {
                auto logits = batch_logits + eval_pairs[i].first * n_vocab;
                float max_logit = logits[0];
@@ -497,7 +685,6 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
    for (size_t it = 0; it < max_threads; ++it) {
        workers[it].join();
    }
-
 }

 static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
@@ -540,14 +727,14 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    // This is needed as usual for LLaMA models
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));

+    // The tasks should be randomized so the score stabilizes quickly.
+    bool randomize_tasks = true;
+
    // Number of tasks to use when computing the score
    if (params.hellaswag_tasks < hs_task_count) {
        hs_task_count = params.hellaswag_tasks;
    }

-    // The tasks should be randomized so the score stabilizes quickly.
-    bool randomize_tasks = true;
-
    // The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now
    std::mt19937 rng(1);

@@ -1031,6 +1218,566 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
    printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
 }

+// Reads one length-prefixed string from `in` into `str`.
+// The encoding is a raw uint32_t byte count immediately followed by that many bytes.
+// Returns true only when both the length and the complete payload were read;
+// on failure `str` may be left resized and/or partially filled.
+static bool deserialize_string(std::istream & in, std::string & str) {
+    uint32_t n_bytes;
+    in.read((char *)&n_bytes, sizeof(n_bytes));
+    if (in.fail()) {
+        return false;
+    }
+    str.resize(n_bytes);
+    in.read((char *)&str[0], n_bytes);
+    return !in.fail();
+}
+
+// One set of candidate answers for a multiple-choice task.
+struct multiple_choice_answers {
+    std::vector<std::string> answers; // answer texts
+    std::vector<int>         labels;  // one label per answer; multiple_choice_score treats 1 as "correct"
+
+    // Reads the set from `in`: a uint32_t answer count, that many length-prefixed
+    // strings, then the same number of raw int labels.
+    // Returns false on any read failure or when more than 100 answers are announced.
+    bool deserialize(std::istream& in) {
+        uint32_t n_answers;
+        in.read((char *)&n_answers, sizeof(n_answers));
+        if (in.fail()) return false;
+        if (n_answers > 100) return false; // 100 as max. number of answers should be good enough for any practical purpose
+        answers.resize(n_answers);
+        labels.resize(n_answers);
+        for (size_t i = 0; i < answers.size(); ++i) {
+            if (!deserialize_string(in, answers[i])) return false;
+        }
+        in.read((char *)labels.data(), n_answers*sizeof(int));
+        return !in.fail();
+    }
+};
+
+// A single multiple-choice task: the serialized part (question + answer sets)
+// and the scratch data filled in while the task is being evaluated.
+struct multiple_choice_task {
+    std::string question;         // the question (or context that needs to be continued)
+    multiple_choice_answers mc1;  // possible answers (continuations) with a single correct answer
+    multiple_choice_answers mc2;  // possible answers (continuations) with multiple correct answers - not handled yet
+
+    // Reads the question followed by the mc1 and mc2 answer sets, in that order.
+    // Stops at the first part that fails to load and returns false.
+    bool deserialize(std::istream& in) {
+        const bool have_question = deserialize_string(in, question);
+        if (!have_question) return false;
+        if (!mc1.deserialize(in)) return false;
+        return mc2.deserialize(in);
+    }
+
+    // For evaluation
+    size_t i_batch;         // starting index in the llama_batch
+    size_t common_prefix;   // max number of initial tokens that are the same in all sentences
+    size_t required_tokens; // needed number of tokens to evaluate all answers
+    std::vector<std::vector<llama_token>> seq_tokens; // one token sequence per mc1 answer ("question answer")
+    std::vector<float> log_probs;                     // per-answer average log-probability, set during scoring
+};
+
+// Tokenizes every mc1 answer of `task` as "<question> <answer>" and derives the
+// batching metadata from the resulting sequences:
+//   * task.seq_tokens      - one token sequence per answer
+//   * task.common_prefix   - number of leading tokens shared by all sequences
+//   * task.required_tokens - shared prefix counted once plus every sequence's remaining tokens
+// Returns false (printing a message when `log_error` is set) if the question,
+// the answer list, or any individual answer is empty.
+static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
+    const auto & answers = task.mc1.answers;
+    if (task.question.empty() || answers.empty()) {
+        if (log_error) {
+            printf("%s: found bad task with empty question and/or answers\n", __func__);
+        }
+        return false;
+    }
+
+    task.seq_tokens.reserve(answers.size());
+    for (size_t ia = 0; ia < answers.size(); ++ia) {
+        if (answers[ia].empty()) {
+            if (log_error) {
+                printf("%s: found empty answer\n", __func__);
+            }
+            return false;
+        }
+        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answers[ia], add_bos));
+    }
+
+    // the shared prefix cannot be longer than the shortest sequence
+    size_t shortest = task.seq_tokens.front().size();
+    for (const auto & seq : task.seq_tokens) {
+        shortest = std::min(shortest, seq.size());
+    }
+
+    // advance while every sequence agrees with the first one at the current position
+    size_t n_common = 0;
+    while (n_common < shortest) {
+        const auto ref_token = task.seq_tokens[0][n_common];
+        bool diverged = false;
+        for (size_t s = 1; s < task.seq_tokens.size() && !diverged; ++s) {
+            diverged = task.seq_tokens[s][n_common] != ref_token;
+        }
+        if (diverged) {
+            break;
+        }
+        ++n_common;
+    }
+    task.common_prefix = n_common;
+
+    size_t n_required = n_common;
+    for (const auto & seq : task.seq_tokens) {
+        n_required += seq.size() - n_common;
+    }
+    task.required_tokens = n_required;
+
+    return true;
+}
+
+//
+// Calculates score for multiple choice tasks with single correct answer from prompt.
+// Commonly used LLM evaluation metrics of this type are
+//   * ARC
+//   * HellaSwag
+//   * MMLU
+//   * TruthfulQA
+//
+// Validation datasets for these 4 tests can be found at
+//     https://huggingface.co/datasets/ikawrakow/validation-datasets-for-llama.cpp
+// The data for these datasets was extracted from
+//     git@hf.co:datasets/allenai/ai2_arc
+//     https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
+//     git@hf.co:datasets/Stevross/mmlu
+//     https://huggingface.co/datasets/truthful_qa
+//
+// params.prompt is parsed as binary data: a uint32_t task count, then one uint32_t stream
+// offset per task, then the serialized tasks themselves.
+// If params.multiple_choice_tasks is 0 or >= the task count, all tasks are evaluated in file
+// order; otherwise that many tasks are drawn without replacement using a fixed RNG seed.
+// The running accuracy is printed after every task. When at least 100 tasks were scored, a
+// final result with a binomial error estimate and the random-chance baseline is printed too.
+//
+static void multiple_choice_score(llama_context * ctx, const gpt_params & params) {
+
+    std::istringstream strstream(params.prompt);
+    uint32_t n_task;
+    strstream.read((char *)&n_task, sizeof(n_task));
+    if (strstream.fail() || n_task == 0) {
+        printf("%s: no tasks\n", __func__);
+        return;
+    }
+    printf("%s: there are %u tasks in prompt\n", __func__, n_task);
+    std::vector<uint32_t> task_pos(n_task);
+    strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
+    if (strstream.fail()) {
+        printf("%s: failed to read task positions from prompt\n", __func__);
+        return;
+    }
+
+    std::vector<multiple_choice_task> tasks;
+    if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
+        // Use all tasks
+        tasks.resize(n_task);
+        printf("%s: reading tasks", __func__);
+        // print roughly 100 progress dots; never let the divisor be 0 (fewer than 100 tasks)
+        int n_dot = std::max((int) n_task/100, 1);
+        int i = 0;
+        for (auto& task : tasks) {
+            ++i;
+            if (!task.deserialize(strstream)) {
+                printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
+                return;
+            }
+            if (i%n_dot == 0) printf(".");
+        }
+        printf("done\n");
+    }
+    else {
+        printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
+        std::mt19937 rng(1);
+        std::vector<int> aux(n_task);
+        for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
+        float scale = 1.f/(1.f + (float)std::mt19937::max());
+        tasks.resize(params.multiple_choice_tasks);
+        for (auto& task : tasks) {
+            // scale*rng() can round up to exactly 1.0f in float precision, so clamp the index
+            // to the last valid slot (aux is never empty here: fewer tasks are drawn than exist)
+            int j = std::min((int)(scale * rng() * aux.size()), (int)aux.size() - 1);
+            int idx = aux[j];
+            // swap-remove so that the same task is never selected twice
+            aux[j] = aux.back();
+            aux.pop_back();
+            strstream.seekg(task_pos[idx], std::ios::beg);
+            if (!task.deserialize(strstream)) {
+                printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
+                return;
+            }
+        }
+        n_task = params.multiple_choice_tasks;
+    }
+
+    // This is needed as usual for LLaMA models
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+
+    printf("%s: preparing task data", __func__);
+    fflush(stdout);
+    if (n_task > 500) {
+        // many tasks: tokenize in parallel, each thread grabbing chunks of K_TOKEN_CHUNK tasks
+        printf("...");
+        fflush(stdout);
+        std::atomic<int> counter(0);
+        std::atomic<int> n_bad(0);
+        auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
+            int num_tasks = tasks.size();
+            int n_bad_local = 0;
+            while (true) {
+                int first = counter.fetch_add(K_TOKEN_CHUNK);
+                if (first >= num_tasks) {
+                    if (n_bad_local > 0) n_bad += n_bad_local;
+                    break;
+                }
+                int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
+                for (int i = first; i < last; ++i) {
+                    if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
+                }
+            }
+        };
+        // hardware_concurrency() may return 0 when it cannot be determined; use at least 1
+        // so that `max_thread-1` below cannot wrap around
+        size_t max_thread = std::max(1u, std::thread::hardware_concurrency());
+        max_thread = std::min(max_thread, (tasks.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK);
+        std::vector<std::thread> workers(max_thread-1);
+        for (auto& w : workers) w = std::thread(prepare);
+        prepare(); // the calling thread takes part in the work as well
+        for (auto& w : workers) w.join();
+        printf("done\n");
+        fflush(stdout);
+        int nbad = n_bad;
+        if (nbad > 0) {
+            printf("%s: found %d malformed tasks\n", __func__, nbad);
+            return;
+        }
+    } else {
+        // print roughly 100 progress dots; never let the divisor be 0 (fewer than 100 tasks)
+        int n_dot = std::max((int) n_task/100, 1);
+        int i_task = 0;
+        for (auto& task : tasks) {
+            ++i_task;
+            if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
+                return;
+            }
+            if (i_task%n_dot == 0) {
+                printf(".");
+                fflush(stdout);
+            }
+        }
+        printf("done\n");
+    }
+
+    printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
+
+    printf("\ntask\tacc_norm\n");
+
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_ctx   = llama_n_ctx(ctx);
+    const int n_batch = params.n_batch;
+
+    const int max_tasks_per_batch = 32;
+    const int max_seq = 4*max_tasks_per_batch;
+
+    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
+
+    std::vector<float> tok_logits(n_vocab);
+    std::vector<float> batch_logits(n_vocab*n_ctx);
+
+    std::vector<std::pair<size_t, llama_token>> eval_pairs;
+    std::vector<float> eval_results;
+    // compute_logprobs() does all of its work on these threads, so there must be at least one
+    // even when hardware_concurrency() reports 0
+    std::vector<std::thread> workers(std::max(1u, std::thread::hardware_concurrency()));
+    std::vector<int> batch_indeces;
+
+    int n_done = 0;
+    int n_correct = 0;
+    int n_tot_answers = 0;
+
+    for (size_t i0 = 0; i0 < tasks.size(); i0++) {
+        int n_cur = 0;
+
+        size_t i1 = i0;
+        size_t i_batch = 0; // this tells us where in `llama_batch` we are currently
+
+        llama_batch_clear(batch);
+
+        // batch as many tasks as possible into the available context
+        // each task uses one unique sequence id per answer
+        // the common prefix is shared among all sequences of a task to save tokens
+        // we extract logits only from the last common token and from all answer tokens of each sequence
+        int s0 = 0;
+        while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
+            auto& cur_task = tasks[i1];
+
+            int num_answers = cur_task.seq_tokens.size();
+            if (s0 + num_answers > max_seq) {
+                break;
+            }
+
+            if (int(batch_indeces.size()) != num_answers) {
+                batch_indeces.resize(num_answers);
+            }
+            for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s;
+
+            for (size_t i = 0; i < cur_task.common_prefix; ++i) {
+                //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
+                llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
+            }
+            batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
+
+            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
+                for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size(); ++i) {
+                    llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true);
+                }
+            }
+
+            s0 += num_answers;
+
+            cur_task.i_batch = i_batch;
+            i_batch += cur_task.required_tokens;
+
+            n_cur += cur_task.required_tokens;
+            if (++i1 == tasks.size()) {
+                break;
+            }
+        }
+
+        if (i0 == i1) {
+            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+            llama_batch_free(batch);
+            return;
+        }
+
+        llama_kv_cache_clear(ctx);
+
+        // decode all tasks [i0, i1)
+        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
+            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+            llama_batch_free(batch);
+            return;
+        }
+
+        // Compute log-probs in parallel
+        // First we collect all tasks
+        eval_pairs.clear();
+        for (size_t i = i0; i < i1; ++i) {
+            auto& cur_task = tasks[i];
+            size_t li = cur_task.common_prefix;
+            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
+                for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
+                    eval_pairs.push_back(std::make_pair(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]));
+                }
+                ++li; // skip the logits of the sequence's last token: nothing follows it
+            }
+        }
+        // Then we do the actual calculation
+        compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
+
+        size_t ir = 0; // read position in eval_results; advances in the same order eval_pairs was filled
+
+        // compute the logprobs for each ending of the decoded tasks
+        for (size_t i = i0; i < i1; ++i) {
+            auto & cur_task = tasks[i];
+            //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
+            //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
+            //    if (cur_task.mc1.labels[j] == 1) {
+            //        printf("%d", j+1);
+            //    }
+            //}
+            //printf("\n    common_prefix: %zu\n", cur_task.common_prefix);
+
+            // logits of the last common-prefix token give the probability of each answer's first own token
+            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(cur_task.i_batch + cur_task.common_prefix - 1), n_vocab*sizeof(float));
+
+            const auto first_probs = softmax(tok_logits);
+
+            cur_task.log_probs.resize(cur_task.seq_tokens.size());
+            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
+                size_t count = 1;
+                float  log_prob  = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
+                for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
+                    //printf("        %zu  %g\n", ir, eval_results[ir]);
+                    ++count;
+                    log_prob += eval_results[ir++];
+                }
+                // length-normalized score: average log-probability per answer token
+                cur_task.log_probs[s] = log_prob / count;
+                //printf("        Final: %g\n", log_prob / count);
+                //printf("    <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
+            }
+
+            // Find the ending with maximum logprob
+            size_t logprob_max_idx = 0;
+            float  logprob_max_val = cur_task.log_probs[0];
+            for (size_t s = 1; s < cur_task.log_probs.size(); s++) {
+                if (cur_task.log_probs[s] > logprob_max_val) {
+                    logprob_max_val = cur_task.log_probs[s];
+                    logprob_max_idx = s;
+                }
+            }
+
+            n_tot_answers += cur_task.log_probs.size();
+            if (cur_task.mc1.labels[logprob_max_idx] == 1) {
+                ++n_correct;
+            }
+            ++n_done;
+
+            // Print the accumulated accuracy mean x 100
+            printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
+            fflush(stdout);
+        }
+
+        // the for loop's i0++ moves on to i1, the first task that was not part of this batch
+        i0 = i1 - 1;
+    }
+
+    llama_batch_free(batch);
+
+    if (n_done < 100) return;
+
+    float p = 1.f*n_correct/n_done;
+    float sigma = sqrt(p*(1-p)/(n_done-1));
+    printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+    // expected accuracy when guessing: one correct answer per task over all answers seen
+    p = 1.f*n_done/n_tot_answers;
+    sigma = sqrt(p*(1-p)/(n_done-1));
+    printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+
+    printf("\n");
+}
+
+// Compares the current model against reference log-probabilities stored in params.logits_file.
+//
+// The file is read as: the 8-byte tag "_logits_", a uint32_t n_ctx, an int n_vocab, an int n_chunk,
+// n_ctx*n_chunk evaluation tokens and then, for every chunk, (n_ctx - 1 - n_ctx/2) rows of
+// nv = 2*((n_vocab + 1)/2) + 4 uint16_t values each.
+//
+// Every chunk is decoded with the current model and the second half of its logits is handed to
+// process_logits() together with the stored reference data. After each chunk the running
+// perplexity, ln(PPL(Q)/PPL(base)), KL-divergence and same-top-token fraction are printed; if at
+// least 100 values were accumulated, summary statistics of the KL-divergence values follow.
+//
+// Returns early (after printing to stderr) if the file is missing, malformed, or was produced
+// with a context size or vocabulary that does not match the current model/context.
+static void kl_divergence(llama_context * ctx, const gpt_params & params) {
+    if (params.logits_file.empty()) {
+        fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
+        return;
+    }
+    std::ifstream in(params.logits_file.c_str(), std::ios::binary);
+    if (!in) {
+        fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
+        return;
+    }
+    {
+        char check[9]; check[8] = 0;
+        in.read(check, 8);
+        if (in.fail() || strncmp("_logits_", check, 8) != 0) {
+            fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
+            return;
+        }
+    }
+
+    uint32_t n_ctx;
+    in.read((char *)&n_ctx, sizeof(n_ctx));
+    if (in.fail()) {
+        // without this check n_ctx would be compared while still uninitialized
+        fprintf(stderr, "%s: failed reading n_ctx from %s\n", __func__, params.logits_file.c_str());
+        return;
+    }
+    if (n_ctx > llama_n_ctx(ctx)) {
+        fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
+                __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
+        return;
+    }
+
+    int n_vocab, n_chunk;
+    in.read((char *)&n_vocab, sizeof(n_vocab));
+    in.read((char *)&n_chunk, sizeof(n_chunk));
+    if (in.fail()) {
+        fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
+        return;
+    }
+    if (n_ctx == 0 || n_vocab <= 0 || n_chunk <= 0) {
+        // zero/negative values would wrap around in the buffer size computations below
+        fprintf(stderr, "%s: invalid header in %s (n_ctx = %u, n_vocab = %d, n_chunk = %d)\n",
+                __func__, params.logits_file.c_str(), n_ctx, n_vocab, n_chunk);
+        return;
+    }
+    if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
+        // all logit strides below use the file's n_vocab, so a mismatch cannot be evaluated
+        fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
+        return;
+    }
+
+    std::vector<llama_token> tokens(n_ctx * n_chunk);
+    if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
+        fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
+        return;
+    }
+
+    const int n_batch = params.n_batch;
+    const int num_batches = (n_ctx + n_batch - 1)/n_batch;
+    const int nv = 2*((n_vocab + 1)/2) + 4; // number of uint16_t values stored per evaluated token
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+
+    std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
+    std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
+    std::vector<float> logits;
+    if (num_batches > 1) {
+        logits.reserve(n_ctx * n_vocab);
+    }
+
+    // hardware_concurrency() may return 0 when it cannot be determined; avoid the unsigned wrap-around
+    const unsigned int n_hw_threads = std::thread::hardware_concurrency();
+    std::vector<std::thread> workers(n_hw_threads > 1 ? n_hw_threads - 1 : 0);
+
+    // returns (mean, standard error of the mean); the error is reported as 0 for 10 or fewer samples
+    auto mean_and_uncertainty = [] (double sum, double sum2, size_t count) {
+        if (count < 1) {
+            return std::make_pair(0., 0.);
+        }
+        double f = sum/count;
+        double df = sum2/count - f*f;
+        df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.;
+        return std::make_pair(f, df);
+    };
+
+    kl_divergence_result kld;
+    auto kld_ptr = kld_values.data();
+
+    for (int i = 0; i < n_chunk; ++i) {
+        const int start =     i * n_ctx;
+        const int end   = start + n_ctx;
+
+        const auto t_start = std::chrono::high_resolution_clock::now();
+
+        if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
+            fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
+            return;
+        }
+
+        // clear the KV cache
+        llama_kv_cache_clear(ctx);
+
+        for (int j = 0; j < num_batches; ++j) {
+            const int batch_start = start + j * n_batch;
+            const int batch_size  = std::min(end - batch_start, n_batch);
+
+            // save original token and restore it after eval
+            const auto token_org = tokens[batch_start];
+
+            // add BOS token for the first batch of each chunk
+            if (add_bos && j == 0) {
+                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
+            }
+
+            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return;
+            }
+
+            // restore the original token in case it was set to BOS
+            tokens[batch_start] = token_org;
+
+            // with several batches per chunk the logits have to be gathered, otherwise
+            // they are taken directly from the context after the loop
+            if (num_batches > 1) {
+                const auto * batch_logits = llama_get_logits(ctx);
+                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+            }
+        }
+
+        const auto t_end = std::chrono::high_resolution_clock::now();
+
+        if (i == 0) {
+            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
+            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            int total_seconds = (int)(t_total * n_chunk);
+            if (total_seconds >= 60*60) {
+                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                total_seconds = total_seconds % (60*60);
+            }
+            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+
+            printf("\nchunk        PPL          ln(PPL(Q)/PPL(base))          KL-Divergence           Same top\n");
+        }
+
+        // only the second half of each chunk is scored
+        const int first = n_ctx/2;
+        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
+        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+                workers, log_probs_uint16, kld, kld_ptr);
+        kld_ptr += n_ctx - 1 - first;
+
+        auto ppl           = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
+        auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
+        auto kl_div        = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
+        auto p_top = 1.*kld.n_same_top/kld.count;
+        auto d_p_top = sqrt(p_top*(1 - p_top)/(kld.count - 1));
+
+        printf("%4d    %10.4lf    %10.5lf ± %10.5f    %10.5f ± %10.5lf    %.5f ± %.5f\n", i+1, exp(ppl.first),
+                log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second,
+                p_top, d_p_top);
+
+        fflush(stdout);
+
+        logits.clear();
+    }
+    printf("\n");
+
+    if (kld.count < 100) return; // we do not wish to do statistics on so few values
+
+    std::sort(kld_values.begin(), kld_values.end());
+
+    printf("===== KL-divergence statistics\n");
+    auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
+    printf("Average: %10.6f ±%10.6lf\n", kl_div.first, kl_div.second);
+    auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
+                                               : kld_values[kld_values.size()/2];
+    printf("Median : %10.6f\n", kld_median);
+
+    // linear interpolation between the two sorted values surrounding the requested fraction
+    auto percentile = [&kld_values] (float fraction) {
+        if (fraction <= 0) return kld_values.front();
+        if (fraction >= 1) return kld_values.back();
+        float p = fraction*(kld_values.size() - 1);
+        size_t ip = size_t(p); p -= ip;
+        return (1 - p)*kld_values[ip] + p*kld_values[std::min(ip+1, kld_values.size()-1)];
+    };
+
+    printf("Maximum: %10.6f\n", kld_values.back());
+    printf("KLD_99 : %10.6f\n", percentile(0.99f));
+    printf("KLD_95 : %10.6f\n", percentile(0.95f));
+    printf("KLD_90 : %10.6f\n", percentile(0.90f));
+
+    printf("Minimum: %10.6f\n", kld_values.front());
+    printf("KLD_01 : %10.6f\n", percentile(0.01f));
+    printf("KLD_05 : %10.6f\n", percentile(0.05f));
+    printf("KLD_10 : %10.6f\n", percentile(0.10f));
+
+}

 int main(int argc, char ** argv) {
    gpt_params params;
@@ -1091,6 +1838,10 @@ int main(int argc, char ** argv) {
        hellaswag_score(ctx, params);
    } else if (params.winogrande) {
        winogrande_score(ctx, params);
+    } else if (params.multiple_choice) {
+        multiple_choice_score(ctx, params);
+    } else if (params.kl_divergence) {
+        kl_divergence(ctx, params);
    } else {
        results = perplexity(ctx, params);
    }
--- a/examples/pydantic-models-to-grammar-examples.py
+++ b/examples/pydantic-models-to-grammar-examples.py
@@ -1,14 +1,14 @@
 # Function calling example using pydantic models.
 import datetime
+import importlib
 import json
 from enum import Enum
-from typing import Union, Optional
+from typing import Optional, Union

 import requests
 from pydantic import BaseModel, Field
-
-import importlib
-from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function
+from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert_dictionary_to_pydantic_model,
+                                        create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)


 # Function to get completion on the llama.cpp server with grammar.
@@ -35,7 +35,7 @@ class SendMessageToUser(BaseModel):
        print(self.message)


-# Enum for the calculator function.
+# Enum for the calculator tool.
 class MathOperation(Enum):
    ADD = "add"
    SUBTRACT = "subtract"
@@ -43,7 +43,7 @@ class MathOperation(Enum):
    DIVIDE = "divide"


-# Very simple calculator tool for the agent.
+# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
 class Calculator(BaseModel):
    """
    Perform a math operation on two numbers.
@@ -148,37 +148,6 @@ def get_current_datetime(output_format: Optional[str] = None):
    return datetime.datetime.now().strftime(output_format)


-# Enum for the calculator tool.
-class MathOperation(Enum):
-    ADD = "add"
-    SUBTRACT = "subtract"
-    MULTIPLY = "multiply"
-    DIVIDE = "divide"
-
-
-
-# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
-class Calculator(BaseModel):
-    """
-    Perform a math operation on two numbers.
-    """
-    number_one: Union[int, float] = Field(..., description="First number.")
-    operation: MathOperation = Field(..., description="Math operation to perform.")
-    number_two: Union[int, float] = Field(..., description="Second number.")
-
-    def run(self):
-        if self.operation == MathOperation.ADD:
-            return self.number_one + self.number_two
-        elif self.operation == MathOperation.SUBTRACT:
-            return self.number_one - self.number_two
-        elif self.operation == MathOperation.MULTIPLY:
-            return self.number_one * self.number_two
-        elif self.operation == MathOperation.DIVIDE:
-            return self.number_one / self.number_two
-        else:
-            raise ValueError("Unknown operation.")
-
-
 # Example function to get the weather
 def get_current_weather(location, unit):
    """Get the current weather in a given location"""
--- a/examples/pydantic_models_to_grammar.py
+++ b/examples/pydantic_models_to_grammar.py
@@ -1,15 +1,21 @@
+from __future__ import annotations
+
 import inspect
 import json
+import re
 from copy import copy
-from inspect import isclass, getdoc
-from types import NoneType
+from enum import Enum
+from inspect import getdoc, isclass
+from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints

 from docstring_parser import parse
-from pydantic import BaseModel, create_model, Field
-from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias
-from enum import Enum
-from typing import get_type_hints, Callable
-import re
+from pydantic import BaseModel, Field, create_model
+
+if TYPE_CHECKING:
+    from types import GenericAlias
+else:
+    # python 3.8 compat
+    from typing import _GenericAlias as GenericAlias


 class PydanticDataType(Enum):
@@ -43,7 +49,7 @@ class PydanticDataType(Enum):
    SET = "set"


-def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:
+def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str:
    if isclass(pydantic_type) and issubclass(pydantic_type, str):
        return PydanticDataType.STRING.value
    elif isclass(pydantic_type) and issubclass(pydantic_type, bool):
@@ -57,22 +63,22 @@ def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:

    elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel):
        return format_model_and_field_name(pydantic_type.__name__)
-    elif get_origin(pydantic_type) == list:
+    elif get_origin(pydantic_type) is list:
        element_type = get_args(pydantic_type)[0]
        return f"{map_pydantic_type_to_gbnf(element_type)}-list"
-    elif get_origin(pydantic_type) == set:
+    elif get_origin(pydantic_type) is set:
        element_type = get_args(pydantic_type)[0]
        return f"{map_pydantic_type_to_gbnf(element_type)}-set"
-    elif get_origin(pydantic_type) == Union:
+    elif get_origin(pydantic_type) is Union:
        union_types = get_args(pydantic_type)
        union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types]
        return f"union-{'-or-'.join(union_rules)}"
-    elif get_origin(pydantic_type) == Optional:
+    elif get_origin(pydantic_type) is Optional:
        element_type = get_args(pydantic_type)[0]
        return f"optional-{map_pydantic_type_to_gbnf(element_type)}"
    elif isclass(pydantic_type):
        return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}"
-    elif get_origin(pydantic_type) == dict:
+    elif get_origin(pydantic_type) is dict:
        key_type, value_type = get_args(pydantic_type)
        return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}"
    else:
@@ -106,7 +112,6 @@ def get_members_structure(cls, rule_name):
        return f"{cls.__name__.lower()} ::= " + " | ".join(members)
    if cls.__annotations__ and cls.__annotations__ != {}:
        result = f'{rule_name} ::= "{{"'
-        type_list_rules = []
        # Modify this comprehension
        members = [
            f'  "\\"{name}\\"" ":"  {map_pydantic_type_to_gbnf(param_type)}'
@@ -116,27 +121,25 @@ def get_members_structure(cls, rule_name):

        result += '"," '.join(members)
        result += '  "}"'
-        return result, type_list_rules
-    elif rule_name == "custom-class-any":
+        return result
+    if rule_name == "custom-class-any":
        result = f"{rule_name} ::= "
        result += "value"
-        type_list_rules = []
-        return result, type_list_rules
-    else:
-        init_signature = inspect.signature(cls.__init__)
-        parameters = init_signature.parameters
-        result = f'{rule_name} ::=  "{{"'
-        type_list_rules = []
-        # Modify this comprehension too
-        members = [
-            f'  "\\"{name}\\"" ":"  {map_pydantic_type_to_gbnf(param.annotation)}'
-            for name, param in parameters.items()
-            if name != "self" and param.annotation != inspect.Parameter.empty
-        ]
+        return result

-        result += '", "'.join(members)
-        result += '  "}"'
-        return result, type_list_rules
+    init_signature = inspect.signature(cls.__init__)
+    parameters = init_signature.parameters
+    result = f'{rule_name} ::=  "{{"'
+    # Modify this comprehension too
+    members = [
+        f'  "\\"{name}\\"" ":"  {map_pydantic_type_to_gbnf(param.annotation)}'
+        for name, param in parameters.items()
+        if name != "self" and param.annotation != inspect.Parameter.empty
+    ]
+
+    result += '", "'.join(members)
+    result += '  "}"'
+    return result


 def regex_to_gbnf(regex_pattern: str) -> str:
@@ -269,7 +272,7 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None

 def generate_gbnf_rule_for_type(
    model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None
-) -> Tuple[str, list]:
+) -> tuple[str, list[str]]:
    """
    Generate GBNF rule for a given field type.

@@ -283,7 +286,7 @@ def generate_gbnf_rule_for_type(
    :param field_info: Additional information about the field (optional).

    :return: Tuple containing the GBNF type and a list of additional rules.
-    :rtype: Tuple[str, list]
+    :rtype: tuple[str, list[str]]
    """
    rules = []

@@ -321,8 +324,7 @@ def generate_gbnf_rule_for_type(
        gbnf_type, rules = model_name + "-" + field_name, rules

    elif gbnf_type.startswith("custom-class-"):
-        nested_model_rules, field_types = get_members_structure(field_type, gbnf_type)
-        rules.append(nested_model_rules)
+        rules.append(get_members_structure(field_type, gbnf_type))
    elif gbnf_type.startswith("custom-dict-"):
        key_type, value_type = get_args(field_type)

@@ -341,14 +343,14 @@ def generate_gbnf_rule_for_type(
        union_rules = []

        for union_type in union_types:
-            if isinstance(union_type, _GenericAlias):
+            if isinstance(union_type, GenericAlias):
                union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
                    model_name, field_name, union_type, False, processed_models, created_rules
                )
                union_rules.append(union_gbnf_type)
                rules.extend(union_rules_list)

-            elif not issubclass(union_type, NoneType):
+            elif not issubclass(union_type, type(None)):
                union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
                    model_name, field_name, union_type, False, processed_models, created_rules
                )
@@ -424,14 +426,10 @@ def generate_gbnf_rule_for_type(
    else:
        gbnf_type, rules = gbnf_type, []

-    if gbnf_type not in created_rules:
-        return gbnf_type, rules
-    else:
-        if gbnf_type in created_rules:
-            return gbnf_type, rules
+    return gbnf_type, rules


-def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created_rules: dict) -> (list, bool, bool):
+def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[BaseModel]], created_rules: dict[str, list[str]]) -> tuple[list[str], bool]:
    """

    Generate GBnF Grammar
@@ -452,7 +450,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
    ```
    """
    if model in processed_models:
-        return []
+        return [], False

    processed_models.add(model)
    model_name = format_model_and_field_name(model.__name__)
@@ -518,7 +516,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created


 def generate_gbnf_grammar_from_pydantic_models(
-    models: List[Type[BaseModel]], outer_object_name: str = None, outer_object_content: str = None,
+    models: list[type[BaseModel]], outer_object_name: str | None = None, outer_object_content: str | None = None,
    list_of_outputs: bool = False
 ) -> str:
    """
@@ -528,7 +526,7 @@ def generate_gbnf_grammar_from_pydantic_models(
    * grammar.

    Args:
-        models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from.
+        models (list[type[BaseModel]]): A list of Pydantic models to generate the grammar from.
        outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
        outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
        list_of_outputs (str, optional): Allows a list of output objects
@@ -543,9 +541,9 @@ def generate_gbnf_grammar_from_pydantic_models(
        # root ::= UserModel | PostModel
        # ...
    """
-    processed_models = set()
+    processed_models: set[type[BaseModel]] = set()
    all_rules = []
-    created_rules = {}
+    created_rules: dict[str, list[str]] = {}
    if outer_object_name is None:
        for model in models:
            model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules)
@@ -608,7 +606,7 @@ def get_primitive_grammar(grammar):
    Returns:
        str: GBNF primitive grammar string.
    """
-    type_list = []
+    type_list: list[type[object]] = []
    if "string-list" in grammar:
        type_list.append(str)
    if "boolean-list" in grammar:
@@ -666,14 +664,14 @@ triple-quotes ::= "'''" """


 def generate_markdown_documentation(
-    pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
+    pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
    documentation_with_field_description=True
 ) -> str:
    """
    Generate markdown documentation for a list of Pydantic models.

    Args:
-        pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
+        pydantic_models (list[type[BaseModel]]): List of Pydantic model classes.
        model_prefix (str): Prefix for the model section.
        fields_prefix (str): Prefix for the fields section.
        documentation_with_field_description (bool): Include field descriptions in the documentation.
@@ -731,7 +729,7 @@ def generate_markdown_documentation(


 def generate_field_markdown(
-    field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
+    field_name: str, field_type: type[Any], model: type[BaseModel], depth=1,
    documentation_with_field_description=True
 ) -> str:
    """
@@ -739,8 +737,8 @@ def generate_field_markdown(

    Args:
        field_name (str): Name of the field.
-        field_type (Type[Any]): Type of the field.
-        model (Type[BaseModel]): Pydantic model class.
+        field_type (type[Any]): Type of the field.
+        model (type[BaseModel]): Pydantic model class.
        depth (int): Indentation depth in the documentation.
        documentation_with_field_description (bool): Include field descriptions in the documentation.

@@ -798,7 +796,7 @@ def generate_field_markdown(
    return field_text


-def format_json_example(example: dict, depth: int) -> str:
+def format_json_example(example: dict[str, Any], depth: int) -> str:
    """
    Format a JSON example into a readable string with indentation.

@@ -819,14 +817,14 @@ def format_json_example(example: dict, depth: int) -> str:


 def generate_text_documentation(
-    pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
+    pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
    documentation_with_field_description=True
 ) -> str:
    """
    Generate text documentation for a list of Pydantic models.

    Args:
-        pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
+        pydantic_models (list[type[BaseModel]]): List of Pydantic model classes.
        model_prefix (str): Prefix for the model section.
        fields_prefix (str): Prefix for the fields section.
        documentation_with_field_description (bool): Include field descriptions in the documentation.
@@ -885,7 +883,7 @@ def generate_text_documentation(


 def generate_field_text(
-    field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
+    field_name: str, field_type: type[Any], model: type[BaseModel], depth=1,
    documentation_with_field_description=True
 ) -> str:
    """
@@ -893,8 +891,8 @@ def generate_field_text(

    Args:
        field_name (str): Name of the field.
-        field_type (Type[Any]): Type of the field.
-        model (Type[BaseModel]): Pydantic model class.
+        field_type (type[Any]): Type of the field.
+        model (type[BaseModel]): Pydantic model class.
        depth (int): Indentation depth in the documentation.
        documentation_with_field_description (bool): Include field descriptions in the documentation.

@@ -1017,8 +1015,8 @@ def generate_and_save_gbnf_grammar_and_documentation(
    pydantic_model_list,
    grammar_file_path="./generated_grammar.gbnf",
    documentation_file_path="./generated_grammar_documentation.md",
-    outer_object_name: str = None,
-    outer_object_content: str = None,
+    outer_object_name: str | None = None,
+    outer_object_content: str | None = None,
    model_prefix: str = "Output Model",
    fields_prefix: str = "Output Fields",
    list_of_outputs: bool = False,
@@ -1053,8 +1051,8 @@ def generate_and_save_gbnf_grammar_and_documentation(

 def generate_gbnf_grammar_and_documentation(
    pydantic_model_list,
-    outer_object_name: str = None,
-    outer_object_content: str = None,
+    outer_object_name: str | None = None,
+    outer_object_content: str | None = None,
    model_prefix: str = "Output Model",
    fields_prefix: str = "Output Fields",
    list_of_outputs: bool = False,
@@ -1086,9 +1084,9 @@ def generate_gbnf_grammar_and_documentation(


 def generate_gbnf_grammar_and_documentation_from_dictionaries(
-    dictionaries: List[dict],
-    outer_object_name: str = None,
-    outer_object_content: str = None,
+    dictionaries: list[dict[str, Any]],
+    outer_object_name: str | None = None,
+    outer_object_content: str | None = None,
    model_prefix: str = "Output Model",
    fields_prefix: str = "Output Fields",
    list_of_outputs: bool = False,
@@ -1098,7 +1096,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries(
    Generate GBNF grammar and documentation from a list of dictionaries.

    Args:
-        dictionaries (List[dict]): List of dictionaries representing Pydantic models.
+        dictionaries (list[dict]): List of dictionaries representing Pydantic models.
        outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
        outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
        model_prefix (str): Prefix for the model section in the documentation.
@@ -1120,7 +1118,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries(
    return grammar, documentation


-def create_dynamic_model_from_function(func: Callable):
+def create_dynamic_model_from_function(func: Callable[..., Any]):
    """
    Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method.

@@ -1135,6 +1133,7 @@ def create_dynamic_model_from_function(func: Callable):
    sig = inspect.signature(func)

    # Parse the docstring
+    assert func.__doc__ is not None
    docstring = parse(func.__doc__)

    dynamic_fields = {}
@@ -1157,7 +1156,6 @@ def create_dynamic_model_from_function(func: Callable):
                f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring")

        # Add parameter details to the schema
-        param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
        param_docs.append((param.name, param_doc))
        if param.default == inspect.Parameter.empty:
            default_value = ...
@@ -1166,10 +1164,10 @@ def create_dynamic_model_from_function(func: Callable):
        dynamic_fields[param.name] = (
            param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
    # Creating the dynamic model
-    dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
+    dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)  # type: ignore[call-overload]

-    for param_doc in param_docs:
-        dynamic_model.model_fields[param_doc[0]].description = param_doc[1].description
+    for name, param_doc in param_docs:
+        dynamic_model.model_fields[name].description = param_doc.description

    dynamic_model.__doc__ = docstring.short_description

@@ -1182,16 +1180,16 @@ def create_dynamic_model_from_function(func: Callable):
    return dynamic_model


-def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
+def add_run_method_to_dynamic_model(model: type[BaseModel], func: Callable[..., Any]):
    """
    Add a 'run' method to a dynamic Pydantic model, using the provided function.

    Args:
-        model (Type[BaseModel]): Dynamic Pydantic model class.
+        model (type[BaseModel]): Dynamic Pydantic model class.
        func (Callable): Function to be added as a 'run' method to the model.

    Returns:
-        Type[BaseModel]: Pydantic model class with the added 'run' method.
+        type[BaseModel]: Pydantic model class with the added 'run' method.
    """

    def run_method_wrapper(self):
@@ -1204,15 +1202,15 @@ def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
    return model


-def create_dynamic_models_from_dictionaries(dictionaries: List[dict]):
+def create_dynamic_models_from_dictionaries(dictionaries: list[dict[str, Any]]):
    """
    Create a list of dynamic Pydantic model classes from a list of dictionaries.

    Args:
-        dictionaries (List[dict]): List of dictionaries representing model structures.
+        dictionaries (list[dict]): List of dictionaries representing model structures.

    Returns:
-        List[Type[BaseModel]]: List of generated dynamic Pydantic model classes.
+        list[type[BaseModel]]: List of generated dynamic Pydantic model classes.
    """
    dynamic_models = []
    for func in dictionaries:
@@ -1249,7 +1247,7 @@ def list_to_enum(enum_name, values):
    return Enum(enum_name, {value: value for value in values})


-def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "CustomModel") -> Type[BaseModel]:
+def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: str = "CustomModel") -> type[Any]:
    """
    Convert a dictionary to a Pydantic model class.

@@ -1258,9 +1256,9 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu
        model_name (str): Name of the generated Pydantic model.

    Returns:
-        Type[BaseModel]: Generated Pydantic model class.
+        type[BaseModel]: Generated Pydantic model class.
    """
-    fields = {}
+    fields: dict[str, Any] = {}

    if "properties" in dictionary:
        for field_name, field_data in dictionary.get("properties", {}).items():
@@ -1277,7 +1275,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu
                    if items != {}:
                        array = {"properties": items}
                        array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
-                        fields[field_name] = (List[array_type], ...)
+                        fields[field_name] = (List[array_type], ...)  # type: ignore[valid-type]
                    else:
                        fields[field_name] = (list, ...)
                elif field_type == "object":
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -26,6 +26,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
+    { "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization"   , },
    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1705133751,
-        "narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=",
+        "lastModified": 1705677747,
+        "narHash": "sha256-eyM3okYtMgYDgmYukoUzrmuoY4xl4FUujnsv/P6I/zI=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d",
+        "rev": "bbe7d8f876fbbe7c959c90ba2ae2852220573261",
        "type": "github"
      },
      "original": {
--- a/flake.nix
+++ b/flake.nix
@@ -1,3 +1,17 @@
+# The flake interface to llama.cpp's Nix expressions. The flake is used as a
+# more discoverable entry-point, as well as a way to pin the dependencies and
+# expose default outputs, including the outputs built by the CI.
+
+# For more serious applications involving some kind of customization you may
+# want to consider consuming the overlay, or instantiating `llamaPackages`
+# directly:
+#
+# ```nix
+# pkgs.callPackage ${llama-cpp-root}/.devops/nix/scope.nix { }
+# ```
+
+# Cf. https://jade.fyi/blog/flakes-arent-real/ for a more detailed exposition
+# of the relation between Nix and the Nix Flakes.
 {
  description = "Port of Facebook's LLaMA model in C/C++";

--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
        if (block->size >= size) {
            best_fit_block = alloc->n_free_blocks - 1;
        } else {
-            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
-                    __func__, size, max_avail);
+            fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
+                    __func__, tensor->name, size, max_avail);
            GGML_ASSERT(!"not enough space in the buffer");
            return;
        }
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1191,6 +1191,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                ggml_tallocr_t src_allocr = node_allocr(src);
                GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
                if (src_allocr != node_allocr) {
+                    // create a copy of the input in the split's backend
+                    size_t id = hash_id(src);
+                    if (sched->node_copies[id][cur_backend_id] == NULL) {
+                        ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+
+                        sched->node_copies[id][cur_backend_id] = tensor_copy;
+                        node_allocr(tensor_copy) = cur_allocr;
+                        SET_CAUSE(tensor_copy, "4.cpy");
+
+                        int n_inputs = sched->splits[cur_split].n_inputs++;
+                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                        sched->splits[cur_split].inputs[n_inputs] = src;
+                    }
+                    node->src[j] = sched->node_copies[id][cur_backend_id];
+
+#if 0
                    // check if the input is already in the split
                    bool found = false;
                    for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
@@ -1206,19 +1224,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
                        sched->splits[cur_split].inputs[n_inputs] = src;
                    }
-
-                    // create a copy of the input in the split's backend
-                    size_t id = hash_id(src);
-                    if (sched->node_copies[id][cur_backend_id] == NULL) {
-                        ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
-                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
-                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
-                        sched->node_copies[id][cur_backend_id] = tensor_copy;
-                        node_allocr(tensor_copy) = cur_allocr;
-                        SET_CAUSE(tensor_copy, "4.cpy");
-                    }
-                    node->src[j] = sched->node_copies[id][cur_backend_id];
+#endif
                }
            }
        }
@@ -1333,7 +1339,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
        uint64_t compute_start_us = ggml_time_us();
        if (!sched->callback_eval) {
            ggml_backend_graph_compute(split_backend, &split->graph);
-          //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
        } else {
            // similar to ggml_backend_compare_graph_backend
            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -13,6 +13,10 @@
 #include <map>
 #include <array>

+// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
+#define STRINGIZE_IMPL(...) #__VA_ARGS__
+#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
+
 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
@@ -584,13 +588,28 @@ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0,
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

 [[noreturn]]
-static __device__ void bad_arch() {
-    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+static __device__ void no_device_code(
+    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
+           file_name, line, function_name, arch);
+    (void) arch_list;
+#else
+    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
+           file_name, line, function_name, arch, arch_list);
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    __trap();

-    (void) bad_arch; // suppress unused function warning
+    (void) no_device_code; // suppress unused function warning
 }

+#ifdef __CUDA_ARCH__
+#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
+#else
+#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
+#endif // __CUDA_ARCH__
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
@@ -617,7 +636,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
    return a;
 #else
    (void) a;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 }

@@ -638,7 +657,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
    return x;
 #else
    (void) x;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }

@@ -2421,7 +2440,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
    }
 #else
    (void) vx; (void) y; (void) k;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_PASCAL
 }

@@ -2452,7 +2471,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
    // second part effectively subtracts 8 from each quant value
    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2489,7 +2508,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2524,7 +2543,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
    // second part effectively subtracts 16 from each quant value
    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2569,7 +2588,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2590,7 +2609,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp

    return d8_0*d8_1 * sumi;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2620,7 +2639,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2655,7 +2674,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(

    return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2692,7 +2711,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(

    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2732,7 +2751,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(

    return d3 * sumf;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2757,7 +2776,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(

    return d3*d8 * sumi;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2790,7 +2809,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
    return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2823,7 +2842,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
    return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2863,7 +2882,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
    return dm5f.x*sumf_d - dm5f.y*sumf_m;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2896,7 +2915,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
    return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2926,7 +2945,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(

    return d*sumf;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2957,7 +2976,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
    return d6 * sumf_d;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -3823,7 +3842,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
    return dall * sumf_d - dmin * sumf_m;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

 #endif
@@ -4006,7 +4025,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
    return d * sumf_d;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

 #endif
@@ -4264,7 +4283,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
        q8 += 8;
        aux32 >>= 7;
    }
-    const float d = (float)bq2->d * (0.5f + aux32) * (float)bq8_1[ib32].ds.x * 0.25f;
+    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
    return d * sumi;
 #else
    // iqs is 0...15
@@ -4275,7 +4294,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
    const uint8_t  * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
    const uint8_t  * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
    const uint32_t aux32 = q2[2] | (q2[3] << 16);
-    const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (float)bq8_1[ib32].ds.x * 0.25f;
+    const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
    const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
    const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
    const int8_t * q8 = bq8_1[ib32].qs + 16*il;
@@ -4320,7 +4339,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
        }
        q8 += 8;
    }
-    const float d = (float)bq2->d * (float)bq8_1[ib32].ds.x * 0.25f;
+    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
 #else
    assert(false);
@@ -4501,7 +4520,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q4_0_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4570,7 +4589,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q4_1_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4637,7 +4656,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q5_0_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4704,7 +4723,7 @@ mul_mat_q5_1(
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q5_1_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4771,7 +4790,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q8_0_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4838,7 +4857,7 @@ mul_mat_q2_K(
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q2_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4907,7 +4926,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q3_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4976,7 +4995,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q4_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -5043,7 +5062,7 @@ mul_mat_q5_K(
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q5_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -5112,7 +5131,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q6_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -5835,7 +5854,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
    }
 #else
    (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }

--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -26,15 +26,6 @@

 #define GGML_METAL_MAX_KERNELS 256

-struct ggml_metal_buffer {
-    const char * name;
-
-    void   * data;
-    size_t   size;
-
-    id<MTLBuffer> metal;
-};
-
 struct ggml_metal_kernel {
    id<MTLFunction>             function;
    id<MTLComputePipelineState> pipeline;
@@ -172,9 +163,6 @@ struct ggml_metal_context {

    dispatch_queue_t d_queue;

-    int n_buffers;
-    struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
-
    struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];

    bool support_simdgroup_reduction;
@@ -242,24 +230,20 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
    // Show all the Metal device instances in the system
    NSArray * devices = MTLCopyAllDevices();
    for (id<MTLDevice> device in devices) {
-        NSString * s = [device name];
-        GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
+        GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
    }
    [devices release]; // since it was created by a *Copy* C method
 #endif

    // Pick and show default Metal device
    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-    NSString * s = [device name];
-    GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
+    GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);

    // Configure context
    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
    ctx->device = device;
    ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
    ctx->queue  = [ctx->device newCommandQueue];
-    ctx->n_buffers = 0;
-
    ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

    // load library
@@ -277,6 +261,10 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
            NSURL * libURL = [NSURL fileURLWithPath:libPath];
            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
            ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
+            if (error) {
+                GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                return NULL;
+            }
        } else {
            GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);

@@ -315,13 +303,12 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
                //[options setFastMathEnabled:false];

                ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+                if (error) {
+                    GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                    return NULL;
+                }
            }
        }
-
-        if (error) {
-            GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
-            return NULL;
-        }
    }

    // print MTL GPU family:
@@ -531,10 +518,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
 static void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);

-    for (int i = 0; i < ctx->n_buffers; ++i) {
-        [ctx->buffers[i].metal release];
-    }
-
    for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
        if (ctx->kernels[i].pipeline) {
            [ctx->kernels[i].pipeline release];
@@ -577,51 +560,30 @@ struct ggml_backend_metal_buffer_context {
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
 //
-static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
+static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
    //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);

    const int64_t tsize = ggml_nbytes(t);

    ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;

-    // compatibility with ggml-backend
-    if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
-        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
-
-        // find the view that contains the tensor fully
-        for (int i = 0; i < buf_ctx->n_buffers; ++i) {
-            const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
-
-            //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
-            if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
-                *offs = (size_t) ioffs;
-
-                //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
-
-                return buf_ctx->buffers[i].metal;
-            }
-        }
-
-        GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
-
-        return nil;
-    }
+    struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;

    // find the view that contains the tensor fully
-    for (int i = 0; i < ctx->n_buffers; ++i) {
-        const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
+    for (int i = 0; i < buf_ctx->n_buffers; ++i) {
+        const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;

-        //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
-        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
+        //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
+        if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
            *offs = (size_t) ioffs;

-            //GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
+            //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);

-            return ctx->buffers[i].metal;
+            return buf_ctx->buffers[i].metal;
        }
    }

-    GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__);
+    GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);

    return nil;
 }
@@ -668,7 +630,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
            return true;
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
-            return ctx->support_simdgroup_reduction;
+            return ctx->support_simdgroup_reduction &&
+                (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32);
        case GGML_OP_CPY:
        case GGML_OP_DUP:
        case GGML_OP_CONT:
@@ -813,9 +776,9 @@ static bool ggml_metal_graph_compute(
            const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
            const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;

-            id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
-            id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
-            id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(ctx, dst,  &offs_dst)  : nil;
+            id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil;
+            id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil;
+            id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(dst,  &offs_dst)  : nil;

            //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
            //if (src0) {
@@ -1597,7 +1560,7 @@ static bool ggml_metal_graph_compute(
                                struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];

                                size_t offs_src_cur = 0;
-                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
+                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);

                                [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
                            }
@@ -1742,7 +1705,7 @@ static bool ggml_metal_graph_compute(
                                struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];

                                size_t offs_src_cur = 0;
-                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
+                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);

                                [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
                            }
--- a/ggml.c
+++ b/ggml.c
@@ -1418,6 +1418,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
+// hardswish(x) = x * clamp((x + 3)/6, 0, 1), hardsigmoid(x) = clamp((x + 3)/6, 0, 1); TODO: optimize performance
+inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }

 static const float GELU_COEF_A     = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
@@ -1776,9 +1779,11 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
    "GELU",
    "GELU_QUICK",
    "SILU",
+    "HARDSWISH",
+    "HARDSIGMOID",
 };

-static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
+static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");


 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -3945,6 +3950,20 @@ struct ggml_tensor * ggml_silu_back(
    return result;
 }

+// ggml_hardswish: wraps `a` in a unary node tagged GGML_UNARY_OP_HARDSWISH (built by ggml_unary)
+struct ggml_tensor * ggml_hardswish(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
+}
+
+// ggml_hardsigmoid: wraps `a` in a unary node tagged GGML_UNARY_OP_HARDSIGMOID (built by ggml_unary)
+struct ggml_tensor * ggml_hardsigmoid(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
+}
+
 // ggml_norm

 static struct ggml_tensor * ggml_norm_impl(
@@ -5344,6 +5363,31 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
    return result;
 }

+// ggml_conv_depthwise_2d: folds the two outer dims of a and b into ne[3], runs ggml_im2col on them, then ggml_mul_mat with the flattened a
+struct ggml_tensor * ggml_conv_depthwise_2d(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    struct ggml_tensor * b,
+    int                  s0,
+    int                  s1,
+    int                  p0,
+    int                  p1,
+    int                  d0,
+    int                  d1) {
+    struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
+    struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
+                                        ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
+                                        s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
+
+    struct ggml_tensor * result =
+        ggml_mul_mat(ctx,
+                ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2],  new_a->ne[3], 1),                       // [OC, 1, KH, KW] => [1, OC, 1, KH * KW]
+                ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
+
+    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
+
+    return result;
+}
 // ggml_conv_2d

 // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
@@ -7764,6 +7808,9 @@ static void ggml_compute_forward_acc_f32(
    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];

    if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (params->ith != 0) {
+            return;
+        }
        // memcpy needs to be synchronized across threads to avoid race conditions.
        // => do it in INIT phase
        memcpy(
@@ -9333,6 +9380,87 @@ static void ggml_compute_forward_silu_back(
    }
 }

+// ggml_compute_forward_hardswish
+static void ggml_compute_forward_hardswish_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0); // ggml_get_n_tasks assigns n_tasks = 1 to this op, so only thread 0 should get here
+    assert(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return; // all work happens in the COMPUTE phase
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0]; // number of elements handed to the vector kernel per row
+
+    assert(dst->nb[0]  == sizeof(float)); // elements within a row must be contiguous floats
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) { // NOTE(review): row i is located at i*nb[1] for all n rows - presumably assumes nb[2] == ne[1]*nb[1]; confirm
+        ggml_vec_hardswish_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+static void ggml_compute_forward_hardswish( // type dispatcher for the hardswish unary op
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) { // dispatch on the source tensor type
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_hardswish_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false); // only F32 is handled; any other source type hits this assertion
+            } break;
+    }
+}
+
+static void ggml_compute_forward_hardsigmoid_f32( // F32 kernel: applies ggml_vec_hardsigmoid_f32 to src0 row by row, writing dst
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0); // ggml_get_n_tasks assigns n_tasks = 1 to this op, so only thread 0 should get here
+    assert(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return; // all work happens in the COMPUTE phase
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0]; // number of elements handed to the vector kernel per row
+
+    assert(dst->nb[0]  == sizeof(float)); // elements within a row must be contiguous floats
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) { // NOTE(review): row i is located at i*nb[1] for all n rows - presumably assumes nb[2] == ne[1]*nb[1]; confirm
+        ggml_vec_hardsigmoid_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_hardsigmoid( // type dispatcher for the hardsigmoid unary op
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    switch (src0->type) { // dispatch on the source tensor type
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false); // only F32 is handled; any other source type hits this assertion
+            } break;
+    }
+}
+
+
 // ggml_compute_forward_norm

 static void ggml_compute_forward_norm_f32(
@@ -9825,11 +9953,30 @@ static void ggml_compute_forward_mul_mat(

 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        if (params->ith != 0) {
-            return;
-        }
+        const int64_t ne_plane      = ne01*ne00;
+        const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
+        UNUSED(desired_wsize);

        if (params->type == GGML_TASK_INIT) {
+            if (type != GGML_TYPE_F32) {
+                assert(params->wsize >= desired_wsize);
+                // parallelize by src0 rows
+                for (int64_t i13 = 0; i13 < ne13; i13++) {
+                    for (int64_t i12 = 0; i12 < ne12; i12++) {
+                        // broadcast src0 into src1 across 2nd,3rd dimension
+                        const int64_t i03 = i13/r3;
+                        const int64_t i02 = i12/r2;
+
+                        const void           *       x        = (char *)  src0->data    + i02*nb02          + i03*nb03;
+                              float          * const wdata    = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+                              ggml_to_float_t  const to_float = type_traits[type].to_float;
+
+                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
+                        }
+                    }
+                }
+            }
            return;
        }

@@ -9837,9 +9984,14 @@ static void ggml_compute_forward_mul_mat(
            return;
        }

+        // perform sgemm, parallelization controlled by blas lib
+        if (ith != 0) {
+            return;
+        }
+
+        //const int64_t tgemm0 = ggml_perf_time_us();
        for (int64_t i13 = 0; i13 < ne13; i13++) {
            for (int64_t i12 = 0; i12 < ne12; i12++) {
-                // broadcast src0 into src1 across 2nd,3rd dimension
                const int64_t i03 = i13/r3;
                const int64_t i02 = i12/r2;

@@ -9848,17 +10000,7 @@ static void ggml_compute_forward_mul_mat(
                      float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3);

                if (type != GGML_TYPE_F32) {
-                            float * const wdata    = params->wdata;
-                    ggml_to_float_t const to_float = type_traits[type].to_float;
-
-                    size_t id = 0;
-                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
-                        to_float((const char *) x + i01*nb01, wdata + id, ne00);
-                        id += ne00;
-                    }
-
-                    assert(id*sizeof(float) <= params->wsize);
-                    x = wdata;
+                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
                }

                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@@ -9868,6 +10010,7 @@ static void ggml_compute_forward_mul_mat(
                         0.0f,    d, ne01);
            }
        }
+        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);

        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);

@@ -9876,6 +10019,9 @@ static void ggml_compute_forward_mul_mat(
 #endif

    if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
        if (src1->type != vec_dot_type) {
            char * wdata = params->wdata;
            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10040,6 +10186,9 @@ static void ggml_compute_forward_mul_mat_id(
    #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]

   if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
        char * wdata = params->wdata;
        if (src1->type != vec_dot_type) {
            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10225,6 +10374,9 @@ static void ggml_compute_forward_out_prod_f32(
            return;
        }
 #endif
+        if (ith != 0) {
+            return;
+        }
        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
        return;
    }
@@ -10408,6 +10560,9 @@ static void ggml_compute_forward_out_prod_q_f32(
    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)

    if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
        return;
    }
@@ -10592,6 +10747,9 @@ static void ggml_compute_forward_set_f32(
    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];

    if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (params->ith != 0) {
+            return;
+        }
        // memcpy needs to be synchronized across threads to avoid race conditions.
        // => do it in INIT phase
        memcpy(
@@ -10916,6 +11074,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
    // ggml_compute_forward_dup_same_cont(params, opt0, dst);

    if (params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
        memset(dst->data, 0, ggml_nbytes(dst));
    }

@@ -10950,6 +11111,9 @@ static void ggml_compute_forward_get_rows_back_f32(
    // ggml_compute_forward_dup_same_cont(params, opt0, dst);

    if (params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
        memset(dst->data, 0, ggml_nbytes(dst));
    }

@@ -11087,6 +11251,9 @@ static void ggml_compute_forward_diag_mask_f32(
    GGML_ASSERT(n_past >= 0);

    if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (ith != 0) {
+            return;
+        }
        // memcpy needs to be synchronized across threads to avoid race conditions.
        // => do it in INIT phase
        GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@@ -12057,6 +12224,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
    GGML_ASSERT(nb10 == sizeof(float));

    if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
        memset(params->wdata, 0, params->wsize);

        // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12151,6 +12321,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
    GGML_ASSERT(nb10 == sizeof(float));

    if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
        memset(params->wdata, 0, params->wsize);

        // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12349,6 +12522,7 @@ static void ggml_compute_forward_im2col(
    }
 }

+
 // ggml_compute_forward_conv_transpose_2d

 static void ggml_compute_forward_conv_transpose_2d(
@@ -12374,6 +12548,9 @@ static void ggml_compute_forward_conv_transpose_2d(
    GGML_ASSERT(nb10 == sizeof(float));

    if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
        memset(params->wdata, 0, params->wsize);

        // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@@ -13917,6 +14094,14 @@ static void ggml_compute_forward_unary(
            {
                ggml_compute_forward_silu(params, src0, dst);
            } break;
+        case GGML_UNARY_OP_HARDSWISH:
+            {
+                ggml_compute_forward_hardswish(params, src0, dst);
+            } break;
+        case GGML_UNARY_OP_HARDSIGMOID:
+            {
+                ggml_compute_forward_hardsigmoid(params, src0, dst);
+            } break;
        default:
            {
                GGML_ASSERT(false);
@@ -13980,6 +14165,9 @@ static void ggml_compute_forward_add_rel_pos_f32(

    const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
    if (!inplace && params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
        memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
        return;
    }
@@ -16273,8 +16461,9 @@ struct ggml_compute_state_shared {
    const int n_threads;

    // synchronization primitives
-    atomic_int n_active; // num active threads
-    atomic_int node_n;   // active graph node
+    atomic_int n_active;  // num active threads
+    atomic_int node_n;    // active graph node
+    atomic_int node_task; // active graph node task phase

    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
    void * abort_callback_data;
@@ -16330,6 +16519,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
+                case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
                    {
                        n_tasks = 1;
                    } break;
@@ -16520,6 +16711,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
    return n_tasks;
 }

+static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
+    // poll the shared node index until it differs from the value passed in via *node_n; the new value is left in *node_n
+    const int last_node_n = * node_n;
+
+    while (true) {
+        if (do_yield) {
+            sched_yield(); // yield between polls only when the caller asks for it
+        }
+
+        * node_n = atomic_load(&state->shared->node_n);
+        if (* node_n != last_node_n) break;
+    }
+}
+
+static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
+    // poll the shared node_task until it differs from the value passed in via *task_phase; the new value is left in *task_phase
+    const int last_task_phase = * task_phase;
+
+    while (true) {
+        if (do_yield) {
+            sched_yield(); // yield between polls only when the caller asks for it
+        }
+
+        * task_phase = atomic_load(&state->shared->node_task);
+        if (* task_phase != last_task_phase) break;
+    }
+}
+
 static thread_ret_t ggml_graph_compute_thread(void * data) {
    struct ggml_compute_state * state = (struct ggml_compute_state *) data;

@@ -16530,7 +16749,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

    set_numa_thread_affinity(state->ith, n_threads);

-    int node_n = -1;
+    int node_n     = -1;
+    int task_phase = GGML_TASK_FINALIZE;

    while (true) {
        if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@@ -16562,7 +16782,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            // distribute new work or execute it direct if 1T
            while (++node_n < cgraph->n_nodes) {
                GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
-
                struct ggml_tensor * node = cgraph->nodes[node_n];
                const int n_tasks = ggml_get_n_tasks(node, n_threads);

@@ -16571,13 +16790,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

                params.nth = n_tasks;

-                /* INIT */
-                if (GGML_OP_HAS_INIT[node->op]) {
-                    params.type = GGML_TASK_INIT;
-                    ggml_compute_forward(&params, node);
-                }
-
                if (n_tasks == 1) {
+                    /* INIT */
+                    if (GGML_OP_HAS_INIT[node->op]) {
+                        params.type = GGML_TASK_INIT;
+                        ggml_compute_forward(&params, node);
+                    }
+
                    // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                    // they do something more efficient than spinning (?)
                    params.type = GGML_TASK_COMPUTE;
@@ -16598,38 +16817,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                }
            }

-            atomic_store(&state->shared->n_active, n_threads);
-            atomic_store(&state->shared->node_n,   node_n);
+            task_phase = GGML_TASK_INIT;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_n,    node_n);
+            atomic_store(&state->shared->node_task, task_phase);
        } else {
-            // wait for other threads to finish
-            const int last = node_n;
-
-            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
-
-            while (true) {
-                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
-                //       depending on the workload and the operating system.
-                //       since it is not clear what is the best approach, it should potentially become user-configurable
-                //       ref: https://github.com/ggerganov/ggml/issues/291
-                // UPD:  adding the do_yield flag seems to resolve the issue universally
-                if (do_yield) {
-                    sched_yield();
-                }
-
-                node_n = atomic_load(&state->shared->node_n);
-                if (node_n != last) break;
-            };
+            ggml_graph_compute_thread_sync_node(&node_n,     state, false);
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
        }

        // check if we should stop
        if (node_n >= cgraph->n_nodes) break;

-        /* COMPUTE */
+        /* INIT & COMPUTE */
        struct ggml_tensor * node = cgraph->nodes[node_n];
        const int n_tasks = ggml_get_n_tasks(node, n_threads);

        struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_COMPUTE,
+            /*.type  =*/ GGML_TASK_INIT,
            /*.ith   =*/ state->ith,
            /*.nth   =*/ n_tasks,
            /*.wsize =*/ cplan->work_size,
@@ -16637,8 +16842,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
        };

        if (state->ith < n_tasks) {
+            if (GGML_OP_HAS_INIT[node->op]) {
+                ggml_compute_forward(&params, node);
+            }
+        }
+
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            task_phase = GGML_TASK_COMPUTE;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_task, task_phase);
+        }
+        else {
+            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+            //       depending on the workload and the operating system.
+            //       since it is not clear what is the best approach, it should potentially become user-configurable
+            //       ref: https://github.com/ggerganov/ggml/issues/291
+            // UPD:  adding the do_yield flag seems to resolve the issue universally
+            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
+            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
+        }
+
+        if (state->ith < n_tasks) {
+            params.type = GGML_TASK_COMPUTE;
            ggml_compute_forward(&params, node);
        }
+
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            task_phase = GGML_TASK_FINALIZE;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_task, task_phase);
+        }
+        else {
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
+        }
    }

    return GGML_EXIT_SUCCESS;
@@ -16695,8 +16931,11 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                    if (ggml_compute_forward_mul_mat_use_blas(node)) {
                        if (node->src[0]->type != GGML_TYPE_F32) {
-                            // here we need memory just for single 2D matrix from src0
-                            cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+                            // here we need memory for fully dequantized matrix from src0
+                            // take into account that src0 can be broadcasted into src1[2,3]
+                            cur = ggml_type_size(GGML_TYPE_F32)
+                                * node->src[0]->ne[0]*node->src[0]->ne[1]
+                                * node->src[1]->ne[2]*node->src[1]->ne[3];
                        }
                    } else
 #endif
@@ -16850,6 +17089,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
        /*.n_threads               =*/ n_threads,
        /*.n_active                =*/ n_threads,
        /*.node_n                  =*/ -1,
+        /*.node_task               =*/ GGML_TASK_FINALIZE,
        /*.abort_callback          =*/ NULL,
        /*.abort_callback_data     =*/ NULL,
    };
--- a/ggml.h
+++ b/ggml.h
@@ -489,6 +489,8 @@ extern "C" {
        GGML_UNARY_OP_GELU,
        GGML_UNARY_OP_GELU_QUICK,
        GGML_UNARY_OP_SILU,
+        GGML_UNARY_OP_HARDSWISH,
+        GGML_UNARY_OP_HARDSIGMOID,

        GGML_UNARY_OP_COUNT,
    };
@@ -1032,6 +1034,16 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // hardswish(x) = x * relu6(x + 3) / 6
+    GGML_API struct ggml_tensor * ggml_hardswish(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // hardsigmoid(x) = relu6(x + 3) / 6
+    GGML_API struct ggml_tensor * ggml_hardsigmoid(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // normalize along rows
    GGML_API struct ggml_tensor * ggml_norm(
            struct ggml_context * ctx,
@@ -1483,6 +1495,17 @@ extern "C" {
            int                  d1,
            bool                 is_2D);

+    GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                  s0,
+            int                  s1,
+            int                  p0,
+            int                  p1,
+            int                  d0,
+            int                  d1);
+
    GGML_API struct ggml_tensor * ggml_conv_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -107,7 +107,7 @@ class GGUFReader:
        offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
        new_align = self.fields.get('general.alignment')
        if new_align is not None:
-            if new_align.types != [GGUFValueType.UINT64]:
+            if new_align.types != [GGUFValueType.UINT32]:
                raise ValueError('Bad type for general.alignment field')
            self.alignment = new_align.parts[-1][0]
        padding = offs % self.alignment
--- a/llama.cpp
+++ b/llama.cpp
--- a/llama.h
+++ b/llama.h
@@ -107,6 +107,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_IQ2_XXS       = 19, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ2_XS        = 20, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q2_K_S        = 21, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_XS       = 22, // except 1d tensors

        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };
@@ -774,6 +775,14 @@ extern "C" {
                           float   p,
                          size_t   min_keep);

+    /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
+    LLAMA_API void llama_sample_entropy(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates_p,
+                           float   min_temp,
+                           float   max_temp,
+                           float   exponent_val);
+
    LLAMA_API void llama_sample_temp(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
--- a/mypy.ini
+++ b/mypy.ini
@@ -4,3 +4,4 @@ allow_untyped_calls = true
 allow_untyped_defs = true
 allow_incomplete_defs = true
 disable_error_code = import-untyped
+warn_return_any = false
--- a/unicode.h
+++ b/unicode.h
@@ -2,8 +2,9 @@

 #include <cassert>
 #include <stdexcept>
-#include <vector>
+#include <string>
 #include <unordered_map>
+#include <vector>

 static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
 {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
Author	SHA1	Message	Date
Paul Tsochantaris	6dd3c28c9c	metal : remove unused `n_buffers` and `buffers` (#5129 )	2024-01-26 14:16:07 +02:00
Riceball LEE	38b431de23	gguf : fix "general.alignment" type in gguf_reader.py (#5136 )	2024-01-26 11:10:28 +02:00
Georgi Gerganov	aad0b01d73	readme : update hot topics	2024-01-26 10:52:33 +02:00
Kawrakow	1182cf4d4f	Another bucket sort (#5109 ) * Initial bucket sort * Bucket sort: slightly better version * Bucket sort: another minor improvement --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>	2024-01-26 09:14:39 +02:00
XiaotaoChen	fe54033b69	readme : add MobileVLM 1.7B/3B to the supported models list (#5107 ) Co-authored-by: Chenxiaotao03 <chenxiaotao03@meituan.com>	2024-01-25 22:14:32 +02:00
l3utterfly	5eaf9964fc	llama : dynamic temperature sampling (#4972 ) * implemented dynamic temperature sampling from koboldcpp * removed trailing whitespace * removed unused temp parameter in llama_sample_entropy * exposed exponent_val in dynamic temp sampler * added debug check for printf statements * use nullptr in llama_sample_softmax call during llama_sample_entropy this avoids counting the time taken stats twice Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * return earlier if there is only 1 candidate (i.e. max_entropy == 0) * reformat 't' case in llama_sample_queue Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com> * check for one or zero candidates case in llama_sample_entropy --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>	2024-01-25 22:06:22 +02:00
Jared Van Bortel	d292f4f204	examples : make pydantic scripts pass mypy and support py3.8 (#5099 )	2024-01-25 14:51:24 -05:00
Valentin Konovalov	256d1bb0dd	android : use release cmake build type by default (#5123 )	2024-01-25 19:05:51 +02:00
Kawrakow	faa3526a1e	Fix Q3_K_XS for MoE models (#5113 ) Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>	2024-01-25 17:58:53 +02:00
Georgi Gerganov	ddc5a5033f	metal : show compile log messages	2024-01-25 11:26:17 +02:00
Engininja2	cd4fddb29f	cuda : fix 2-bit quants on amd hip (#5105 ) * cuda : fix 2-bit quants on amd hip * use __low2float intrinsic function for new quants	2024-01-24 23:18:15 +01:00
Michael Hueschen	c9b316c78f	nix-shell: use addToSearchPath thx to @SomeoneSerge for the suggestion!	2024-01-24 12:39:29 +00:00
Michael Hueschen	bf63d695b8	nix: add cc to devShell LD_LIBRARY_PATH this fixes the error I encountered when trying to run the convert.py script in a venv: ``` $ nix develop [...]$ source .venv/bin/activate (.venv) [...]$ pip3 install -r requirements.txt <... clipped ...> [...]$ python3 ./convert.py Traceback (most recent call last): File "/home/mhueschen/projects-reference/llama.cpp/./convert.py", line 40, in <module> from sentencepiece import SentencePieceProcessor File "/home/mhueschen/projects-reference/llama.cpp/.venv/lib/python3.11/site-packages/sentencepiece/__init__.py", line 13, in <module> from . import _sentencepiece ImportError: libstdc++.so.6: cannot open shared object file: No such file or directory ``` however, I am not sure this is the cleanest way to address this linker issue...	2024-01-24 12:39:29 +00:00
slaren	1387ea2117	llama : pre-allocate input tensors in a separate buffer (#5100 )	2024-01-24 12:48:14 +01:00
Georgi Gerganov	26d607608d	metal : disable support for MUL_MAT F32 x F16	2024-01-23 15:50:56 +02:00
Kawrakow	44879ee885	Additional KL-divergence statistics (#5081 ) * perplexity: add top-token probability * perplexity: add additional KL-divergence statistics * perplexity: a better organized KL-divergence statistics output --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>	2024-01-23 15:17:20 +02:00
Johannes Gäßler	9ecdd12e95	CUDA: more info when no device code (#5088 )	2024-01-23 13:31:56 +01:00
Georgi Gerganov	89758723c7	minor : clean-up some warnings and style (#5094 ) * minor : clean-up some warnings and style ggml-ci * ggml : add comment	2024-01-23 14:12:57 +02:00
Xuan Son Nguyen	2bed4aa3f3	devops : add intel oneapi dockerfile (#5068 ) Co-authored-by: Xuan Son Nguyen <xuanson.nguyen@snowpack.eu>	2024-01-23 09:11:39 +02:00
Michael Coppola	125d03a503	llama.vim : added api key support (#5090 ) Co-authored-by: Michael Coppola <info@michaeljcoppola.com>	2024-01-23 08:51:27 +02:00
slaren	011e8ec577	llama : fix not enough space in buffer with Qwen (#5086 )	2024-01-22 23:42:41 +01:00
Kawrakow	6f9939d119	KL-divergence (#5076 ) * kl-divergence: be able to save all logits to a file * Add ability to compute KL-divergence --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>	2024-01-22 16:10:14 +02:00
Reinforce-II	780e24a22e	ggml : parallelize FP32 conversion when using BLAS (#5045 ) * make GGML_TASK_INIT phase can be run in multithread * multithreaded dequantize in mul_mat when using blas library * minor fixes * update outdated comment * fix coding style * simplify code Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-01-22 15:15:08 +02:00
XiaotaoChen	3ce7e8f8e7	llava : MobileVLM support (#4954 ) * MobileVLM native implementation * delete depthwise_conv_2d and permute_cpy related code, replace the two with the existing functions, and opt ldp definition, support LLAMA_PERF option for CMake * move android script to example/llava directory * Fix the editor config checks --------- Co-authored-by: Chenxiaotao03 <chenxiaotao03@meituan.com>	2024-01-22 15:09:35 +02:00
Someone Serge	b2d80e105a	flake.nix: add a comment about flakes vs nix	2024-01-22 12:19:30 +00:00
Someone Serge	28603cd283	nix: add a comment on the many nixpkgs-with-cuda instances	2024-01-22 12:19:30 +00:00
Someone Serge	5e97ec91ae	nix: add a comment about makeScope	2024-01-22 12:19:30 +00:00
Someone Serge	7251870780	nix: refactor the cleanSource rules	2024-01-22 12:19:30 +00:00
Someone Serge	fe8b3c0d4b	workflows: nix-ci: drop the redundant "paths" filter	2024-01-22 12:19:30 +00:00
Someone Serge	f4dd059259	workflows: nix-build-aarch64: rate limit	2024-01-22 12:19:30 +00:00
Someone Serge	f7276f7500	workflows: nix-ci: rebuild on flake.lock updates	2024-01-22 12:19:30 +00:00
Kawrakow	15bceec2d7	imatrix : keep intermediate imatrix results (#5077 ) Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>	2024-01-22 14:18:43 +02:00
compilade	d6bd4d46dd	llama : support StableLM 2 1.6B (#5052 ) * llama : support StableLM 2 1.6B * convert : fix Qwen's set_vocab wrongly naming all special tokens [PAD{id}] * convert : refactor Qwen's set_vocab to use it for StableLM 2 too * nix : add tiktoken to llama-python-extra * convert : use presence of tokenizer.json to determine StableLM tokenizer loader It's a less arbitrary heuristic than the vocab size.	2024-01-22 13:21:52 +02:00
Daniel Bevenius	152d9d05e0	finetune : print sample-start/include-sample-start (#5072 ) This commit adds `--sample-start` and `--include-sample-start` to the output from the main function in finetune.cpp. The motivation for this is that even though these are set explicitly by the user via the command line, if one forgets to set them then it is useful to have their values printed out. Otherwise it is possible to go through the whole training process before realizing that the values are not what one expected. Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>	2024-01-22 13:11:01 +02:00
Kawrakow	66d575c45c	llama : add Q3_K_XS (#5060 ) * Add Q3_K_XS - intermediate size between Q2_K and Q3_K_S * Q3_K_XS: quantize first 1/8 of ffn_down layers with Q4_K Together with an importance matrix, this brings perplexity for LLaMA-v2-70B below the perplexity of the former Q2_K with an 800 MB smaller quantized model size. --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>	2024-01-22 12:43:33 +02:00
bobqianic	57744932c6	ci : fix Windows CI by updating Intel SDE version (#5053 )	2024-01-22 10:55:05 +02:00
Shijie	3466c6ebcf	llama : add more qwen2 models (#5071 )	2024-01-22 09:33:19 +02:00
iSma	504dc37be8	Revert LLAMA_NATIVE to OFF in flake.nix (#5066 )	2024-01-21 21:37:13 +00:00
kuronekosaiko	05490fad7f	add safetensors support to convert-lora-to-ggml.py (#5062 ) * add safetensors support to convert-lora-to-ggml.py * Update convert-lora-to-ggml.py Remove white space in line 69.	2024-01-21 17:28:14 +01:00
bobqianic	6c5629d4d2	add `#include <string>` to unicode.h (#5051 ) Co-authored-by: Jared Van Bortel <jared@nomic.ai>	2024-01-21 10:17:35 -05:00
Kawrakow	7dcbe39d36	Add ability to evaluate multiple choice tasks (#5047 ) * TruthfulQA: 1st attempt, does not look like it is working The same implementation can be used for HellaSwag as well, so I converted a HellaSwag validation dataset to the binary format used here and tested with that. The score is only around 50, so something is not quite right. * TruthfulQA: works but the result is bad I know it works because if I convert the HellaSwag validation data to the binary format used in the truthful_qa_score() function I get the exact same result as from the hellaswag_score() function. But I guess, the questions are tricky and the way I have done the combination of question + answer is very likely not the best. The TruthfulQA validation dataset contains 817 questions, with random chance result around 19%. With this version I get 29.1% for Mistral-7B and 55.2% for Mistral-7B-Instruct-v0.2. The HF leader board results for these two models are 42.2% and 68.3%, respectively. * TruthfulQA: fix random sample * TruthfulQA: prepare tasks in parallel for large test datasets * Rename truthful_qa to multiple_choice * Make MSVC happy I had forgotten that MSVC does not make constexpr's available inside a lambda. --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>	2024-01-21 14:42:44 +02:00
Kawrakow	726c0fa9a2	Slightly faster imatrix (#5050 ) * imatrix: speedup by avoiding unnecessary allocations and copies * imatrix: add --no-ppl option to skip PPL calculations altogether --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>	2024-01-21 08:01:20 +02:00
Georgi Gerganov	942c0107a7	flake.lock: Update (#5054 ) Flake lock file updates: • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/9b19f5e77dd906cb52dade0b7bd280339d2a1f3d' (2024-01-13) → 'github:NixOS/nixpkgs/bbe7d8f876fbbe7c959c90ba2ae2852220573261' (2024-01-19) Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>	2024-01-21 03:17:27 +00:00
Jared Van Bortel	b43ebde3b0	convert : partially revert PR #4818 (#5041 )	2024-01-20 18:14:18 -05:00
Jared Van Bortel	97c1549808	perplexity : fix MSVC build after #5020 (#5043 ) * perplexity : fix MSVC build after #5020 * try a different fix	2024-01-20 17:08:08 +02:00
slaren	6df465a91d	llama : run all KQV ops on the CPU with no KV offload (#5049 ) ggml-ci	2024-01-20 17:05:49 +02:00
Herman Semenov	77bc1bbd05	cmake : add support for ccache (#5002 ) * Added support ccache for speedup recompilation * cmake : option to disable ccache --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-01-20 10:11:31 +02:00
adel boussaken	48e2b13372	Add a dart/flutter binding to README.md (#4882 )	2024-01-20 03:05:43 -05:00