Compare commits

...

15 Commits

Author SHA1 Message Date
Ruben Ortlam
a30369d515 cpu: fix ARM NEON nvfp4 vec dot 2026-04-06 10:27:03 +02:00
Yarden Tal
25eec6f327 hexagon: slight optimization for argsort output init (#21463) 2026-04-05 18:30:25 -07:00
anchortense
58190cc84d llama : correct platform-independent loading of BOOL metadata (#21428)
* model-loader : fix GGUF bool array conversion

* model-loader : fix remaining GGUF bool pointer uses
2026-04-06 01:40:38 +02:00
Richard Davison
af76639f72 model : add HunyuanOCR support (#21395)
* HunyuanOCR: add support for text and vision models

- Add HunyuanOCR vision projector (perceiver-based) with Conv2d merge
- Add separate HUNYUAN_OCR chat template (content-before-role format)
- Handle HunyuanOCR's invalid pad_token_id=-1 in converter
- Fix EOS/EOT token IDs from generation_config.json
- Support xdrope RoPE scaling type
- Add tensor mappings for perceiver projector (mm.before_rms, mm.after_rms, etc.)
- Register HunYuanVLForConditionalGeneration for both text and mmproj conversion

* fix proper mapping

* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

* Update tools/mtmd/clip.cpp

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

* address comments

* update

* Fix typecheck

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-05 23:32:14 +02:00
Ludovic Henry
761797ffdf ci : use default RISE RISC-V Runners (#21263) 2026-04-05 20:29:48 +02:00
ddh0
5d3a4a7da5 server : fix logging of build + system info (#21460)
This PR changes the logging that occurs at startup of llama-server.
Currently it is redundant (CPU information is printed twice) and is
missing the build + commit info.
2026-04-05 16:14:02 +02:00
M1DNYT3
c08d28d088 ci: lower cuda12 floor to 12.8.1 for broader host compatibility (#21438)
Co-authored-by: M1DNYT3 <m1dnyt3@MacBookPro.lan>
2026-04-05 09:04:00 +08:00
Nicholas Sparks
661e9acb36 ci: fix vulkan workflow referencing non-existent action (#21442) 2026-04-05 08:59:51 +08:00
Aldehir Rojas
b8635075ff common : add gemma 4 specialized parser (#21418)
* common : add gemma4 dedicated parser

* cont : add '<|tool_response>' as eog

* cont : emit JSON from Gemma4 tool call AST

* cont : more fixes

* cont : refactor convert function

* cont : refine rules and mapping

* cont : add more tests

* cont : clean up

* cont : remove autoparser gemma4 implementation

* cont : more cleanup

* cont : rename gemma4.jinja to match the others

* cont : add custom template to support interleaved thinking

* cont : preserve reasoning in model turns

* cont : fix initializer error

* cont : fix unused vars

* cont : fix accidental static

* cont : fix specialized_template signature

* fix extra semicolon

* remove debug line and extra space [no ci]
2026-04-04 20:39:00 +02:00
Dan Hoffman
9c699074c9 server: Fix undefined timing measurement errors in server context (#21201)
Co-authored-by: Dan Hoffman <dhoffman@cyket.net>
2026-04-04 22:11:19 +08:00
Adrien Gallouët
d01f6274c0 common : respect specified tag, only fall back when tag is empty (#21413)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-04-04 15:08:03 +02:00
SamareshSingh
650bf14eb9 llama-model: read final_logit_softcapping for Gemma 4 (#21390) 2026-04-04 13:05:10 +02:00
Aman Gupta
b7ad48ebda llama: add custom newline split for Gemma 4 (#21406) 2026-04-04 15:06:34 +08:00
Reese Levine
d006858316 ggml-webgpu: move from parameter buffer pool to single buffer with offsets (#21278)
* Work towards removing bitcast

* Move rest of existing types over

* Add timeout back to wait and remove synchronous set_tensor/memset_tensor

* move to unpackf16 for wider compatibility

* cleanup

* Remove deadlock condition in free_bufs

* Start work on removing parameter buffer pools

* Simplify and optimize further

* simplify profile futures

* Fix stride

* Try using a single command buffer per batch

* formatting
2026-04-03 11:40:14 -07:00
Masato Nakasaka
e439700992 ci: Add Windows Vulkan backend testing on Intel (#21292)
* experimenting CI

* Experimenting CI fix for MinGW

* experimenting CI on Windows

* modified script for integration with VisualStudio

* added proxy handling

* adding python version for Windows execution

* fix iterator::end() dereference

* fixed proxy handling

* Fix errors occurring on Windows

* fixed ci script

* Reverted to master

* Stripping test items to simplify Windows test

* adjusting script for windows testing

* Changed shell

* Fixed shell

* Fixed shell

* Fix CI setting

* Fix CI setting

* Fix CI setting

* Experimenting ci fix

* Experimenting ci fix

* Experimenting ci fix

* Experimenting ci fix

* experimenting fix for unit test error

* Changed to use BUILD_LOW_PERF to skip python tests

* Fix CI

* Added option to specify Ninja generator

* Reverted proxy related changes
2026-04-03 20:16:44 +03:00
44 changed files with 1554 additions and 923 deletions

View File

@@ -35,7 +35,7 @@ env:
jobs:
ubuntu-riscv64-native-sanitizer:
runs-on: RISCV64
runs-on: ubuntu-24.04-riscv
continue-on-error: true
@@ -50,17 +50,18 @@ jobs:
sudo apt-get update
# Install necessary packages
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache git-lfs
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 cmake build-essential wget git-lfs
# Set gcc-14 and g++-14 as the default compilers
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
# Install Rust stable version
rustup install stable
rustup default stable
if ! which rustc; then
# Install Rust stable version
sudo apt-get install -y rustup
rustup install stable
rustup default stable
fi
git lfs install
@@ -73,23 +74,12 @@ jobs:
id: checkout
uses: actions/checkout@v6
- name: Setup ccache
run: |
# Unique cache directory per matrix combination
export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}"
mkdir -p "$CCACHE_DIR"
# Configure ccache
ccache --set-config=max_size=5G
ccache --set-config=compression=true
ccache --set-config=compression_level=6
ccache --set-config=cache_dir="$CCACHE_DIR"
ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
ccache --set-config=hash_dir=false
# Export for subsequent steps
echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
# FIXME: Enable when ggml-org/ccache-action works on riscv64
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.21
# with:
# key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build

View File

@@ -213,6 +213,27 @@ jobs:
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-win-intel-vulkan:
runs-on: [self-hosted, Windows, X64, Intel]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
shell: C:\msys64\usr\bin\bash.exe --noprofile --norc -eo pipefail "{0}"
env:
MSYSTEM: UCRT64
CHERE_INVOKING: 1
PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
run: |
vulkaninfo --summary
# Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
# a valid python environment for testing
LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp
ggml-ci-intel-openvino-gpu-low-perf:
runs-on: [self-hosted, Linux, Intel, OpenVINO]

View File

@@ -72,7 +72,7 @@ jobs:
- name: Setup Vulkan SDK
if: steps.cache-sdk.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-vulkan-llvmpipe
uses: ./.github/actions/linux-setup-vulkan
with:
path: ./vulkan_sdk
version: ${{ env.VULKAN_SDK_VERSION }}

View File

@@ -996,7 +996,7 @@ jobs:
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
ubuntu-cpu-riscv64-native:
runs-on: RISCV64
runs-on: ubuntu-24.04-riscv
steps:
- name: Install dependencies
@@ -1004,24 +1004,21 @@ jobs:
sudo apt-get update
# Install necessary packages
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache git-lfs
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 cmake build-essential libssl-dev wget git-lfs
# Set gcc-14 and g++-14 as the default compilers
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
# Install Rust stable version
rustup install stable
rustup default stable
if ! which rustc; then
# Install Rust stable version
sudo apt-get install -y rustup
rustup install stable
rustup default stable
fi
git lfs install
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Check environment
run: |
uname -a
@@ -1031,25 +1028,17 @@ jobs:
cmake --version
rustc --version
- name: Setup ccache
run: |
# Set unique cache directory for this job
export CCACHE_DIR="$HOME/.ccache/cpu-cmake-rv64-native"
mkdir -p "$CCACHE_DIR"
- name: Clone
id: checkout
uses: actions/checkout@v6
# Configure ccache for optimal performance
ccache --set-config=max_size=5G
ccache --set-config=compression=true
ccache --set-config=compression_level=6
ccache --set-config=cache_dir="$CCACHE_DIR"
# Enable more aggressive caching
ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
ccache --set-config=hash_dir=false
# Export for subsequent steps
echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
# FIXME: Enable when ggml-org/ccache-action works on riscv64
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.21
# with:
# key: ubuntu-cpu-riscv64-native
# evict-old-files: 1d
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build

View File

@@ -73,8 +73,8 @@ jobs:
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.9.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.9.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
{ "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },

View File

@@ -119,6 +119,11 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
fi
# Build shared libs on Windows
# to reduce binary size and avoid errors in library loading unit tests
if uname -s | grep -qi nt; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DBUILD_SHARED_LIBS=ON"
fi
fi
if [ ! -z ${GG_BUILD_WEBGPU} ]; then

View File

@@ -8,109 +8,11 @@
#include "nlohmann/json.hpp"
#include "peg-parser.h"
#include <algorithm>
#include <stdexcept>
#include <string>
using json = nlohmann::ordered_json;
namespace {
// Gemma4-specific PEG builder extending the standard chat builder.
// Adds value type parsers that use <|\"|> as string delimiters
// instead of JSON's double quotes, and disables json-to-schema
// conversion for these types.
class common_peg_gemma4_builder {
common_chat_peg_builder & p_;
static constexpr const char * QUOTE = "<|\"|>";
public:
explicit common_peg_gemma4_builder(common_chat_peg_builder & p) : p_(p) {}
common_peg_parser gemma4_string() {
return p_.rule("gemma4-string", [&]() {
return p_.literal(QUOTE) + p_.until(QUOTE) + p_.literal(QUOTE);
});
}
common_peg_parser gemma4_number() {
return p_.rule("gemma4-number", [&]() {
auto digit1_9 = p_.chars("[1-9]", 1, 1);
auto digits = p_.chars("[0-9]");
auto int_part = p_.choice({p_.literal("0"), p_.sequence({digit1_9, p_.chars("[0-9]", 0, -1)})});
auto frac = p_.sequence({p_.literal("."), digits});
auto exp = p_.sequence({p_.choice({p_.literal("e"), p_.literal("E")}),
p_.optional(p_.chars("[+-]", 1, 1)), digits});
auto not_number_continuation = p_.negate(p_.chars("[0-9.eE+-]", 1, 1));
return p_.sequence({p_.optional(p_.literal("-")), int_part, p_.optional(frac),
p_.optional(exp), not_number_continuation});
});
}
common_peg_parser gemma4_bool() {
return p_.rule("gemma4-bool", [&]() {
return p_.choice({p_.literal("true"), p_.literal("false")});
});
}
common_peg_parser gemma4_null() {
return p_.rule("gemma4-null", [&]() {
return p_.literal("null");
});
}
common_peg_parser gemma4_dict() {
return p_.rule("gemma4-dict", [&]() {
auto ws = p_.space();
auto key = p_.until(":");
auto member = p_.sequence({key, p_.literal(":"), ws, gemma4_value()});
auto members = p_.sequence({member, p_.zero_or_more(p_.sequence({p_.literal(","), ws, member}))});
return p_.sequence({
p_.literal("{"), ws,
p_.choice({p_.literal("}"), p_.sequence({members, ws, p_.literal("}")})})
});
});
}
common_peg_parser gemma4_array() {
return p_.rule("gemma4-array", [&]() {
auto ws = p_.space();
auto elements = p_.sequence({gemma4_value(), p_.zero_or_more(p_.sequence({p_.literal(","), ws, gemma4_value()}))});
return p_.sequence({
p_.literal("["), ws,
p_.choice({p_.literal("]"), p_.sequence({elements, ws, p_.literal("]")})})
});
});
}
common_peg_parser gemma4_value() {
return p_.rule("gemma4-value", [&]() {
return p_.choice({gemma4_string(), gemma4_dict(), gemma4_array(),
gemma4_number(), gemma4_bool(), gemma4_null()});
});
}
// Select the appropriate value parser based on JSON schema type.
// Does NOT use schema() - the gemma4 types are pure PEG without
// JSON schema metadata, so GBNF is generated directly from the
// PEG structure.
common_peg_parser gemma4_value_for_type(const json & schema) {
if (!schema.contains("type") || !schema.at("type").is_string()) {
return gemma4_value();
}
std::string type = schema.at("type").get<std::string>();
if (type == "string") { return gemma4_string(); }
if (type == "number") { return gemma4_number(); }
if (type == "integer") { return gemma4_number(); }
if (type == "boolean") { return gemma4_bool(); }
if (type == "object") { return gemma4_dict(); }
if (type == "array") { return gemma4_array(); }
return gemma4_value();
}
};
} // anonymous namespace
// Helper to iterate over tools/functions
static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
for (const auto & tool : tools) {
@@ -142,9 +44,7 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
// Create the result structure
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = (autoparser.tools.format.mode == tool_format::TAG_WITH_GEMMA4_DICT)
? COMMON_CHAT_FORMAT_PEG_GEMMA4
: COMMON_CHAT_FORMAT_PEG_NATIVE;
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.preserved_tokens = autoparser.preserved_tokens;
auto parser = autoparser.build_parser(inputs);
@@ -271,8 +171,6 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
return build_tool_parser_tag_json(ctx);
case tool_format::TAG_WITH_TAGGED:
return build_tool_parser_tag_tagged(ctx);
case tool_format::TAG_WITH_GEMMA4_DICT:
return build_tool_parser_tag_gemma4_dict(ctx);
default:
LOG_ERR("[ERROR] Template seems to support tool calls, but failed to determine tool format. Tool calling will not work properly. "
"Check for a fixed template for your model in the models/templates directory of your llama.cpp installation or "
@@ -586,145 +484,4 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
p.end();
}
common_peg_parser analyze_tools::build_tool_parser_tag_gemma4_dict(parser_build_context & ctx) const {
auto & p = ctx.p;
const auto & inputs = ctx.inputs;
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
common_peg_gemma4_builder g4(p);
static const std::string QUOTE = "<|\"|>";
common_peg_parser tool_choice = p.choice();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & func = tool.at("function");
std::string name = func.at("name");
const auto & params = func.at("parameters");
if (!params.contains("properties") || !params.at("properties").is_object()) {
auto func_parser = p.atomic(
p.tool_open(p.literal(function.name_prefix) + p.tool_name(p.literal(name)) + p.literal("{")) +
p.tool_args(p.eps()) +
p.tool_close(p.literal("}")));
tool_choice |= p.rule("tool-" + name, func_parser);
return;
}
const auto & properties = params.at("properties");
std::set<std::string> required;
if (params.contains("required") && params.at("required").is_array()) {
params.at("required").get_to(required);
}
// Build per-argument parsers, sorted alphabetically (matching template's dictsort)
struct arg_entry {
std::string param_name;
common_peg_parser parser;
};
std::vector<arg_entry> arg_entries;
for (const auto & [param_name, param_schema] : properties.items()) {
std::string type = "object";
if (param_schema.contains("type")) {
const auto & type_v = param_schema.at("type");
if (type_v.is_string()) {
type_v.get_to(type);
} else if (type_v.is_array()) {
// Handle nullable types like ["string", "null"]
for (const auto & t : type_v) {
if (t.is_string() && t.get<std::string>() != "null") {
type = t.get<std::string>();
break;
}
}
}
}
// Infer string type from enum values when type is unspecified
if (type == "object" && param_schema.contains("enum")) {
const auto & enum_vals = param_schema.at("enum");
if (enum_vals.is_array()) {
for (const auto & v : enum_vals) {
if (v.is_string()) {
type = "string";
break;
}
}
}
}
common_peg_parser value_parser = p.eps();
if (type == "string") {
// String values are delimited by <|"|>...<|"|>
value_parser =
p.literal(QUOTE) +
p.tool_arg_string_value(p.schema(p.until(QUOTE),
"tool-" + name + "-arg-" + param_name + "-schema", param_schema, true)) +
p.literal(QUOTE);
} else if (type == "number" || type == "integer") {
value_parser = p.tool_arg_value(g4.gemma4_number());
} else if (type == "boolean") {
value_parser = p.tool_arg_value(g4.gemma4_bool());
} else if (type == "null") {
value_parser = p.tool_arg_value(g4.gemma4_null());
} else if (type == "object") {
value_parser = p.tool_arg_value(g4.gemma4_dict());
} else if (type == "array") {
value_parser = p.tool_arg_value(g4.gemma4_array());
} else {
value_parser = p.tool_arg_value(g4.gemma4_value());
}
auto arg = p.tool_arg(
p.tool_arg_open(p.tool_arg_name(p.literal(param_name)) + p.literal(":")) +
value_parser +
p.tool_arg_close(p.eps()));
arg_entries.push_back({param_name, p.rule("tool-" + name + "-arg-" + param_name, arg)});
}
// Sort alphabetically to match Jinja's dictsort
std::sort(arg_entries.begin(), arg_entries.end(), [](const auto & a, const auto & b) {
return a.param_name < b.param_name;
});
// Build arg sequence: any arg, then zero-or-more comma-separated additional args
common_peg_parser args_seq = p.eps();
if (!arg_entries.empty()) {
common_peg_parser any_arg = p.choice();
for (auto & entry : arg_entries) {
any_arg |= entry.parser;
}
args_seq = p.optional(
any_arg + p.repeat(p.literal(",") + any_arg, 0, (int) arg_entries.size() - 1));
}
// Full parser: call:name{args}
auto func_parser = p.atomic(
p.tool_open(p.literal(function.name_prefix) + p.tool_name(p.literal(name)) + p.literal("{")) +
p.tool_args(args_seq) +
p.tool_close(p.literal("}")));
tool_choice |= p.rule("tool-" + name, func_parser);
});
// Wrap each call in <|tool_call>...</tool_call|>
auto wrapped_call = p.literal(format.per_call_start) + tool_choice + p.literal(format.per_call_end);
common_peg_parser tool_calls = p.eps();
if (inputs.parallel_tool_calls) {
tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
} else {
tool_calls = p.trigger_rule("tool-call", wrapped_call);
}
if (!force_tools) {
tool_calls = p.optional(tool_calls);
}
auto content_before_tools = p.until_one_of({ format.per_call_start, ctx.reasoning->start });
return ctx.reasoning_parser +
(force_tools ? p.eps() : p.optional(p.content(content_before_tools) + p.optional(ctx.reasoning_parser))) +
tool_calls + p.end();
}
} // namespace autoparser

View File

@@ -145,7 +145,6 @@ enum class tool_format {
JSON_NATIVE, // Pure JSON: {"name": "X", "arguments": {...}}
TAG_WITH_JSON, // Tag-based with JSON args: <function=X>{...}</function>
TAG_WITH_TAGGED, // Tag-based with tagged args: <param=key>value</param>
TAG_WITH_GEMMA4_DICT, // Gemma4 custom dict: <|tool_call>call:name{key:<|"|>val<|"|>}<tool_call|>
};
inline std::ostream & operator<<(std::ostream & os, const tool_format & format) {
@@ -158,8 +157,6 @@ inline std::ostream & operator<<(std::ostream & os, const tool_format & format)
return os << "TAG_WITH_JSON";
case tool_format::TAG_WITH_TAGGED:
return os << "TAG_WITH_TAGGED";
case tool_format::TAG_WITH_GEMMA4_DICT:
return os << "TAG_WITH_GEMMA4_DICT";
default:
return os << "UNKNOWN";
}
@@ -363,7 +360,6 @@ struct analyze_tools : analyze_base {
const common_peg_parser & call_id_section, bool have_call_id,
const common_peg_parser & args,
std::optional<common_peg_parser> atomic_peek) const;
common_peg_parser build_tool_parser_tag_gemma4_dict(parser_build_context & ctx) const;
};
// ============================================================================

View File

@@ -95,34 +95,6 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
LOG_DBG(ANSI_ORANGE "[Patch: Functionary 3.1]\n" ANSI_RESET);
}
},
// Gemma4 - custom dict format: <|tool_call>call:name{key:<|"|>val<|"|>}<tool_call|>
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
if (tmpl.src.find("'<|tool_call>call:'") != std::string::npos) {
analysis.tools.format.mode = tool_format::TAG_WITH_GEMMA4_DICT;
analysis.tools.format.per_call_start = "<|tool_call>";
analysis.tools.format.per_call_end = "<tool_call|>";
analysis.tools.format.section_start = "";
analysis.tools.format.section_end = "";
analysis.tools.function.name_prefix = "call:";
analysis.tools.function.name_suffix = "";
analysis.tools.arguments.start = "{";
analysis.tools.arguments.end = "}";
analysis.tools.arguments.name_prefix = "";
analysis.tools.arguments.name_suffix = ":";
analysis.tools.arguments.separator = ",";
analysis.reasoning.mode = reasoning_mode::TAG_BASED;
analysis.reasoning.start = "<|channel>thought";
analysis.reasoning.end = "<channel|>";
analysis.preserved_tokens.clear();
analysis.preserved_tokens.push_back("<|tool_call>");
analysis.preserved_tokens.push_back("<tool_call|>");
analysis.preserved_tokens.push_back("<|tool_response>");
analysis.preserved_tokens.push_back("<tool_response|>");
analysis.preserved_tokens.push_back("<|\"|>");
analysis.preserved_tokens.push_back("<|turn>");
LOG_DBG(ANSI_ORANGE "[Patch: Gemma4]\n" ANSI_RESET);
}
},
// DeepSeek-R1-Distill-Qwen
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
if (tmpl.src.find(

View File

@@ -75,84 +75,6 @@ static std::string escape_json_string_inner(const std::string & s) {
return escaped;
}
static const std::string GEMMA4_QUOTE = "<|\"|>";
static std::string normalize_gemma4_to_json(const std::string & input) {
std::string result;
result.reserve(input.size() * 2);
enum Ctx { DICT, ARRAY };
std::vector<Ctx> ctx;
auto is_ws = [](char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; };
auto skip_ws = [&](size_t & pos) {
while (pos < input.size() && is_ws(input[pos])) {
result += input[pos++];
}
};
auto quote_unquoted_key = [&](size_t & pos) {
if (pos < input.size() && input[pos] != '"' && input[pos] != '}') {
result += '"';
while (pos < input.size() && input[pos] != ':' && !is_ws(input[pos])) {
result += input[pos++];
}
result += '"';
skip_ws(pos);
}
};
size_t i = 0;
while (i < input.size()) {
if (i + GEMMA4_QUOTE.size() <= input.size() &&
input.compare(i, GEMMA4_QUOTE.size(), GEMMA4_QUOTE) == 0) {
result += '"';
i += GEMMA4_QUOTE.size();
continue;
}
char c = input[i];
if (c == '{') {
result += c;
ctx.push_back(DICT);
++i;
skip_ws(i);
quote_unquoted_key(i);
continue;
}
if (c == '}') {
result += c;
if (!ctx.empty()) ctx.pop_back();
++i;
continue;
}
if (c == '[') {
result += c;
ctx.push_back(ARRAY);
++i;
continue;
}
if (c == ']') {
result += c;
if (!ctx.empty()) ctx.pop_back();
++i;
continue;
}
if (c == ',' && !ctx.empty() && ctx.back() == DICT) {
result += c;
++i;
skip_ws(i);
quote_unquoted_key(i);
continue;
}
result += c;
++i;
}
return result;
}
// Convert Python-style single-quoted strings to JSON double-quoted strings
// Only converts outer string delimiters, properly handling escape sequences:
// - {'key': 'value'} -> {"key": "value"}
@@ -296,10 +218,6 @@ std::string common_chat_peg_mapper::normalize_container_value(const std::string
return normalize_quotes_to_json(input);
}
std::string common_chat_peg_gemma4_mapper::normalize_container_value(const std::string & input) {
return normalize_quotes_to_json(normalize_gemma4_to_json(input));
}
void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena,
const common_peg_parse_result & parse_result_arg) {
arena.visit(parse_result_arg, [this](const common_peg_ast_node & node) { map(node); });
@@ -947,3 +865,143 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(
return force_tool_calls ? section : optional(section);
}
void common_chat_peg_gemma4_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
for (const auto & node : result.nodes) {
visit(arena, node);
}
}
static std::string gemma4_to_json(const common_peg_ast_arena & arena, common_peg_ast_id id) {
const auto & node = arena.get(id);
if (node.text.empty()) {
return "";
}
if (node.rule == "gemma4-number" || node.rule == "gemma4-bool" || node.rule == "gemma4-null") {
return std::string(node.text);
}
if (node.rule == "gemma4-string-content") {
return escape_json_string_inner(std::string(node.text));
}
if (node.rule == "gemma4-string") {
std::string result = "\"";
if (!node.children.empty()) {
result += gemma4_to_json(arena, node.children[0]);
if (!node.is_partial) {
result += "\"";
}
}
return result;
}
if (node.rule == "gemma4-array") {
std::string result = "[";
bool add_comma = false;
for (auto child_id : node.children) {
if (add_comma) {
result += ',';
}
add_comma = true;
result += gemma4_to_json(arena, child_id);
}
if (!node.is_partial) {
result += ']';
}
return result;
}
if (node.rule == "gemma4-dict-key-name") {
return std::string(node.text);
}
if (node.rule == "gemma4-dict-key") {
std::string result = "\"";
if (!node.children.empty()) {
result += escape_json_string_inner(gemma4_to_json(arena, node.children[0]));
}
if (!node.is_partial) {
result += "\":";
}
return result;
}
if (node.rule == "gemma4-dict-kv") {
std::string result;
for (auto child_id : node.children) {
result += gemma4_to_json(arena, child_id);
}
return result;
}
if (node.rule == "gemma4-dict") {
std::string result = "{";
bool add_comma = false;
for (auto child_id : node.children) {
if (add_comma) {
result += ',';
}
add_comma = true;
result += gemma4_to_json(arena, child_id);
}
if (!node.is_partial) {
result += '}';
}
return result;
}
if (node.rule == "gemma4-value") {
if (!node.children.empty()) {
return gemma4_to_json(arena, node.children[0]);
}
return "";
}
return "";
}
void common_chat_peg_gemma4_mapper::visit(const common_peg_ast_arena & arena, common_peg_ast_id id) {
const auto & node = arena.get(id);
if (node.tag == "reasoning") {
result.reasoning_content += std::string(node.text);
return;
}
if (node.tag == "content") {
result.content += std::string(node.text);
return;
}
if (node.tag == "tool") {
auto name_id = arena.find_by_tag(node, "tool-name");
auto args_id = arena.find_by_tag(node, "tool-args");
if (name_id != COMMON_PEG_INVALID_AST_ID && args_id != COMMON_PEG_INVALID_AST_ID) {
const auto & name_node = arena.get(name_id);
const auto & args_node = arena.get(args_id);
if (!name_node.is_partial) {
common_chat_tool_call call;
call.name = std::string(name_node.text);
if (!args_node.children.empty()) {
call.arguments = gemma4_to_json(arena, args_node.children[0]);
}
result.tool_calls.push_back(call);
}
}
return;
}
for (auto child_id : node.children) {
visit(arena, child_id);
}
}

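Taken together, the mapper turns the template's custom dict syntax into plain JSON arguments. A worked example of the mapping, using a hypothetical tool call (the delimiters come from this diff; the names and values are illustrative):

model output: <|tool_call>call:get_weather{city:<|"|>Paris<|"|>,days:2}<tool_call|>
mapped args:  {"city":"Paris","days":2}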
View File

@@ -35,8 +35,9 @@ class common_chat_peg_mapper {
class common_chat_peg_gemma4_mapper : public common_chat_peg_mapper {
public:
common_chat_peg_gemma4_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
protected:
std::string normalize_container_value(const std::string & input) override;
virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
private:
void visit(const common_peg_ast_arena & arena, common_peg_ast_id id);
};
struct content_structure;

View File

@@ -1077,6 +1077,131 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
return data;
}
static common_chat_params common_chat_params_init_gemma4(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_GEMMA4;
data.supports_thinking = true;
data.preserved_tokens = {
"<|channel>",
"<channel|>",
"<|tool_call>",
"<tool_call|>",
"<|turn>",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto start = p.rule("start", p.prefix(inputs.generation_prompt, "<|channel>"));
if (extract_reasoning) {
p.rule("thought", p.literal("<|channel>thought\n") + p.reasoning(p.until("<channel|>")) + p.literal("<channel|>"));
} else {
p.rule("thought", p.content(p.literal("<|channel>thought\n") + p.until("<channel|>") + p.literal("<channel|>")));
}
auto thought = (p.peek(p.literal("<|channel>")) + p.ref("thought")) | p.negate(p.literal("<|channel>"));
if (has_response_format) {
auto response_format = p.literal("```json") <<
p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)) <<
p.literal("```");
return start + p.optional(thought) + response_format;
}
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
// Gemma4 tool calling syntax
// Rules should match traversal logic in gemma4_to_json()
p.rule("gemma4-string-content", p.until("<|\"|>"));
p.rule("gemma4-string", p.literal("<|\"|>") + p.ref("gemma4-string-content") + p.literal("<|\"|>"));
p.rule("gemma4-bool", p.json_bool());
p.rule("gemma4-null", p.json_null());
p.rule("gemma4-number", p.json_number());
p.rule("gemma4-dict-key", p.rule("gemma4-dict-key-name", p.until(":")) + p.literal(":"));
p.rule("gemma4-dict-kv", p.ref("gemma4-dict-key") + p.space() + p.ref("gemma4-value"));
p.rule("gemma4-dict", [&]() {
auto ws = p.space();
auto member = p.ref("gemma4-dict-kv");
auto members = p.sequence({member, p.zero_or_more(p.sequence({p.literal(","), ws, member}))});
return p.sequence({
p.literal("{"), ws,
p.choice({p.literal("}"), p.sequence({members, ws, p.literal("}")})})
});
});
p.rule("gemma4-array", [&]() {
auto ws = p.space();
auto value = p.ref("gemma4-value");
auto elements = p.sequence({value, p.zero_or_more(p.sequence({p.literal(","), ws, value}))});
return p.sequence({
p.literal("["), ws,
p.choice({p.literal("]"), p.sequence({elements, ws, p.literal("]")})})
});
});
p.rule("gemma4-value", [&]() {
return p.choice({
p.ref("gemma4-string"), p.ref("gemma4-dict"), p.ref("gemma4-array"),
p.ref("gemma4-number"), p.ref("gemma4-bool"), p.ref("gemma4-null")
});
});
auto tool_choice = p.choice();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
std::string name = function.at("name");
// TODO @aldehir : need to extend json-schema-to-grammar to produce more than JSON rules
// const auto & params = function.at("parameters");
tool_choice |= p.rule("tool-" + name, p.tool(p.sequence({
p.tool_open(p.tool_name(p.literal(name)) + p.peek(p.literal("{"))),
p.tool_args(p.ref("gemma4-dict")),
})));
});
auto tool_call = p.trigger_rule("tool-call", p.repeat(
"<|tool_call>call:" + tool_choice + "<tool_call|>",
/* min = */ inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0,
/* max = */ inputs.parallel_tool_calls ? -1 : 1
));
auto content = p.rule("content", p.content(p.until_one_of({"<|channel>", "<|tool_call>"})));
auto message = p.rule("message", thought + content);
return start + p.zero_or_more(message) + tool_call;
}
auto content = p.rule("content", p.content(p.until("<|channel>")));
auto message = p.rule("message", thought + content);
return start + p.one_or_more(message);
});
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = !(has_response_format || (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.at("parameters");
builder.resolve_refs(schema);
});
parser.build_grammar(builder, data.grammar_lazy);
});
data.grammar_triggers = {
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_call>" },
};
}
return data;
}
// Functionary v3.2 - uses recipient-based format: >>>recipient\n{content}
static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
@@ -1556,46 +1681,146 @@ static void requires_non_null_content(json & messages) {
}
// Gemma4 uses a custom tool_responses field instead of role:tool messages.
// Convert consecutive role:tool messages into a single user message with tool_responses.
//
// This will transform a sequence of messages:
// assistant(tool_call+) -> tool+ -> assistant(content)
//
// Into a single assistant message containing a tool_responses field:
// assistant(content + tool_call + tool_responses)
//
// This is necessary for the Gemma4 chat template to properly format the prompt.
// See https://ai.google.dev/gemma/docs/core/prompt-formatting-gemma4
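// A hypothetical before/after, to make the transformation concrete
// (message names and values here are illustrative only):
//
//   [ {"role": "assistant", "tool_calls": [{"function": {"name": "get_weather", ...}}]},
//     {"role": "tool", "tool_call_id": "get_weather", "content": "{\"temp\": 21}"},
//     {"role": "assistant", "content": "It is 21 degrees."} ]
//
// collapses into a single model turn:
//
//   { "role": "assistant",
//     "tool_calls": [{"function": {"name": "get_weather", ...}}],
//     "tool_responses": [{"name": "get_weather", "response": {"temp": 21}}],
//     "content": "It is 21 degrees." }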
struct gemma4_model_turn_builder {
json & messages;
size_t pos;
json tool_calls = json::array();
json tool_responses = json::array();
json content;
json reasoning_content;
gemma4_model_turn_builder(json & msgs, size_t pos) : messages(msgs), pos(pos) {}
void collect() {
// Collect the first assistant message
auto & msg = messages[pos];
if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
// According to the prompt formatting guide, we need to preserve reasoning_content
// between function calls. The current chat templates do not support this, but we will do it anyway.
reasoning_content = msg.at("reasoning_content");
}
for (auto & tc : msg.at("tool_calls")) {
tool_calls.push_back(tc);
}
pos++;
// Collect tool call results
while (pos < messages.size() && messages[pos].value("role", "") == "tool") {
collect_result(messages[pos]);
pos++;
}
// Check if the next assistant message is the final message
if (pos < messages.size() && messages[pos].value("role", "") == "assistant") {
auto & next = messages[pos];
if (!has_tool_calls(next) && has_content(next)) {
content = next.at("content");
pos++;
}
}
}
void collect_result(const json & curr) {
json response;
if (curr.contains("content")) {
const auto & content = curr.at("content");
if (content.is_string()) {
// Try to parse the content as JSON; fall back to raw string
try {
response = json::parse(content.get<std::string>());
} catch (...) {
response = content;
}
} else {
response = content;
}
}
std::string name;
// Match name with corresponding tool call
size_t idx = tool_responses.size();
if (idx < tool_calls.size()) {
auto & tc = tool_calls[idx];
if (tc.contains("function")) {
name = tc.at("function").value("name", "");
}
}
// Fallback to the tool call id
if (name.empty()) {
name = curr.value("tool_call_id", "");
}
tool_responses.push_back({{"name", name}, {"response", response}});
}
json build() {
collect();
json msg = {
{"role", "assistant"},
{"tool_calls", tool_calls},
};
if (!tool_responses.empty()) {
msg["tool_responses"] = tool_responses;
}
if (!content.is_null()) {
msg["content"] = content;
}
if (!reasoning_content.is_null()) {
msg["reasoning_content"] = reasoning_content;
}
return msg;
}
static bool has_content(const json & msg) {
if (!msg.contains("content") || msg.at("content").is_null()) {
return false;
}
const auto & content = msg.at("content");
if (content.is_string() && !content.get<std::string>().empty()) {
return true;
}
if (content.is_array() && !content.empty()) {
return true;
}
return false;
}
static bool has_tool_calls(const json & msg) {
return msg.contains("tool_calls") && msg.at("tool_calls").is_array() && !msg.at("tool_calls").empty();
}
};
static void convert_tool_responses_gemma4(json & messages) {
json result = json::array();
size_t i = 0;
while (i < messages.size()) {
if (messages[i].contains("role") && messages[i].at("role") == "tool") {
json tool_responses = json::array();
while (i < messages.size() &&
messages[i].contains("role") &&
messages[i].at("role") == "tool") {
const auto & tool_msg = messages[i];
std::string name;
if (tool_msg.contains("tool_call_id") && tool_msg.at("tool_call_id").is_string()) {
name = tool_msg.at("tool_call_id");
} else if (tool_msg.contains("name") && tool_msg.at("name").is_string()) {
name = tool_msg.at("name");
}
json response;
if (tool_msg.contains("content")) {
const auto & content = tool_msg.at("content");
if (content.is_string()) {
// Try to parse the content as JSON; fall back to raw string
try {
response = json::parse(content.get<std::string>());
} catch (...) {
response = content;
}
} else {
response = content;
}
}
tool_responses.push_back({{"name", name}, {"response", response}});
i++;
}
result.push_back({{"role", "user"}, {"tool_responses", tool_responses}});
} else {
result.push_back(messages[i]);
auto & msg = messages[i];
if (msg.value("role", "") != "assistant" || !msg.contains("tool_calls") ||
!msg.at("tool_calls").is_array() || msg.at("tool_calls").empty()) {
result.push_back(msg);
i++;
continue;
}
gemma4_model_turn_builder builder(messages, i);
result.push_back(builder.build());
i = builder.pos;
}
messages = result;
}
@@ -1634,7 +1859,7 @@ static json common_chat_extra_context() {
std::optional<common_chat_params> common_chat_try_specialized_template(
const common_chat_template & tmpl,
const std::string & src,
const autoparser::generation_params & params) {
autoparser::generation_params & params) {
// Ministral/Mistral Large 3 - uses special reasoning structure fixes, can't use autoparser
// Note: Mistral Small 3.2 uses [CALL_ID] which Ministral doesn't have, so we can distinguish them
if (src.find("[SYSTEM_PROMPT]") != std::string::npos && src.find("[TOOL_CALLS]") != std::string::npos &&
@@ -1687,6 +1912,12 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
return common_chat_params_init_gigachat_v3(tmpl, params);
}
// Gemma4 format detection
if (src.find("'<|tool_call>call:'") != std::string::npos) {
workaround::convert_tool_responses_gemma4(params.messages);
return common_chat_params_init_gemma4(tmpl, params);
}
return std::nullopt;
}
@@ -1727,10 +1958,6 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
workaround::func_args_not_string(params.messages);
}
if (src.find("'<|tool_call>call:'") != std::string::npos) {
workaround::convert_tool_responses_gemma4(params.messages);
}
params.add_generation_prompt = false;
std::string no_gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
params.add_generation_prompt = true;

View File

@@ -274,4 +274,4 @@ std::string common_chat_template_direct_apply(
std::optional<common_chat_params> common_chat_try_specialized_template(
const common_chat_template & tmpl,
const std::string & src,
const autoparser::generation_params & params);
autoparser::generation_params & params);

View File

@@ -596,9 +596,12 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
}
}
for (const auto & f : files) {
if (gguf_filename_is_model(f.path)) {
return f;
// fallback to first available model only if tag is empty
if (tag.empty()) {
for (const auto & f : files) {
if (gguf_filename_is_model(f.path)) {
return f;
}
}
}

View File

@@ -256,6 +256,38 @@ static std::pair<std::vector<common_peg_chars_parser::char_range>, bool> parse_c
return {ranges, negated};
}
common_peg_ast_id common_peg_ast_arena::find_by_tag(const common_peg_ast_node & parent, const std::string & tag, int max_depth) const {
for (auto child_id : parent.children) {
const auto & child = get(child_id);
if (child.tag == tag) {
return child_id;
}
if (max_depth > 1) {
auto result = find_by_tag(child, tag, max_depth - 1);
if (result != COMMON_PEG_INVALID_AST_ID) {
return result;
}
}
}
return COMMON_PEG_INVALID_AST_ID;
}
common_peg_ast_id common_peg_ast_arena::find_by_rule(const common_peg_ast_node & parent, const std::string & rule, int max_depth) const {
for (auto child_id : parent.children) {
const auto & child = get(child_id);
if (child.rule == rule) {
return child_id;
}
if (max_depth > 1) {
auto result = find_by_rule(child, rule, max_depth - 1);
if (result != COMMON_PEG_INVALID_AST_ID) {
return result;
}
}
}
return COMMON_PEG_INVALID_AST_ID;
}
void common_peg_ast_arena::visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const {
if (id == COMMON_PEG_INVALID_AST_ID) {
return;

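For reference, the Gemma4 mapper above consumes these lookups roughly as follows; this is a usage sketch assembled from this diff, not additional API:

// Search up to max_depth levels (default 3) below `node` for a child
// tagged "tool-name"; COMMON_PEG_INVALID_AST_ID signals no match.
auto name_id = arena.find_by_tag(node, "tool-name");
if (name_id != COMMON_PEG_INVALID_AST_ID) {
    const auto & name_node = arena.get(name_id);
    // name_node.text is a view of the matched input span
}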
View File

@@ -106,6 +106,9 @@ class common_peg_ast_arena {
const common_peg_ast_node & get(common_peg_ast_id id) const { return nodes_.at(id); }
common_peg_ast_id find_by_tag(const common_peg_ast_node & parent, const std::string & tag, int max_depth = 3) const;
common_peg_ast_id find_by_rule(const common_peg_ast_node & parent, const std::string & tag, int max_depth = 3) const;
size_t size() const { return nodes_.size(); }
void clear() { nodes_.clear(); }

View File

@@ -11521,13 +11521,50 @@ class LLaDAMoEModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("HunYuanDenseV1ForCausalLM")
@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
class HunYuanModel(TextModel):
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
def _get_eod_token_id(self) -> int | None:
"""Get the actual end-of-generation token from config (eod_token_id)."""
return self.hparams.get("eod_token_id")
def _get_eot_token_id(self) -> int | None:
"""Get the end-of-turn token from generation_config.json.
This is the first entry in eos_token_id when it's a list."""
gen_cfg_path = self.dir_model / "generation_config.json"
if gen_cfg_path.is_file():
with open(gen_cfg_path, encoding="utf-8") as f:
gen_cfg = json.load(f)
eos = gen_cfg.get("eos_token_id")
if isinstance(eos, list) and len(eos) >= 2:
return eos[0]
return None
def _fix_special_tokens(self):
"""Fix EOS/EOT tokens that are incorrect in upstream configs."""
eod_id = self._get_eod_token_id()
if eod_id is not None:
self.gguf_writer.add_eos_token_id(eod_id)
eot_id = self._get_eot_token_id()
if eot_id is not None:
self.gguf_writer.add_eot_token_id(eot_id)
def set_vocab(self):
if (self.dir_model / "tokenizer.json").is_file():
self._set_vocab_gpt2()
tokens, toktypes, tokpre = self.get_vocab_base()
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
# HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab
token_types = None
if (self.hparams.get("pad_token_id") or 0) < 0:
token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask')
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True, special_token_types=token_types)
special_vocab.add_to_gguf(self.gguf_writer)
self._fix_special_tokens()
else:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
@@ -11579,13 +11616,18 @@ class HunYuanModel(TextModel):
# FIX for BOS token: Overwrite incorrect id read from config.json
if self.hparams['hidden_size'] == 4096:
self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token
self._fix_special_tokens()
def set_gguf_parameters(self):
# HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it
saved_num_experts = self.hparams.pop("num_experts", None)
super().set_gguf_parameters()
if saved_num_experts is not None and saved_num_experts > 1:
self.hparams["num_experts"] = saved_num_experts
hparams = self.hparams
# Rope
if self.rope_parameters.get("rope_type") == "dynamic":
if self.rope_parameters.get("rope_type") in ("dynamic", "xdrope"):
# HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
# 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
alpha = self.rope_parameters.get("alpha", 50)
@@ -11595,13 +11637,14 @@ class HunYuanModel(TextModel):
self.gguf_writer.add_rope_freq_base(scaled_base)
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_rope_scaling_factor(1)
# There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
self.gguf_writer.add_context_length(256 * 1024) # 256k context length
if self.rope_parameters.get("rope_type") == "dynamic":
# There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
self.gguf_writer.add_context_length(256 * 1024) # 256k context length
# if any of our assumptions about the values are wrong, something has changed and this may need to be updated
assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
"HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
# if any of our assumptions about the values are wrong, something has changed and this may need to be updated
assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
"HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name == "lm_head.weight":
@@ -11609,9 +11652,48 @@ class HunYuanModel(TextModel):
logger.info("Skipping tied output layer 'lm_head.weight'")
return
# skip vision tensors for HunyuanVL models
if name.startswith("vit."):
return
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("HunYuanVLForConditionalGeneration")
class HunyuanOCRVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
# HunyuanOCR uses max_image_size instead of image_size
if "image_size" not in self.hparams_vision:
self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_vision is not None
hparams = self.hparams_vision
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
self.gguf_writer.add_vision_use_gelu(True)
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if not name.startswith("vit."):
return # skip text tensors
# strip CLS token (row 0) from position embeddings so resize_position_embeddings works
if "position_embedding" in name:
data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd]
yield from super().modify_tensors(data_torch, name, bid)
def tensor_force_quant(self, name, new_name, bid, n_dims):
# force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
@ModelBase.register("SmolLM3ForCausalLM")
class SmolLM3Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.SMOLLM3

View File

@@ -672,34 +672,36 @@ void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
float32x4_t acc = vdupq_n_f32(0.0f);
for (int ib = 0; ib < nb; ++ib) {
const int8x8_t q8_0_lo = vld1_s8(y[2*ib].qs);
const int8x8_t q8_0_hi = vld1_s8(y[2*ib].qs + 8);
const int8x8_t q8_1_lo = vld1_s8(y[2*ib].qs + 16);
const int8x8_t q8_1_hi = vld1_s8(y[2*ib].qs + 24);
const int8x8_t q8_2_lo = vld1_s8(y[2*ib+1].qs);
const int8x8_t q8_2_hi = vld1_s8(y[2*ib+1].qs + 8);
const int8x8_t q8_3_lo = vld1_s8(y[2*ib+1].qs + 16);
const int8x8_t q8_3_hi = vld1_s8(y[2*ib+1].qs + 24);
const uint8x16_t q4bits_0 = vld1q_u8(x[ib].qs);
const uint8x16_t q4bits_1 = vld1q_u8(x[ib].qs + 16);
const int8x16_t q4_lo_0 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_0, m4b));
const int8x16_t q4_hi_0 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_0, 4));
const int8x16_t q4_lo_1 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_1, m4b));
const int8x16_t q4_hi_1 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_1, 4));
const int8x16_t q8_0a = vld1q_s8(y[2*ib].qs);
const int8x16_t q8_0b = vld1q_s8(y[2*ib].qs + 16);
const int8x16_t q8_lo_0 = vcombine_s8(vget_low_s8(q8_0a), vget_low_s8(q8_0b));
const int8x16_t q8_hi_0 = vcombine_s8(vget_high_s8(q8_0a), vget_high_s8(q8_0b));
const int8x8_t q4_0_lo = vget_low_s8(q4_lo_0);
const int8x8_t q4_0_hi = vget_low_s8(q4_hi_0);
const int8x8_t q4_1_lo = vget_high_s8(q4_lo_0);
const int8x8_t q4_1_hi = vget_high_s8(q4_hi_0);
const int8x8_t q4_2_lo = vget_low_s8(q4_lo_1);
const int8x8_t q4_2_hi = vget_low_s8(q4_hi_1);
const int8x8_t q4_3_lo = vget_high_s8(q4_lo_1);
const int8x8_t q4_3_hi = vget_high_s8(q4_hi_1);
const int8x16_t q8_1a = vld1q_s8(y[2*ib+1].qs);
const int8x16_t q8_1b = vld1q_s8(y[2*ib+1].qs + 16);
const int8x16_t q8_lo_1 = vcombine_s8(vget_low_s8(q8_1a), vget_low_s8(q8_1b));
const int8x16_t q8_hi_1 = vcombine_s8(vget_high_s8(q8_1a), vget_high_s8(q8_1b));
const int32x4_t p0 = ggml_nvfp4_dot8(q4_0_lo, q8_0_lo, q4_0_hi, q8_0_hi);
const int32x4_t p1 = ggml_nvfp4_dot8(q4_1_lo, q8_1_lo, q4_1_hi, q8_1_hi);
const int32x4_t p2 = ggml_nvfp4_dot8(q4_2_lo, q8_2_lo, q4_2_hi, q8_2_hi);
const int32x4_t p3 = ggml_nvfp4_dot8(q4_3_lo, q8_3_lo, q4_3_hi, q8_3_hi);
const int32x4_t p0 = vaddq_s32(
ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_0, q8_lo_0),
ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_0, q8_hi_0));
const int32x4_t p1 = vaddq_s32(
ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_1, q8_lo_1),
ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_1, q8_hi_1));
const int32x4_t sums = vpaddq_s32(p0, p1);
// Decode 4 UE4M3 scales to f32 and multiply with q8 scales
const float dy0 = GGML_CPU_FP16_TO_FP32(y[2*ib].d);
const float dy1 = GGML_CPU_FP16_TO_FP32(y[2*ib+1].d);
const float32x4_t nvsc = {
@@ -710,7 +712,13 @@ void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
};
const float32x4_t scales = vmulq_f32(nvsc, (float32x4_t){dy0, dy0, dy1, dy1});
acc = vfmaq_f32(acc, vcvtq_f32_s32(sums), scales);
const float32x4_t sums = (float32x4_t){
(float)vaddvq_s32(p0),
(float)vaddvq_s32(p1),
(float)vaddvq_s32(p2),
(float)vaddvq_s32(p3)
};
acc = vfmaq_f32(acc, sums, scales);
}
sumf = vaddvq_f32(acc);
#else

View File

@@ -319,6 +319,15 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
#endif // !defined(__ARM_FEATURE_DOTPROD)
static inline int32x4_t ggml_nvfp4_dot8(const int8x8_t q4_lo, const int8x8_t q8_lo,
const int8x8_t q4_hi, const int8x8_t q8_hi) {
const int16x8_t p_lo = vmull_s8(q4_lo, q8_lo);
const int16x8_t p_hi = vmull_s8(q4_hi, q8_hi);
const int32x4_t sum_lo = vpaddlq_s16(p_lo);
const int32x4_t sum_hi = vpaddlq_s16(p_hi);
return vaddq_s32(sum_lo, sum_hi);
}
#endif // defined(__ARM_NEON)
#ifdef __wasm_simd128__

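The new ggml_nvfp4_dot8 helper replaces the dotprod path with a widening multiply: vmull_s8 produces the sixteen i16 products of a block (in two halves), vpaddlq_s16 folds them into i32 lanes, and the vec-dot loop above finishes with a horizontal vaddvq_s32 per block so each sum lines up with its decoded UE4M3 scale. A minimal, self-contained sketch of that reduction pattern (AArch64 NEON; illustrative only, not part of the ggml build):

#include <arm_neon.h>
#include <stdio.h>

// Dot product of 16 int8 values: widening multiply, pairwise widen-add,
// then a horizontal reduction -- the same pattern as ggml_nvfp4_dot8.
static int32_t dot16_i8(const int8_t * a, const int8_t * b) {
    const int16x8_t p_lo = vmull_s8(vld1_s8(a),     vld1_s8(b));
    const int16x8_t p_hi = vmull_s8(vld1_s8(a + 8), vld1_s8(b + 8));
    const int32x4_t sum  = vaddq_s32(vpaddlq_s16(p_lo), vpaddlq_s16(p_hi));
    return vaddvq_s32(sum); // AArch64-only horizontal add
}

int main(void) {
    int8_t a[16], b[16];
    int32_t ref = 0;
    for (int i = 0; i < 16; ++i) {
        a[i] = (int8_t) (i - 8);
        b[i] = (int8_t) (3 * i - 20);
        ref += a[i] * b[i];
    }
    printf("neon = %d, scalar = %d\n", dot16_i8(a, b), ref);
    return 0;
}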
View File

@@ -164,6 +164,12 @@ static void quicksort_values_indices_desc(float * values, int32_t * indices, int
if (i < right) quicksort_values_indices_desc(values, indices, i, right);
}
// LUT for ramp initialization of argsort output (first 32 members)
int32_t argosrt_ramp_lut[32] __attribute__((aligned(VLEN))) = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
};
static void htp_argsort_f32(unsigned int n, unsigned int i, void * data) {
struct htp_argsort_context * actx = (struct htp_argsort_context *)data;
struct htp_ops_context * octx = actx->octx;
@@ -205,8 +211,12 @@ static void htp_argsort_f32(unsigned int n, unsigned int i, void * data) {
// Padded to 128 bytes.
size_t values_size = hex_round_up(ne00 * sizeof(float), 128);
size_t num_vec_ind_values = hmx_ceil_div(ne00, VLEN/(sizeof(int32_t)));
float * values_buf = (float *) spad;
int32_t * indices_buf = (int32_t *) (spad + values_size);
HVX_Vector * indices_buf_vec = (HVX_Vector *) (spad + values_size);
const HVX_Vector ind_init_vec = *(HVX_Vector *)argosrt_ramp_lut;
const HVX_Vector ind_diff_vec = Q6_V_vsplat_R(32);
for (uint32_t r = start_row; r < end_row; r++) {
uint32_t src_offset = r * nb01;
@@ -218,9 +228,11 @@ static void htp_argsort_f32(unsigned int n, unsigned int i, void * data) {
hex_l2fetch(src_ptr, ne00 * sizeof(float), ne00 * sizeof(float), 1);
hvx_copy_f32_au((uint8_t*)values_buf, src_ptr, ne00);
// Initialize indices
for (uint32_t j = 0; j < ne00; j++) {
indices_buf[j] = j;
// Initialize indices - Start with values 0..31, add 32 for additional vec iterations
HVX_Vector curr_ind_vec = ind_init_vec;
for (uint32_t j_vec = 0; j_vec < num_vec_ind_values; j_vec++) {
indices_buf_vec[j_vec] = curr_ind_vec;
curr_ind_vec = Q6_Vw_vadd_VwVw(curr_ind_vec, ind_diff_vec);
}
// Sort values and mirror swaps to indices

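The ramp init swaps the scalar 0..n-1 loop for whole-vector stores: one 128-byte HVX vector holds 32 int32 lanes, so the kernel starts from the 0..31 LUT and adds a splat of 32 for each vector written. A portable sketch of the same idea, with a plain array standing in for HVX_Vector (illustrative only):

#include <cstdint>
#include <cstring>

constexpr int LANES = 32; // 128-byte vector / sizeof(int32_t)

// Fill indices[0 .. n_vecs*LANES) with 0,1,2,... one whole vector at a time,
// mirroring the LUT-plus-splat-add loop in htp_argsort_f32.
static void ramp_init(int32_t * indices, uint32_t n_vecs) {
    int32_t curr[LANES];
    for (int i = 0; i < LANES; ++i) curr[i] = i;          // the 0..31 LUT
    for (uint32_t v = 0; v < n_vecs; ++v) {
        std::memcpy(indices + (size_t) v * LANES, curr, sizeof(curr));
        for (int i = 0; i < LANES; ++i) curr[i] += LANES; // add splat(32)
    }
}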
View File

@@ -437,12 +437,18 @@ inline uint32_t ggml_webgpu_flash_attn_pick_vec_ne(const ggml_webgpu_flash_attn_
// Head-dim specializations used by the tuned vec f16 path.
switch (key.head_dim_qk) {
case 64: return 2u;
case 96: return 4u;
case 128: return 1u;
case 192: return 2u;
case 576: return 2u;
default: return 1u;
case 64:
return 2u;
case 96:
return 4u;
case 128:
return 1u;
case 192:
return 2u;
case 576:
return 2u;
default:
return 1u;
}
}
@@ -513,9 +519,9 @@ struct ggml_webgpu_flash_attn_blk_shader_lib_context {
};
inline ggml_webgpu_processed_shader ggml_webgpu_preprocess_flash_attn_blk_shader(
pre_wgsl::Preprocessor & preprocessor,
const char * shader_src,
const ggml_webgpu_flash_attn_blk_shader_lib_context & context) {
pre_wgsl::Preprocessor & preprocessor,
const char * shader_src,
const ggml_webgpu_flash_attn_blk_shader_lib_context & context) {
std::vector<std::string> defines;
std::string variant = "flash_attn_vec_blk";
@@ -1857,9 +1863,8 @@ class ggml_webgpu_shader_lib {
defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k));
uint32_t q_tile = context.sg_mat_m;
uint32_t kv_tile =
std::min(ggml_webgpu_flash_attn_max_kv_tile(context),
context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
uint32_t kv_tile = std::min(ggml_webgpu_flash_attn_max_kv_tile(context),
context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
if (context.key.use_vec) {
q_tile = 1;
kv_tile = std::max(context.sg_mat_n, std::min(32u, ggml_webgpu_flash_attn_max_kv_tile(context)));
@@ -1885,14 +1890,14 @@ class ggml_webgpu_shader_lib {
}
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
const char * shader_src = context.key.use_vec ? wgsl_flash_attn_vec_split : wgsl_flash_attn;
const char * shader_src = context.key.use_vec ? wgsl_flash_attn_vec_split : wgsl_flash_attn;
webgpu_pipeline pipeline =
ggml_webgpu_create_pipeline(device, preprocessor.preprocess(shader_src, defines), variant);
auto decisions = std::make_shared<ggml_webgpu_flash_attn_shader_decisions>();
decisions->q_tile = q_tile;
decisions->kv_tile = kv_tile;
decisions->wg_size = wg_size;
pipeline.context = decisions;
auto decisions = std::make_shared<ggml_webgpu_flash_attn_shader_decisions>();
decisions->q_tile = q_tile;
decisions->kv_tile = kv_tile;
decisions->wg_size = wg_size;
pipeline.context = decisions;
flash_attn_pipelines[context.key] = pipeline;
return flash_attn_pipelines[context.key];
}
@@ -1905,7 +1910,7 @@ class ggml_webgpu_shader_lib {
ggml_webgpu_processed_shader processed =
ggml_webgpu_preprocess_flash_attn_blk_shader(preprocessor, wgsl_flash_attn_vec_blk, context);
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed.wgsl, processed.variant);
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed.wgsl, processed.variant);
flash_attn_blk_pipelines[context.key] = pipeline;
return flash_attn_blk_pipelines[context.key];
}

File diff suppressed because it is too large


@@ -734,6 +734,7 @@ class MODEL_TENSOR(IntEnum):
V_LAYER_OUT_SCALE = auto()
V_PRE_NORM = auto()
V_POST_NORM = auto()
V_MM_PRE_NORM = auto() # hunyuanocr
V_MM_POST_NORM = auto()
V_MM_INP_NORM = auto()
V_MM_INP_PROJ = auto() # gemma3
@@ -769,6 +770,8 @@ class MODEL_TENSOR(IntEnum):
V_MM_GATE = auto() # cogvlm
V_TOK_BOI = auto() # cogvlm
V_TOK_EOI = auto() # cogvlm
V_TOK_IMG_BEGIN = auto() # hunyuanocr
V_TOK_IMG_END = auto() # hunyuanocr
V_STD_BIAS = auto() # gemma4
V_STD_SCALE = auto() # gemma4
V_SAM_POS_EMBD = auto() # Deepseek-OCR
@@ -1246,6 +1249,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.V_MM_GATE: "mm.gate",
MODEL_TENSOR.V_TOK_BOI: "v.boi",
MODEL_TENSOR.V_TOK_EOI: "v.eoi",
MODEL_TENSOR.V_MM_PRE_NORM: "mm.pre_norm",
MODEL_TENSOR.V_TOK_IMG_BEGIN: "mm.image_begin",
MODEL_TENSOR.V_TOK_IMG_END: "mm.image_end",
MODEL_TENSOR.V_STD_BIAS: "v.std_bias", # gemma4
MODEL_TENSOR.V_STD_SCALE: "v.std_scale", # gemma4
# DeepSeek-OCR SAM
@@ -1393,6 +1399,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.V_MM_GATE,
MODEL_TENSOR.V_TOK_BOI,
MODEL_TENSOR.V_TOK_EOI,
MODEL_TENSOR.V_MM_PRE_NORM,
MODEL_TENSOR.V_TOK_IMG_BEGIN,
MODEL_TENSOR.V_TOK_IMG_END,
MODEL_TENSOR.V_STD_BIAS,
MODEL_TENSOR.V_STD_SCALE,
MODEL_TENSOR.V_SAM_POS_EMBD,
@@ -4113,6 +4122,7 @@ class VisionProjectorType:
GLM4V = "glm4v"
YOUTUVL = "youtuvl"
NEMOTRON_V2_VL = "nemotron_v2_vl"
HUNYUANOCR = "hunyuanocr"
# Items here are (block size, type size)


@@ -1359,6 +1359,7 @@ class TensorNameMap:
"visual.merger.mlp.{bid}", # qwen2vl
"mlp_AR.linear_{bid}", # PaddleOCR-VL
"merger.mlp.{bid}",
"vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2)
),
MODEL_TENSOR.V_MMPROJ_FC: (
@@ -1366,6 +1367,7 @@ class TensorNameMap:
"model.vision.linear_proj.linear_proj", # cogvlm
"model.projector.layers", # Deepseek-OCR
"visual.merger.proj", # glm4v
"vit.perceive.mlp", # HunyuanOCR
),
MODEL_TENSOR.V_MMPROJ_MLP: (
@@ -1393,6 +1395,7 @@ class TensorNameMap:
"model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
"vpm.embeddings.patch_embedding",
"model.vision_model.embeddings.patch_embedding", # SmolVLM
"vit.embeddings.patch_embedding", # HunyuanOCR
"vision_tower.patch_conv", # pixtral-hf
"vision_encoder.patch_conv", # pixtral
"vision_model.patch_embedding.linear", # llama 4
@@ -1414,6 +1417,7 @@ class TensorNameMap:
"model.vision_tower.embeddings.position_embeddings", # Intern-S1
"vpm.embeddings.position_embedding",
"model.vision_model.embeddings.position_embedding", # SmolVLM
"vit.embeddings.position_embedding", # HunyuanOCR
"vision_model.positional_embedding_vlm", # llama 4
"vision_tower.patch_embed.pos_emb", # kimi-vl
"visual.pos_embed", # qwen3vl
@@ -1425,10 +1429,12 @@ class TensorNameMap:
MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
"model.image_newline", # Deepseek-OCR
"vit.perceive.image_newline", # HunyuanOCR
),
MODEL_TENSOR.V_ENC_EMBD_VSEP: (
"model.view_seperator", # Deepseek-OCR
"vit.perceive.image_sep", # HunyuanOCR
),
MODEL_TENSOR.V_ENC_ATTN_QKV: (
@@ -1444,6 +1450,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.q_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
"vit.layers.{bid}.self_attn.q_proj", # HunyuanOCR
"vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
@@ -1466,6 +1473,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.k_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
"vit.layers.{bid}.self_attn.k_proj", # HunyuanOCR
"vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
@@ -1488,6 +1496,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.v_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
"vit.layers.{bid}.self_attn.v_proj", # HunyuanOCR
"vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
@@ -1504,6 +1513,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
"vpm.encoder.layers.{bid}.layer_norm1",
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
"vit.layers.{bid}.input_layernorm", # HunyuanOCR
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
"vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4
@@ -1521,6 +1531,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.out_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
"vit.layers.{bid}.self_attn.o_proj", # HunyuanOCR
"model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
"vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
@@ -1540,6 +1551,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1
"vpm.encoder.layers.{bid}.layer_norm2",
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
"vit.layers.{bid}.post_attention_layernorm", # HunyuanOCR
"vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral
@@ -1557,6 +1569,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
"vpm.encoder.layers.{bid}.mlp.fc1",
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
"vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanOCR
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
"vision_model.model.layers.{bid}.mlp.fc1", # llama4
@@ -1583,6 +1596,7 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
"vpm.encoder.layers.{bid}.mlp.fc2",
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
"vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanOCR
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
"vision_model.model.layers.{bid}.mlp.fc2", # llama4
@@ -1639,6 +1653,7 @@ class TensorNameMap:
MODEL_TENSOR.V_MM_POST_NORM: (
"visual.merger.post_projection_norm", # glm4v
"vit.perceive.after_rms", # HunyuanOCR
),
MODEL_TENSOR.V_MM_INP_PROJ: (
@@ -1806,6 +1821,18 @@ class TensorNameMap:
"model.vision.eoi", # cogvlm
),
MODEL_TENSOR.V_MM_PRE_NORM: (
"vit.perceive.before_rms", # HunyuanOCR
),
MODEL_TENSOR.V_TOK_IMG_BEGIN: (
"vit.perceive.image_begin", # HunyuanOCR
),
MODEL_TENSOR.V_TOK_IMG_END: (
"vit.perceive.image_end", # HunyuanOCR
),
MODEL_TENSOR.V_STD_BIAS: (
"model.vision_tower.std_bias", # gemma4
),


@@ -0,0 +1,282 @@
{%- macro format_parameters(properties, required) -%}
{%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
{%- set ns = namespace(found_first=false) -%}
{%- for key, value in properties | dictsort -%}
{%- set add_comma = false -%}
{%- if key not in standard_keys -%}
{%- if ns.found_first %},{% endif -%}
{%- set ns.found_first = true -%}
{{ key }}:{
{%- if value['description'] -%}
description:<|"|>{{ value['description'] }}<|"|>
{%- set add_comma = true -%}
{%- endif -%}
{%- if value['nullable'] %}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
nullable:true
{%- endif -%}
{%- if value['type'] | upper == 'STRING' -%}
{%- if value['enum'] -%}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
enum:{{ format_argument(value['enum']) }}
{%- endif -%}
{%- elif value['type'] | upper == 'OBJECT' -%}
,properties:{
{%- if value['properties'] is defined and value['properties'] is mapping -%}
{{- format_parameters(value['properties'], value['required'] | default([])) -}}
{%- elif value is mapping -%}
{{- format_parameters(value, value['required'] | default([])) -}}
{%- endif -%}
}
{%- if value['required'] -%}
,required:[
{%- for item in value['required'] | default([]) -%}
<|"|>{{- item -}}<|"|>
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
]
{%- endif -%}
{%- elif value['type'] | upper == 'ARRAY' -%}
{%- if value['items'] is mapping and value['items'] -%}
,items:{
{%- set ns_items = namespace(found_first=false) -%}
{%- for item_key, item_value in value['items'] | dictsort -%}
{%- if item_value is not none -%}
{%- if ns_items.found_first %},{% endif -%}
{%- set ns_items.found_first = true -%}
{%- if item_key == 'properties' -%}
properties:{
{%- if item_value is mapping -%}
{{- format_parameters(item_value, value['items']['required'] | default([])) -}}
{%- endif -%}
}
{%- elif item_key == 'required' -%}
required:[
{%- for req_item in item_value -%}
<|"|>{{- req_item -}}<|"|>
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
]
{%- elif item_key == 'type' -%}
{%- if item_value is string -%}
type:{{ format_argument(item_value | upper) }}
{%- else -%}
type:{{ format_argument(item_value | map('upper') | list) }}
{%- endif -%}
{%- else -%}
{{ item_key }}:{{ format_argument(item_value) }}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
}
{%- endif -%}
{%- endif -%}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
type:<|"|>{{ value['type'] | upper }}<|"|>}
{%- endif -%}
{%- endfor -%}
{%- endmacro -%}
{%- macro format_function_declaration(tool_data) -%}
declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
{%- set params = tool_data['function']['parameters'] -%}
{%- if params -%}
,parameters:{
{%- if params['properties'] -%}
properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
{%- endif -%}
{%- if params['required'] -%}
required:[
{%- for item in params['required'] -%}
<|"|>{{- item -}}<|"|>
{{- ',' if not loop.last -}}
{%- endfor -%}
],
{%- endif -%}
{%- if params['type'] -%}
type:<|"|>{{- params['type'] | upper -}}<|"|>}
{%- endif -%}
{%- endif -%}
{%- if 'response' in tool_data['function'] -%}
{%- set response_declaration = tool_data['function']['response'] -%}
,response:{
{%- if response_declaration['description'] -%}
description:<|"|>{{- response_declaration['description'] -}}<|"|>,
{%- endif -%}
{%- if response_declaration['type'] | upper == 'OBJECT' -%}
type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
{%- endif -%}
{%- endif -%}
}
{%- endmacro -%}
{%- macro format_argument(argument, escape_keys=True) -%}
{%- if argument is string -%}
{{- '<|"|>' + argument + '<|"|>' -}}
{%- elif argument is boolean -%}
{{- 'true' if argument else 'false' -}}
{%- elif argument is mapping -%}
{{- '{' -}}
{%- set ns = namespace(found_first=false) -%}
{%- for key, value in argument | dictsort -%}
{%- if ns.found_first %},{% endif -%}
{%- set ns.found_first = true -%}
{%- if escape_keys -%}
{{- '<|"|>' + key + '<|"|>' -}}
{%- else -%}
{{- key -}}
{%- endif -%}
:{{- format_argument(value, escape_keys=escape_keys) -}}
{%- endfor -%}
{{- '}' -}}
{%- elif argument is sequence -%}
{{- '[' -}}
{%- for item in argument -%}
{{- format_argument(item, escape_keys=escape_keys) -}}
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
{{- ']' -}}
{%- else -%}
{{- argument -}}
{%- endif -%}
{%- endmacro -%}
{%- macro strip_thinking(text) -%}
{%- set ns = namespace(result='') -%}
{%- for part in text.split('<channel|>') -%}
{%- if '<|channel>' in part -%}
{%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
{%- else -%}
{%- set ns.result = ns.result + part -%}
{%- endif -%}
{%- endfor -%}
{{- ns.result | trim -}}
{%- endmacro -%}
{%- set ns = namespace(prev_message_type=None, last_user_message=-1) -%}
{%- set loop_messages = messages -%}
{{ bos_token }}
{#- Handle System/Tool Definitions Block -#}
{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
{{- '<|turn>system\n' -}}
{#- Inject Thinking token at the very top of the FIRST system turn -#}
{%- if enable_thinking is defined and enable_thinking -%}
{{- '<|think|>' -}}
{%- set ns.prev_message_type = 'think' -%}
{%- endif -%}
{%- if messages[0]['role'] in ['system', 'developer'] -%}
{{- messages[0]['content'] | trim -}}
{%- set loop_messages = messages[1:] -%}
{%- endif -%}
{%- if tools -%}
{%- for tool in tools %}
{{- '<|tool>' -}}
{{- format_function_declaration(tool) | trim -}}
{{- '<tool|>' -}}
{%- endfor %}
{%- set ns.prev_message_type = 'tool' -%}
{%- endif -%}
{{- '<turn|>\n' -}}
{%- endif %}
{#- Find last user message -#}
{%- for message in loop_messages -%}
{%- if message['role'] == 'user' -%}
{%- set ns.last_user_message = loop.index0 -%}
{%- endif -%}
{%- endfor -%}
{#- Loop through messages -#}
{%- for message in loop_messages -%}
{%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
{%- if not (ns.prev_message_type == 'tool_response' and message['tool_calls']) -%}
{{- '<|turn>' + role + '\n' }}
{%- endif -%}
{%- set ns.prev_message_type = None -%}
{%- if message['tool_calls'] -%}
{#- Preserve reasoning between tool calls for model turns that come after the last user turn -#}
{%- if message['reasoning_content'] and loop.index0 > ns.last_user_message -%}
{{- '<|channel>thought\n' -}}
{{- message['reasoning_content'] -}}
{{- '<channel|>' -}}
{%- endif -%}
{%- for tool_call in message['tool_calls'] -%}
{%- set function = tool_call['function'] -%}
{{- '<|tool_call>call:' + function['name'] + '{' -}}
{%- if function['arguments'] is mapping -%}
{%- set ns_args = namespace(found_first=false) -%}
{%- for key, value in function['arguments'] | dictsort -%}
{%- if ns_args.found_first %},{% endif -%}
{%- set ns_args.found_first = true -%}
{{- key -}}:{{- format_argument(value, escape_keys=False) -}}
{%- endfor -%}
{%- elif function['arguments'] is string -%}
{{- function['arguments'] -}}
{%- endif -%}
{{- '}<tool_call|>' -}}
{%- endfor -%}
{%- set ns.prev_message_type = 'tool_call' -%}
{%- endif -%}
{%- if message['tool_responses'] -%}
{#- Tool Response handling -#}
{%- for tool_response in message['tool_responses'] -%}
{{- '<|tool_response>' -}}
{%- if tool_response['response'] is mapping -%}
{{- 'response:' + tool_response['name'] | default('unknown') + '{' -}}
{%- for key, value in tool_response['response'] | dictsort -%}
{{- key -}}:{{- format_argument(value, escape_keys=False) -}}
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
{{- '}' -}}
{%- else -%}
{{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}}
{%- endif -%}
{{- '<tool_response|>' -}}
{%- endfor -%}
{%- set ns.prev_message_type = 'tool_response' -%}
{%- endif -%}
{%- if message['content'] is string -%}
{%- if role == 'model' -%}
{{- strip_thinking(message['content']) -}}
{%- else -%}
{{- message['content'] | trim -}}
{%- endif -%}
{%- elif message['content'] is sequence -%}
{%- for item in message['content'] -%}
{%- if item['type'] == 'text' -%}
{%- if role == 'model' -%}
{{- strip_thinking(item['text']) -}}
{%- else -%}
{{- item['text'] | trim -}}
{%- endif -%}
{%- elif item['type'] == 'image' -%}
{{- '\n\n<|image|>\n\n' -}}
{%- set ns.prev_message_type = 'image' -%}
{%- elif item['type'] == 'audio' -%}
{{- '<|audio|>' -}}
{%- set ns.prev_message_type = 'audio' -%}
{%- elif item['type'] == 'video' -%}
{{- '\n\n<|video|>\n\n' -}}
{%- set ns.prev_message_type = 'video' -%}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{%- if not (message['tool_responses'] and not message['content']) -%}
{{- '<turn|>\n' -}}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
{%- if ns.prev_message_type != 'tool_response' -%}
{{- '<|turn>model\n' -}}
{%- endif -%}
{%- if not enable_thinking | default(false) -%}
{{- '<|channel>thought\n<channel|>' -}}
{%- endif -%}
{%- endif -%}
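As a quick orientation, a hedged sketch of what this template renders for a single user message "Hi" with no tools, no system prompt, thinking disabled, and add_generation_prompt set (assuming bos_token renders as <bos>):

    <bos><|turn>user
    Hi<turn|>
    <|turn>model
    <|channel>thought
    <channel|>

The trailing empty thought channel is emitted because enable_thinking defaults to false, cueing the model to skip reasoning in its reply.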


@@ -73,6 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
{ "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR },
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
@@ -216,6 +217,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
} else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
return LLM_CHAT_TEMPLATE_OPENAI_MOE;
} else if (tmpl_contains("<hy_Assistant>") && tmpl_contains("<hy_begin▁of▁sentence>")) {
return LLM_CHAT_TEMPLATE_HUNYUAN_OCR;
} else if (tmpl_contains("<hy_Assistant>") && tmpl_contains("<hy_place▁holder▁no▁3>")) {
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
@@ -822,6 +825,22 @@ int32_t llm_chat_apply_template(
ss << "<hy_User>" << chat[i]->content << "<hy_Assistant>";
}
}
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) {
// tencent/HunyuanOCR
ss << "<hy_begin▁of▁sentence>";
for (size_t i = 0; i < chat.size(); i++) {
std::string role(chat[i]->role);
if (i == 0 && role == "system") {
ss << chat[i]->content << "<hy_place▁holder▁no▁3>";
continue;
}
if (role == "user") {
ss << chat[i]->content << "<hy_User>";
} else if (role == "assistant") {
ss << chat[i]->content << "<hy_Assistant>";
}
}
} else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
// moonshotai/Kimi-K2-Instruct
for (auto message : chat) {
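For example, the LLM_CHAT_TEMPLATE_HUNYUAN_OCR branch above renders the message list [system "S", user "U", assistant "A"] as:

    <hy_begin▁of▁sentence>S<hy_place▁holder▁no▁3>U<hy_User>A<hy_Assistant>

Note the content-before-role convention: each message's text precedes its role marker, unlike the role-first layout used by most other templates in this file.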


@@ -53,6 +53,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
LLM_CHAT_TEMPLATE_OPENAI_MOE,
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
LLM_CHAT_TEMPLATE_HUNYUAN_OCR,
LLM_CHAT_TEMPLATE_KIMI_K2,
LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_GROK_2,


@@ -128,7 +128,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
case GGUF_TYPE_BOOL: return ((const int8_t *)data)[i] != 0 ? "true" : "false";
default: return format("unknown type %d", type);
}
}


@@ -374,8 +374,9 @@ namespace GGUFMeta {
}
} else {
if (arr_info.gt == GGUF_TYPE_BOOL) {
std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) {
return static_cast<T>(x);
const int8_t * values = (const int8_t *) arr_info.data;
std::transform(values, values + arr_info.length, result.begin(), [](int8_t x) {
return static_cast<T>(x != 0);
});
} else {
std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
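Both BOOL fixes follow the same idea: GGUF serializes BOOL as one byte per element, so the loader should read raw bytes and normalize them, rather than dereferencing a bool pointer, whose object representation for byte values other than 0/1 is not portable. A minimal sketch of the pattern (hypothetical helper, for illustration):

    #include <cstddef>
    #include <cstdint>

    // Read n GGUF BOOL elements from raw file data into out.
    static void read_gguf_bools(const void * data, size_t n, bool * out) {
        const int8_t * bytes = static_cast<const int8_t *>(data);
        for (size_t i = 0; i < n; ++i) {
            out[i] = bytes[i] != 0;  // any nonzero byte counts as true
        }
    }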


@@ -1279,6 +1279,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_EMBEDDING_LENGTH_PER_LAYER, hparams.n_embd_per_layer);
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
switch (hparams.n_layer) {
case 35: type = LLM_TYPE_E2B; break;
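For context, f_final_logit_softcapping is the cap applied to the final logits for Gemma-family models elsewhere in llama.cpp: a scaled tanh that bounds every logit to (-cap, cap). A sketch of the formula:

    #include <cmath>

    // y = cap * tanh(x / cap); close to the identity for |x| << cap.
    static float softcap(float x, float cap) {
        return cap * std::tanh(x / cap);
    }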


@@ -2551,6 +2551,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<|end_of_text|>"
|| t.first == "<end_of_utterance>" // smoldocling
|| t.first == "<turn|>" // gemma4
|| t.first == "<|tool_response>" // gemma4
|| t.first == "<end▁of▁sentence>" // deepseek-ocr
) {
special_eog_ids.insert(t.second);


@@ -753,6 +753,35 @@ static std::vector<size_t> unicode_regex_split_custom_afmoe(const std::string &
return bpe_offsets;
}
// regex: [^\n]+|[\n]+
// splits text into runs of non-newline characters and runs of newline characters
static std::vector<size_t> unicode_regex_split_custom_newlines(const std::string & text, const std::vector<size_t> & offsets) {
std::vector<size_t> bpe_offsets;
bpe_offsets.reserve(offsets.size());
const auto cpts = unicode_cpts_from_utf8(text);
size_t start = 0;
for (auto offset : offsets) {
const size_t offset_ini = start;
const size_t offset_end = start + offset;
assert(offset_end <= cpts.size());
start = offset_end;
size_t pos = offset_ini;
while (pos < offset_end) {
const bool is_newline = (cpts[pos] == '\n');
const size_t run_start = pos;
while (pos < offset_end && (cpts[pos] == '\n') == is_newline) {
pos++;
}
bpe_offsets.push_back(pos - run_start);
}
}
return bpe_offsets;
}
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
std::vector<size_t> bpe_offsets;
@@ -769,6 +798,8 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
} else if (regex_expr == "\\p{AFMoE_digits}") {
// AFMOE digit pattern - use custom implementation for proper splitting
bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
} else if (regex_expr == "[^\\n]+|[\\n]+") {
bpe_offsets = unicode_regex_split_custom_newlines(text, offsets);
} else if (regex_expr == "\\d{1,3}(?=(?:\\d{3})*\\b)") {
// tiny_aya digit grouping pattern from tokenizer.json:
// {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"}


@@ -1976,10 +1976,24 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
{
// Google Gemma 4 (tool calling with Gemma4 dict format)
auto tst = peg_tester("models/templates/gemma4.jinja");
auto tst = peg_tester("models/templates/google-gemma-4-31B-it.jinja");
tst.test("Hello, world!").expect(simple_assist_msg("Hello, world!")).run();
// Reasoning and content
tst.test(
"<|channel>thought\nI'm\nthinking<channel|>Hello, world!\nWhat's up?")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.expect(message_assist_thoughts)
.run();
// Reasoning and content with reasoning_format = none
tst.test(
"<|channel>thought\nI'm\nthinking<channel|>Hello, world!\nWhat's up?")
.reasoning_format(COMMON_REASONING_FORMAT_NONE)
.expect_content("<|channel>thought\nI'm\nthinking<channel|>Hello, world!\nWhat's up?")
.run();
// Simple tool call with string argument
tst.test(
"<|tool_call>call:get_time{city:<|\"|>London<|\"|>}<tool_call|>")


@@ -19,6 +19,7 @@ add_library(mtmd
models/conformer.cpp
models/gemma4v.cpp
models/glm4v.cpp
models/hunyuanocr.cpp
models/internvl.cpp
models/kimivl.cpp
models/kimik25.cpp


@@ -148,6 +148,11 @@
#define TN_TOK_BOI "v.boi"
#define TN_TOK_EOI "v.eoi"
// hunyuanocr
#define TN_MM_PRE_NORM "mm.pre_norm.%s"
#define TN_TOK_IMG_BEGIN "mm.image_begin"
#define TN_TOK_IMG_END "mm.image_end"
// deepseek-ocr
#define TN_SAM_POS_EMBD "v.sam.pos_embd.%s"
#define TN_SAM_PATCH_EMBD "v.sam.patch_embd.%s"
@@ -266,6 +271,7 @@ enum projector_type {
PROJECTOR_TYPE_YOUTUVL,
PROJECTOR_TYPE_KIMIK25,
PROJECTOR_TYPE_NEMOTRON_V2_VL,
PROJECTOR_TYPE_HUNYUANOCR,
PROJECTOR_TYPE_UNKNOWN,
};
@@ -306,6 +312,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
{ PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {
@@ -515,7 +522,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
case GGUF_TYPE_BOOL: return ((const int8_t *)data)[i] != 0 ? "true" : "false";
default: return string_format("unknown type %d", type);
}
}


@@ -358,7 +358,8 @@ struct clip_model {
// MINICPMV projection
ggml_tensor * mm_model_pos_embed_k = nullptr;
ggml_tensor * mm_model_query = nullptr;
ggml_tensor * mm_model_proj = nullptr;
ggml_tensor * mm_model_proj = nullptr;
ggml_tensor * mm_model_proj_b = nullptr;
ggml_tensor * mm_model_kv_proj = nullptr;
ggml_tensor * mm_model_attn_q_w = nullptr;
ggml_tensor * mm_model_attn_q_b = nullptr;
@@ -419,6 +420,11 @@ struct clip_model {
ggml_tensor * mm_boi = nullptr;
ggml_tensor * mm_eoi = nullptr;
// hunyuanocr perceiver
ggml_tensor * mm_pre_norm_w = nullptr;
ggml_tensor * mm_img_begin = nullptr;
ggml_tensor * mm_img_end = nullptr;
// deepseek ocr sam
ggml_tensor * patch_embed_proj_w = nullptr;
ggml_tensor * patch_embed_proj_b = nullptr;


@@ -902,6 +902,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
{
builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
} break;
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM:
case PROJECTOR_TYPE_LDP:
@@ -1408,6 +1412,14 @@ struct clip_model_loader {
get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
{
hparams.n_merge = 2;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
hparams.set_warmup_n_tokens(28*28);
} break;
case PROJECTOR_TYPE_LFM2A:
{
// audio preprocessing params
@@ -2035,6 +2047,22 @@ struct clip_model_loader {
model.mm_boi = get_tensor(TN_TOK_BOI);
model.mm_eoi = get_tensor(TN_TOK_EOI);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
{
// proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
model.mm_model_proj_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias"));
model.mm_pre_norm_w = get_tensor(string_format(TN_MM_PRE_NORM, "weight"));
model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
model.mm_img_begin = get_tensor(TN_TOK_IMG_BEGIN);
model.mm_img_end = get_tensor(TN_TOK_IMG_END);
model.image_newline = get_tensor(TN_IMAGE_NEWLINE);
model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR, false);
} break;
case PROJECTOR_TYPE_JANUS_PRO:
{
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
@@ -2584,6 +2612,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_YOUTUVL:
return (img->nx / params.patch_size) / 2;
default:
@@ -2768,6 +2797,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
n_patches = h * (h + 1) + 1;
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
{
int merge = ctx->model.hparams.n_merge;
int ow = (img->nx / patch_size) / merge;
int oh = (img->ny / patch_size) / merge;
n_patches = (ow + 1) * oh + 2;
} break;
case PROJECTOR_TYPE_LFM2A:
{
n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
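To make the HunyuanOCR count concrete: with a hypothetical patch_size of 16 and the default merge of 2, a 1024x512 image gives ow = (1024/16)/2 = 32 and oh = (512/16)/2 = 16, so n_patches = (32 + 1) * 16 + 2 = 530. The +1 per row accounts for the appended image_newline embedding; the +2 are the image begin/end tokens.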
@@ -3175,6 +3211,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_PHI4:
case PROJECTOR_TYPE_COGVLM:
case PROJECTOR_TYPE_HUNYUANOCR:
{
// do nothing
} break;
@@ -3346,6 +3383,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_KIMIK25:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_HUNYUANOCR:
return ctx->model.mm_model_proj->ne[1];
case PROJECTOR_TYPE_COGVLM:
return ctx->model.mm_4h_to_h_w->ne[1];
case PROJECTOR_TYPE_DEEPSEEKOCR:


@@ -0,0 +1,59 @@
#include "models.h"
ggml_cgraph * clip_graph_hunyuanocr::build() {
const int merge = hparams.n_merge;
const int pw = n_patches_x;
const int ph = n_patches_y;
ggml_tensor * pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
ggml_tensor * inp = build_inp();
ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr);
// perceiver projector
cur = build_norm(cur, model.mm_pre_norm_w, nullptr, NORM_TYPE_RMS, eps, -1);
// [C, W*H] -> [W, H, C] for conv2d
cur = ggml_reshape_3d(ctx0, cur, n_embd, pw, ph);
cur = ggml_permute(ctx0, cur, 2, 0, 1, 3);
cur = ggml_cont(ctx0, cur);
// Conv2d(1152->2304, k=2, s=2) + GELU + Conv2d(2304->4608, k=1, s=1)
cur = ggml_conv_2d(ctx0, model.mm_0_w, cur, merge, merge, 0, 0, 1, 1);
if (model.mm_0_b) {
cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_0_b, 1, 1, model.mm_0_b->ne[0]));
}
cur = ggml_gelu(ctx0, cur);
cur = ggml_conv_2d(ctx0, model.mm_1_w, cur, 1, 1, 0, 0, 1, 1);
if (model.mm_1_b) {
cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_1_b, 1, 1, model.mm_1_b->ne[0]));
}
const int ow = pw / merge;
const int oh = ph / merge;
const int idim = (int)cur->ne[2]; // OC = 4608
// append newline along W (dim 0)
ggml_tensor * nl = ggml_reshape_4d(ctx0, model.image_newline, 1, 1, idim, 1);
nl = ggml_repeat_4d(ctx0, nl, 1, oh, idim, 1);
cur = ggml_concat(ctx0, cur, nl, 0);
// [OW+1, OH, OC] -> [OC, (OW+1)*OH]
cur = ggml_permute(ctx0, cur, 1, 2, 0, 3);
cur = ggml_cont_2d(ctx0, cur, idim, (ow + 1) * oh);
// project to LLM hidden size
cur = build_mm(model.mm_model_proj, cur);
if (model.mm_model_proj_b) {
cur = ggml_add(ctx0, cur, model.mm_model_proj_b);
}
// wrap with begin/end tokens
cur = ggml_concat(ctx0, ggml_reshape_2d(ctx0, model.mm_img_begin, model.mm_img_begin->ne[0], 1), cur, 1);
cur = ggml_concat(ctx0, cur, ggml_reshape_2d(ctx0, model.mm_img_end, model.mm_img_end->ne[0], 1), 1);
cur = build_norm(cur, model.mm_post_norm_w, nullptr, NORM_TYPE_RMS, eps, -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
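Tracing the shapes through this graph (using the 1152/2304/4608 channel counts from the comment, and ggml's fastest-dim-first layout): the ViT output [1152, pw*ph] is reshaped and permuted to [pw, ph, 1152]; the stride-2 conv halves the grid to [ow, oh, 2304]; the 1x1 conv widens it to [ow, oh, 4608]; the per-row newline concat yields [ow+1, oh, 4608]; the permute/cont flattens to [4608, (ow+1)*oh]; the linear projection maps to the LLM embedding width; and the begin/end token rows bring the sequence to (ow+1)*oh + 2, matching clip_n_output_tokens above.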


@@ -98,6 +98,11 @@ struct clip_graph_glm4v : clip_graph {
ggml_cgraph * build() override;
};
struct clip_graph_hunyuanocr : clip_graph {
clip_graph_hunyuanocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_mobilenetv5 : clip_graph {
clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;


@@ -406,6 +406,13 @@ struct mtmd_context {
img_end = "\n"; // prevent empty batch on llama-server
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
{
// note: these use the fullwidth bar ｜ (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
img_beg = "<hy_place▁holder▁no▁100>";
img_end = "<hy_place▁holder▁no▁101>";
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
} break;
default:
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
}


@@ -155,8 +155,8 @@ struct server_slot {
int64_t t_start_process_prompt;
int64_t t_start_generation;
double t_prompt_processing; // ms
double t_token_generation; // ms
double t_prompt_processing = 0.0; // ms
double t_token_generation = 0.0; // ms
std::function<void(int /* id_slot */)> callback_on_release;


@@ -261,14 +261,14 @@ struct result_timings {
int32_t cache_n = -1;
int32_t prompt_n = -1;
double prompt_ms;
double prompt_per_token_ms;
double prompt_per_second;
double prompt_ms = 0.0;
double prompt_per_token_ms = 0.0;
double prompt_per_second = 0.0;
int32_t predicted_n = -1;
double predicted_ms;
double predicted_per_token_ms;
double predicted_per_second;
double predicted_ms = 0.0;
double predicted_per_token_ms = 0.0;
double predicted_per_second = 0.0;
// Optional speculative metrics - only included when > 0
int32_t draft_n = 0;


@@ -108,10 +108,8 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency());
LOG_INF("\n");
LOG_INF("build_info: %s\n", build_info.c_str());
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
LOG_INF("\n");
server_http_context ctx_http;
if (!ctx_http.init(params)) {