Compare commits

...

126 Commits
b8230 ... b8356

Author SHA1 Message Date
Bartowski
b9da4444df ggml : guard against sumq2 being 0 in IQ4_NL (#20460) 2026-03-15 10:47:28 +02:00
PikaPikachu
617db241aa cuda : add RDNA4-specific MMVQ parameter table for bs=1 decode (#19478)
* mmvq: add RDNA3/RDNA4-specific parameter table (nwarps=8, rows=1)

* mmvq: add dedicated RDNA3 parameter table

* mmvq: exclude RDNA3.5 (gfx1150/1151) from RDNA3 table
2026-03-15 08:33:39 +01:00
Ruben Ortlam
1a3d8edbba vulkan: use graphics queue on AMD (#20551)
* vulkan: use graphics queue on AMD for slightly better performance

* disable async transfer queue on AMD
2026-03-15 08:18:54 +01:00
sprayandwipe
6b10a82c00 kv-cache : fix reading llama_kv_cell_ext during state read (#20273)
Co-authored-by: sid <sid@ragingfist.net>
2026-03-15 09:11:19 +02:00
Michael Wand
d23355afc3 model : wire up Qwen3.5/Qwen3.5MoE tensors for NVFP4 support (#20506) 2026-03-14 22:44:42 +01:00
Georgi Gerganov
b30a5fdf37 metal : add FA specialization for HSK = 320, HSV = 256 (#20549) 2026-03-14 23:15:47 +02:00
Georgi Gerganov
b4768955c4 ci : move self-hosted workflows to separate files (#20540) 2026-03-14 23:15:35 +02:00
Gerard Guillemas Martos
fc350fdf96 docker : force Python 3.13 in Vulkan container (#20530)
* ci: force Python 3.13 in Vulkan container

* remove unnecessary `update-alternatives` line
2026-03-14 21:37:09 +01:00
Eve
3a6f059909 ci : try to optimize some jobs (#20521)
* force arm version to test

* run on either x86 or arm if we can help it, this only works for runs without ccache

* readd other jobs

* remove ccache
2026-03-14 20:27:52 +01:00
Max Krasnyansky
609ea50026 hexagon: Q4_0 and MXFP4 repack fixes (#20527)
* hexagon: fix tail corruption with rows sizes not multiple of 256

* hexagon: use different stride for repacking partial blocks

* hex-mm: update repack and kernels to avoid shuffles for full 256-element blocks

Previous commit changed the repacking to use even:odd (0:1,2:3,..) packing
instead of the original (0:128,1:129,...) packing in order to fix tail corruption.
Since the mm kernels already deal with partial tails we can use even:odd
packing only for the last block.
This avoids the performance penalty of having to shuffle to zip the elements
in the common case.

* hex-mm: update rmpy x8 for better optimizations

* hex-mm: tighten supported MUL_MAT checks to avoid spurious failures

* hex-mm: use vzero to init accumulators

* hex-mm: properly call partial rmpy_x8
2026-03-14 11:09:08 -07:00
Georgi Gerganov
9f774e45ee ci : reduce webgpu tests timeout to 900s (#20538)
[no ci]
2026-03-14 17:08:26 +02:00
Xuan-Son Nguyen
94d0262277 mtmd: add llama-mtmd-debug binary (#20508)
* mtmd: add llama-mtmd-debug binary

* adapt

* fixes

* fix compile error

* fix windows compile error

* rm legacy clip_debug_encode()

* add MTMD_API to fix build
2026-03-14 15:52:29 +01:00
Neo Zhang
a93c0ef0fa add op gated_delta_net (#20455) 2026-03-14 22:01:57 +08:00
Chedrian07
710878a7dd webui: restore code preview iframe origin isolation (#20477) 2026-03-14 11:28:28 +01:00
Adrien Gallouët
0685848bc6 scripts : remove get-wikitext-103.sh (#20543)
It doesn't work and no one seems to use it.

    $ wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
    HTTP request sent, awaiting response... 301 Moved Permanently
    Location: unspecified
    ERROR: Redirection (301) without location.

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-03-14 11:22:04 +01:00
Adrien Gallouët
0024a69b70 scripts : update get-hellaswag.sh and get-winogrande.sh (#20542)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-03-14 11:21:50 +01:00
Adrien Gallouët
d0b79aaa2f ggml : add native AVX512-FP16 support for F16 operations (#20529)
The overall benchmark speed remains almost the same because the CPU is
now calculating faster than the RAM can deliver the data. (See perf stat
results below showing 2.7 billion fewer instructions).

Also note that this path will be only enabled for native build or with
custom flags.

now:
```
 Performance counter stats for 'build/bin/llama-bench -m Qwen3-0.6B-f16.gguf -p 512 -n 128':

        189,073.52 msec task-clock                       #   14.658 CPUs utilized
               404      context-switches                 #    2.137 /sec
                19      cpu-migrations                   #    0.100 /sec
           372,390      page-faults                      #    1.970 K/sec
   310,877,195,595      instructions                     #    0.54  insn per cycle
   581,071,530,602      cycles                           #    3.073 GHz
    19,352,107,994      branches                         #  102.352 M/sec
        48,304,438      branch-misses                    #    0.25% of all branches
    84,998,431,152      L1-dcache-loads                  #  449.552 M/sec
    12,186,410,279      L1-dcache-load-misses            #   14.34% of all L1-dcache accesses

      12.899358742 seconds time elapsed

     187.823044000 seconds user
       1.253416000 seconds sys
```

before:
```
 Performance counter stats for 'build/bin/llama-bench -m Qwen3-0.6B-f16.gguf -p 512 -n 128':

        190,594.56 msec task-clock                       #   14.652 CPUs utilized
               436      context-switches                 #    2.288 /sec
                22      cpu-migrations                   #    0.115 /sec
           372,782      page-faults                      #    1.956 K/sec
   313,574,921,966      instructions                     #    0.54  insn per cycle
   586,064,970,425      cycles                           #    3.075 GHz
    19,585,778,563      branches                         #  102.761 M/sec
        48,437,488      branch-misses                    #    0.25% of all branches
    86,219,336,628      L1-dcache-loads                  #  452.370 M/sec
    12,232,085,771      L1-dcache-load-misses            #   14.19% of all L1-dcache accesses

      13.007923164 seconds time elapsed

     189.395316000 seconds user
       1.202612000 seconds sys
```

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-03-14 10:06:14 +01:00
Wallentri
f2c0dfb739 Use fp32 in cuBLAS V100 to avoid overflows, env variables to override cuBLAS compute type (#19959)
* Update ggml-cuda.cu

* Update ggml-cuda.cu

* Update build.md

* Update build.md

* Update ggml/src/ggml-cuda/ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

* Update build.md

* Update ggml/src/ggml-cuda/ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update build.md

* Update ggml-cuda.cu

* Update ggml-cuda.cu

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2026-03-14 15:43:13 +08:00
Zijun Yu
9789c4ecdc ggml : add OpenVINO backend (#15307)
* Update build doc

* Add cgraph tensor output name to OV op name

* Update openvino build instructions

* Add initial NPU support

* draft NPU support version 2: prefill + kvcache

* NPU support version 2: prefill + kvcache

* Change due to ggml cgraph changes, not correct yet

* Change due to ggml cgraph changes, llama-3.2 CPU work

* Add AMD64 to CMakeLists

* Change due to ggml cgraph changes, all device work

* Refactor: clean, fix warning

* Update clang-format

* Statful transformation for CPU GPU

* Add SwiGLU

* Fuse to SDPA

* Replace Concat with Broadcast in MulMat for GQA

* Pull out indices creation for kv cache update

* Refactor: remove past_token_len from extra_inputs

* Fix Phi3 SwiGLU and SoftMax

* Pull out sin cos from rope

* Reduce memory: free ov weights node after graph conversion

* Fix CPY due to cgraph change

* Added OpenVINO CI/CD. Updated docs

* Fix llama-cli

* Fix Phi3 ROPE; Add test-backend-ops

* Fix NPU

* Fix llama-bench; Clang-format

* Fix llama-perplexity

* temp. changes for mark decomp

* matmul in fp32

* mulmat input conversion fix

* mulmat type conversion update

* add mark decomp pass

* Revert changes in fuse_to_sdpa

* Update build.md

* Fix test-backend-ops

* Skip test-thread-safety; Run ctest only in ci/run.sh

* Use CiD for NPU

* Optimize tensor conversion, improve TTFT

* Support op SET_ROWS

* Fix NPU

* Remove CPY

* Fix test-backend-ops

* Minor updates for raising PR

* Perf: RMS fused to OV internal RMS op

* Fix after rebasing

- Layout of cache k and cache v are unified: [seq, n_head, head_size]
- Add CPY and FLASH_ATTN_EXT, flash attn is not used yet
- Skip test-backend-ops due to flash attn test crash
- Add mutex around graph conversion to avoid test-thread-safety fail in the future
- Update NPU config
- Update GPU config to disable SDPA opt to make phi-3 run

* Change openvino device_type to GPU; Enable flash_attn

* Update supports_buft and supports_op for quantized models

* Add quant weight conversion functions from genai gguf reader

* Quant models run with accuracy issue

* Fix accuracy: disable cpu_repack

* Fix CI; Disable test-backend-ops

* Fix Q4_1

* Fix test-backend-ops: Treat quantized tensors as weights

* Add NPU Q4_0 support

* NPU perf: eliminate zp

* Dequantize q4_1 q4_k q6_k for NPU

* Add custom quant type: q8_1_c, q4_0_128

* Set m_is_static=false as default in decoder

* Simplify translation of get_rows

* Fix after rebasing

* Improve debug util; Eliminate nop ReshapeReshape

* STYLE: make get_types_to_requant a function

* Support BF16 model

* Fix NPU compile

* WA for npu 1st token acc issue

* Apply EliminateZP only for npu

* Add GeGLU

* Fix Hunyuan

* Support iSWA

* Fix NPU accuracy

* Fix ROPE accuracy when freq_scale != 1

* Minor: not add attention_size_swa for non-swa model

* Minor refactor

* Add Q5_K to support phi-3-q4_k_m

* Requantize Q6_K (gs16) to gs32 on GPU

* Fix after rebasing

* Always apply Eliminate_ZP to fix GPU compile issue on some platforms

* kvcachefusion support

* env variable GGML_OPENVINO_DISABLE_SDPA_OPTIMIZATION added

* Fix for Phi3

* Fix llama-cli (need to run with --no-warmup)

* Fix add_sliced_mask; Revert mulmat, softmax; Remove input attention_size, iSWA model not working

* fix after rebasing

* Fix llama-3-8b and phi3-mini q4_0 NPU

* Update to OV-2025.3 and CMakeLists.txt

* Add OV CI cache

* Apply CISC review and update CI to OV2025.3

* Update CI to run OV dep install before build

* Update OV dockerfile to use OV2025.3 and update build docs

* Style: use switch in supports_ops

* Style: middle ptr and ref align, omit optional struct keyword

* NPU Unify PD (#14)

* Stateless. Fix llama-cli llama-server

* Simplify broadcast op in attention

* Replace get_output_tensor+memcpy with set_output_tensor

* NPU unify PD. Unify dynamic and static dims

* Clean placeholders in ggml-openvino.cpp

* NPU unify PD (handled internally)

* change graph to 4d, support multi sequences

* Fix llama-bench

* Fix NPU

* Update ggml-decoder.cpp

Hitting error while compiling on windows:

error C3861: 'unsetenv': identifier not found

Reason: unsetenv() is a POSIX function; it doesn’t exist on Windows. Visual Studio (MSVC) won’t recognize it.

Proposed fix: Use _putenv_s() (Windows equivalent)
This is supported by MSVC and achieves the same effect: it removes the environment variable from the process environment.

This keeps cross-platform compatibility.

* Update ggml-decoder.cpp

* Update ggml-decoder.cpp

* Update ggml-decoder.cpp

* Update ggml-decoder.cpp

* Update ggml-decoder.cpp

* Remove the second decoder for node. Moving the function into the model decoder

* Fix error for naive

* NPU prefill chunking

* NPU fix llama-bench

* fallback naive run with accuracy issue

* NPU support llama-perplexity -b 512 --no-warmup

* Refactor: split ov_graph_compute for dynamic and static

* remove unused API GgmlOvDecoder::get_output_stride(const std::string & name)

* minor update due to ov 2025.4

* remove unused API GgmlOvDecoder::get_output_names()

* remove unused API get_output_shape(const std::string & name)

* Modified API GgmlOvDecoder::get_output_type(const std::string & name)

* Removed API GgmlOvDecoder::get_output_op_params(const std::string & name)

* Removed API get_output_ggml_tensor(const std::string & name)

* Removed API m_outputs

* Removed m_output_names

* Removed API GgmlOvDecoder::get_input_names()

* Removed API GgmlOvDecoder::get_input_stride(const std::string& name)

* Removed API get_input_type

* Removed API get_input_type

* Removed API GgmlOvDecoder::get_input_shape(const std::string & name)

* Removed API GgmlOvDecoder::get_input_op_params(const std::string & name)

* Fix error for decoder cache

* Reuse cached decoder

* GPU remove Q6_K requantization

* NPU fix wrong model output shape

* NPU fix q4 perf regression

* Remove unused variable nodes

* Fix decoder can_reuse for llama-bench

* Update build.md for Windows

* backend buffer: allocate on host

* Use shared_buffer for GPU NPU; Refactor

* Add ov_backend_host_buffer; Use cached remote context

* Put kvcache on GPU

* Use ggml_aligned_malloc

* only use remote tensor for kvcache

* only use remote tensor for kvcache for GPU

* FIX: use remote tensor from singleton

* Update build.md to include OpenCL

* NPU always requant to q4_0_128

* Optimize symmetric quant weight extraction: use single zp

* Use Q8_0_C in token embd, lm_head, and for 5 and 6 bits quant

* Update build.md

* Support -ctk f32

* Initial stateful graph support

* Update ggml/src/ggml-openvino/ggml-decoder.cpp

Co-authored-by: Yamini Nimmagadda <yamini.nimmagadda@intel.com>

* code cleanup

* npu perf fix

* requant to f16 for Q6 embed on NPU

* Update ggml/src/ggml-openvino/ggml-decoder.cpp

* Update ggml/src/ggml-openvino/ggml-openvino-extra.cpp

* Create OPENVINO.md in llama.cpp backend docs

* Update OPENVINO.md

* Update OPENVINO.md

* Update OPENVINO.md

* Update build.md

* Update OPENVINO.md

* Update OPENVINO.md

* Update OPENVINO.md

* kq_mask naming fix

* Syntax correction for workflows build file

* Change ov backend buffer is_host to false

* Fix llama-bench -p -n where p<=256

* Fix --direct-io 0

* Don't put kvcache on GPU in stateful mode

* Remove hardcode names

* Fix stateful shapes

* Simplification for stateful and update output shape processing

* Remove hardcode names

* Avoid re-compilation in llama-bench

* Extract zp directly instead of bias

* Refactor weight tensor processing

* create_weight_node accept non-ov backend buffer

* remove changes in llama-graph.cpp

* stateful masking fix (#38)

Fix for stateful accuracy issues and cl_out_of_resources error in stateful GPU with larger context sizes.

* Fix test-backend-ops crash glu, get_rows, scale, rms_norm, add

* hardcoded name handling for rope_freqs.weight

* Suppress logging and add error handling to allow test-backend-ops to complete

* Fix MUL_MAT with broadcast; Add unsupported MUL_MAT FLASH_ATTN cases

* Use bias instead of zp in test-backend-ops

* Update OV in CI, Add OV CI Tests in GH Actions

* Temp fix for multithreading bug

* Update OV CI, fix review suggestions.

* fix editorconfig-checker, update docs

* Fix tabs to spaces for editorconfig-checker

* fix editorconfig-checker

* Update docs

* updated model link to be GGUF model links

* Remove GGML_CPU_REPACK=OFF

* Skip permuted ADD and MUL

* Removed static variables from utils.cpp

* Removed initializing non-existing variable

* Remove unused structs

* Fix test-backend-ops for OV GPU

* unify api calling

* Update utils.cpp

* When the dim is dynamic, throw an error; it needs to be static first

* Add interface compute_model_outputs(), which get the model output through computing the node use count & status in the cgraph to avoid the flag using

* No need to return

* Fix test-backend-ops for OV GPU LNL

* Fix test-thread-safety

* use the shape from infer request of output tensor create to avoid issue

* fix dynamic output shape  issue

* fix issue for the unused node in tests

* Remove unused lock

* Add comment

* Update openvino docs

* update to OV release version 2026.0

* add ci ov-gpu self hosted runner

* fix editorconfig

* Fix perplexity

* Rewrite the model inputs finding mechanism  (#54)

* Rewrite the model inputs finding logistic

* Put stateful shape handle in get input shape

* Put the iteration logistic in func

* Added ggml-ci-intel-openvino-gpu and doc update

* .hpp files converted to .h

* fix ggml-ci-x64-intel-openvino-gpu

* Fix for stateful execution bug in llama-bench

* Minor updates after stateful llama-bench fix

* Update ggml/src/ggml-openvino/utils.cpp

Co-authored-by: Yamini Nimmagadda <yamini.nimmagadda@intel.com>

* Remove multiple get_shape calls

* Bring back mutex into compute

* Fix VIEW op, which slice the input node

* Added token_len_per_seq existence check before slicing masks and moved node retrieval inside guarded block to prevent missing-key access

* Temp. fix for test requant errors

* Update to OV ggml-ci to low-perf

* ci : temporary disable "test-llama-archs"

* ci : cache v4 -> v5, checkout v4 -> v6, fix runner tag

* docs : update url

* Fix OV link in docker and Update docs

---------

Co-authored-by: Ravi Panchumarthy <ravi.panchumarthy@intel.com>
Co-authored-by: Cavus Mustafa <mustafa.cavus@intel.com>
Co-authored-by: Arshath <arshath.ramzan@intel.com>
Co-authored-by: XuejunZhai <Xuejun.Zhai@intel.com>
Co-authored-by: Yamini Nimmagadda <yamini.nimmagadda@intel.com>
Co-authored-by: Xuejun Zhai <Xuejun.Zhai@intel>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-03-14 07:56:55 +02:00
Adrien Gallouët
77e20cc107 vendor : update cpp-httplib to 0.37.2 (#20484)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-03-14 06:51:02 +01:00
Rail Chabdarov
5a32a9b8a5 Fix data race in CUDA's "cpy" kernel (influences GGML's DUP, CONT operations). (#20507)
* Fix datarace in CUDA's "cpy" kernel.

* Remove extra barrier by using more of shared memory.
2026-03-14 13:19:44 +08:00
lhez
3b439504ba opencl: fix l2_norm (#20480) 2026-03-13 22:18:52 -07:00
Adrien Gallouët
463b6a963c tools : enable kvu in perplexity for hellaswag, winogrande, multiple-choice (#19954)
llama-perplexity -hf unsloth/Qwen3-0.6B-GGUF:Q4_K_M -f winogrande-debiased-eval.csv --winogrande

    winogrande_score : tokenizing selected tasks
    winogrande_score : calculating winogrande score over selected tasks.
    split_equal: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag)
    decode: failed to find a memory slot for batch of size 46
    failed to decode the batch, n_batch = 2048, ret = 1
    winogrande_score: llama_decode() failed

same for hellaswag:

    split_equal: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag)
    decode: failed to find a memory slot for batch of size 99
    failed to decode the batch, n_batch = 2048, ret = 1
    hellaswag_score: llama_decode() failed

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-03-13 21:25:57 +01:00
Georgi Gerganov
e30f1fdf74 graph : remove redundant GDN state transposes (#20443)
* ggml : transpose fused GDN state access for coalesced memory reads (#20436)

The fused Gated Delta Net kernel accessed the [S_v, S_v] state matrix
column-wise on row-major storage, causing strided reads (stride S_v =
128 floats = 512 bytes) that waste GPU cache bandwidth. This produced a
39% regression on Qwen3.5-9B (Metal, M4 Max) compared to the unfused
path.

Transpose the state indexing so threads read contiguously:
- Metal: s_ptr[is*S_v] -> s_ptr[is] (stride 1 vs S_v)
- CUDA:  curr_state[i*S_v+col] -> curr_state[col*S_v+i] (coalesced)
- CPU:   restructured loops for row-wise transposed access

Also add --fused-gdn [on|off|auto] CLI flag (mirrors --flash-attn) so
users can control fused GDN independently of auto-detection.

All GATED_DELTA_NET backend-ops tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* ggml : use SIMD dot products in CPU GDN kernel, couple AR/chunked fused flags

- Replace scalar inner loops with ggml_vec_dot_f32 for SIMD-optimized
  dot products in the CPU fused GDN kernel (delta and attention output)
- Couple fused_gdn_ar and fused_gdn_ch flags in auto-detection: if one
  path lacks device support, disable both to prevent state layout mismatch
  between transposed (fused) and non-transposed (unfused) formats

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* llama : revert fgdn argument changes

* graph : remove GDN state transposes

* vulkan : adapt

* cuda : remove obsolete smem code

---------

Co-authored-by: Paul Flynn <paul@arkavo.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Oliver Simons <osimons@nvidia.com>
2026-03-13 22:12:54 +02:00
Piotr Wilkin (ilintar)
1430c35948 common/parser: gracefully handle undetected tool parser, print error message. (#20286) 2026-03-13 20:56:10 +01:00
ZeroV0LT
f17b3be63f llama : fix pooling assertion crash in chunked GDN detection path (#20468)
* llama : fix pooling assertion crash in chunked GDN detection path

The chunked fused Gated Delta Net detection in sched_reserve() calls
graph_reserve(16*n_seqs, n_seqs, n_outputs, ...) where n_outputs = n_seqs.
This creates a dimension mismatch in build_pooling() for embedding models
with mean/rank pooling: build_inp_mean() creates a tensor with shape
[n_tokens=16*n_seqs, ...] while t_embd is reduced to [n_outputs=n_seqs, ...]
via out_ids, causing ggml_mul_mat to assert on ggml_can_mul_mat(a, b).

Fix: pass n_tokens as n_outputs in the chunked GDN graph reservation,
matching the pattern used by the pp/tg worst-case reservations.

Regression introduced by #20340 (d28961d).
Same class of bug as #12517, fixed by #12545.

* server : add mean pooling tests to embedding test suite

Add test_embedding_pooling_mean and test_embedding_pooling_mean_multiple
to cover the --pooling mean codepath, which was previously untested.

These tests would have caught the regression introduced by #20340 where
build_pooling() crashes with a ggml_mul_mat assertion due to mismatched
dimensions in the chunked GDN detection path.

---------

Co-authored-by: Domenico Crupi <domenico@zerovolt.it>
2026-03-13 20:53:42 +02:00
SoftwareRenderer
d7ba99c485 server: reset counter related to kill-switch on client error (#20513)
* server: reset kill-switch on client error

This avoids triggering a server kill switch.

If the client sends a request that exceeds the configured context size, an appropriate HTTP 400 response is provided and no tokens are generated.

However since no tokens are generated, update_slots() increments n_empty_consecutive. If the client sends 3 such messages in a row, the server terminates.

* moved counter reset as per recommendation

* cont : minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-03-13 19:58:09 +02:00
rehan-10xengineer
fbaa95bc29 ggml-cpu: add RVV vec dot kernels for quantization types (#18859)
* ggml-cpu: add rvv quantize_row_q8_K kernel

Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>

* ggml-cpu: add rvv vec_dot for iq4_nl, mxfp4, iq2_xxs

Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>

* ggml-cpu: add rvv vec_dot for iq4_xs, refactor

* ggml-cpu: remove ifunc for rvv vec dot

* ggml-cpu: add vec_dot for iq2_xs, iq3_xxs

Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>

* ggml-cpu: refactor quants.c

---------

Co-authored-by: taimur-10x <taimur.ahmad@10xengineers.ai>
Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>
Co-authored-by: Rehan Qasim <rehanbhatti0317@gmail.com>
2026-03-13 17:36:04 +02:00
Adrien Gallouët
b5e1212063 ggml : fix typo gmml (#20512)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-03-13 14:36:13 +01:00
Daniel Bevenius
8f974d2392 mtmd : rename mtmd_get_audio_bitrate to mtmd_get_audio_sample_rate (#20105)
This commit renames the function `mtmd_get_audio_bitrate` to
`mtmd_get_audio_sample_rate` to better reflect its purpose.

The motivation for this is that the function currently returns the audio
sample rate, not the bitrate (sample_rate × bit_depth × channels), and
that is how it is used in the code as well.

This is a breaking change, but I believe mtmd is still in
experimental/development phase so it might be alright to simply rename.
2026-03-13 12:30:02 +01:00
Piotr Wilkin (ilintar)
2948e6049a general: CONTRIBUTING.md - guidelines for quantization schemes (#19762)
* Guidelines for quantization schemes

* Update CONTRIBUTING.md

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Change required precision from Q8 to FP16/BF16

* Update CONTRIBUTING.md

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update CONTRIBUTING.md

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update CONTRIBUTING.md

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update CONTRIBUTING.md

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update CONTRIBUTING.md [no ci]

* Update CONTRIBUTING.md [no ci]

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2026-03-13 12:21:33 +01:00
Georgi Gerganov
73c9eb8ced metal : fix l2 norm scale (#20493) 2026-03-13 11:43:20 +02:00
Daniel Bevenius
983df142a9 convert : fix/suppress pyright errors (#20442)
* convert : fix/suppress pyright errors

This commit fixes the pyright errors that are generated by pyright for
convert_hf_to_gguf.py.

The motivation for this is that running this locally generates errors
that CI does not, and it can be difficult to spot new errors. One use
case is when working on new models which cannot be run in CI due to
privacy. Having the ability to run pyright locally would be helpful
in these cases.

In the linked issue there is the mention of switching to `ty` which I
don't know anything about but in the meantime I would appreciate if we
could suppress these errors for now, and later perhaps revert this
commit.

With this change there are no errors but there are 4 informations
messages if the `mistral_common` package is installed. The
`--level error` flag can be used to suppress them.

Resolves: https://github.com/ggml-org/llama.cpp/issues/20417
2026-03-13 06:00:52 +01:00
Georgi Gerganov
57819b8d4b llama : disable graph reuse with pipeline parallelism (#20463) 2026-03-12 21:04:13 +02:00
Alessandro de Oliveira Faria (A.K.A.CABELO)
557fe2d913 vendor : update cpp-httplib to 0.37.1 (#20390) 2026-03-12 13:57:06 +01:00
Piotr Wilkin (ilintar)
0e810413bb tests : use reasoning instead of reasoning_budget in server tests (#20432) 2026-03-12 13:41:01 +01:00
Ruben Ortlam
128142fe7d test-backend-ops: allow loading tests from file and parsing model operators into file (#19896)
* tests: allow loading test-backend-ops tests from json

* add error threshold based on op

* add error when file cannot be read

* add graph operator json extraction tool

* add nb parameter for non-contiguous input tensors

* fix view check

* only use view if non-contiguous/permuted, use C++ random instead of rand()

* replace internal API calls with public llama_graph_reserve call

* reduce test description length

* fix nb[0] not getting set for view

* add name to tests

* fix inplace error

* use text file instead of json

* move llama_graph_reserve function to new llama-ext header, move export-graph-ops to tests/

* fix missing declaration

* use pragma once

* fix indent

* fix Windows build
2026-03-12 13:26:00 +01:00
Daniel Bevenius
6de1bc631d common : update completion executables list [no ci] (#19934)
This commit updates the bash completion executables list, adding missing
executables and removing some that no longer exist.
2026-03-12 12:12:01 +01:00
Asbjørn Olling
0a10c34dc1 grammar: Fix grammar root symbol check (#19761)
* grammar: fix bad check for root symbol, correct error logging

* add tests to demonstrate root symbol check failure
2026-03-12 12:04:56 +01:00
ProgenyAlpha
deee23863b vulkan: add GATED_DELTA_NET op support (#20334)
* vulkan: add GATED_DELTA_NET op support

Implements the fused gated delta net recurrence as a Vulkan compute
shader with full support for scalar gate, KDA vector gate, GQA
broadcast, multi-token sequences, and permuted (non-contiguous) q/k
inputs. Specialization constants select head size (32/64/128) and
KDA mode at pipeline creation time.

Passes all 13 test-backend-ops cases on AMD Radeon 890M (RADV GFX1150).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* vulkan: optimize GATED_DELTA_NET shader (Phase 1)

- vec4 dot products on all inner loops (dp4 hardware intrinsic)
- Cache exp(g) in shared memory for KDA path, eliminating ~32K
  redundant global reads and ~16K redundant exp() calls per token
- vec4 fused decay + rank-1 update (3 vec4 ops vs 12 scalar ops)
- Add perf benchmark cases for GATED_DELTA_NET to test-backend-ops

KDA TG: +5.4% throughput. Non-KDA: no regressions.
13/13 test-backend-ops passing on AMD Radeon 890M (RADV GFX1150).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* vulkan: address review feedback for GATED_DELTA_NET

Pipeline array refactor [3][2], A_TYPE/D_TYPE/FLOAT_TYPE shader macros,
scale in push constants, supports_op fix, dispatch restructuring.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* vulkan: use FLOAT_TYPE for buffer/shared declarations, align formatting

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* vulkan: add explicit FLOAT_TYPE casts for buffer loads

Wrap data_q, data_k, and data_g buffer reads with FLOAT_TYPE() casts
to ensure correct behavior across all Vulkan configurations.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* vulkan: fix Q/K broadcast for interleaved head layout

Adapt to the interleaved broadcast convention from #20340:
head_id / rq1 → head_id % neq1

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Progeny Alpha <ProgenyAlpha@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 11:32:04 +01:00
Sigbjørn Skjæret
c3e3f9e533 convert : better mtp check and fix return [no ci] (#20419) 2026-03-12 10:04:20 +01:00
ProgenyAlpha
40c550d4f6 vulkan: fix SSM_CONV PP scaling with large ubatch sizes (#20379)
* vulkan: optimize SSM_CONV workgroup dispatch for large ubatch

Tile tokens into 2D workgroups (32x16) to reduce workgroup launch
overhead at large ubatch sizes. Add vec4 fast path for nc=4 (common
d_conv size). Fixes PP performance degradation with ubatch > 512.

Ref: ggml-org/llama.cpp#18725

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* vulkan: remove unused shared memory declaration in SSM_CONV

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Progeny Alpha <ProgenyAlpha@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 10:03:18 +01:00
Pascal
de190154c8 New conversations now auto-select the first loaded model (#20403)
* webui: auto-select first loaded model for new conversations in router mode

* chore: update webui build output
2026-03-12 09:07:05 +01:00
Masashi Yoshimura
05039967da ggml-virtgpu: Fix some build commands (#20341) 2026-03-12 15:47:45 +08:00
Georgi Gerganov
e4cff0956b metal : avoid divisions in bin kernel (#20426)
* metal : avoid modulus in bin kernel when not broadcasting

* metal : fix capture_started flag
2026-03-12 09:42:40 +02:00
Masato Nakasaka
4cc6eb158c ci: Setup self-hosted CI for Intel Linux Vulkan backend (#20154) 2026-03-12 06:43:22 +01:00
Jeff Bolz
246ffc4b05 vulkan: fix l2_norm epsilon handling (#20350) 2026-03-12 06:39:41 +01:00
Jeff Bolz
aa429cf507 vulkan: fix OOB check in flash_attn_mask_opt (#20296) 2026-03-12 06:35:49 +01:00
Masato Nakasaka
5866e3bbc8 vulkan: Fix ErrorOutOfHostMemory on Intel GPU when loading large models with --no-mmap (#20059)
* Changed to reuse command buffers to fix crashing on Intel GPU

* Removed unused parameter

* Fixed compile error and minor mistake

* Fix logging

* Changing to use usage flag per command buffer

* fixed style

* added buffer reset

* Removed cmd_buffer_idx for reuse consistency

* Fixed style
2026-03-12 06:30:16 +01:00
lhez
0516e04bf9 opencl: use larger workgroup size for get_rows (#20316) 2026-03-11 22:03:27 -07:00
shaofeiqi
3d9ab225e7 opencl: add cumsum op (#18981)
* OpenCL: add CUMSUM op support

* remove unused argument

* opencl: refactor cumsum

* opencl: refactor

* opencl: refactor tmp buffer

* opencl: adjust max number of subgroups

* opencl: fix whitespace

* opencl: fix global size when cumsum the tmp buffer

---------

Co-authored-by: Li He <lih@qti.qualcomm.com>
2026-03-11 22:03:07 -07:00
uvos
d63aa398de hip: compile debug builds with -O2 on hip to avoid a compiler bug (#20392) 2026-03-12 10:37:10 +08:00
Mishusha
a8304b4d27 common/parser: add GigaChatV3/3.1 models support (#19931)
Co-authored-by: Mishusha <pmv26021975@gmail.com>
2026-03-12 01:22:25 +01:00
DAN™
fdb17643d3 model : add support for Phi4ForCausalLMV (#20168)
* Add support for Phi4ForCausalLMV.

* Fix Phi-4 vision parity (correcting SigLIP2 patch-kernel export layout) and matching HF NaFlex resize behavior in mtmd.

* Rename constants + fix tokenizer label

* Clean-ups.

* Fix GGUF export.

* Set tokenizer.ggml.pre explicitly.

* Default vocab name rather than forcing it.

* Clean-ups.

* Fix indent.

* Fix subscriptable error.

* remove overcomplicated code path

* Clean-ups.

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2026-03-12 00:25:54 +01:00
Richard Davison
1eea6a2968 graph : add optional scale parameter to build_lora_mm [no ci] (#20427) 2026-03-12 00:22:49 +01:00
ddh0
4a748b8f15 common : fix --n-cpu-moe, --cpu-moe for models with fused gate + up (#20416) 2026-03-12 00:13:28 +01:00
Masashi Yoshimura
f2ab047f27 ggml-webgpu: Add supports for GGML_OP_REPEAT (#20230)
* Add GGML_OP_REPEAT to webgpu backend.

* Add i16 support for GGML_OP_REPEAT.
2026-03-11 14:40:36 -07:00
Georgi Gerganov
d28961d81e llama : enable chunked fused GDN path (#20340)
* llama : enable chunked fused GDN path

* models : avoid Q and K repeats when using fused GDA

* cont : fix comment

Co-authored-by: Aman Gupta <amangupta052@gmail.com>

* cont : fix the fix

Co-authored-by: Aman Gupta <amangupta052@gmail.com>

* cont : fix

* metal : add GDN kernel (#20361)

* metal : add Metal backend for GGML_OP_GATED_DELTA_NET

Add a fused Metal kernel for the gated delta net recurrence op
(#19504), enabling GPU-accelerated inference for DeltaNet-based
models (Qwen3.5, etc.) on Apple Silicon.

Supports both GDA (scalar gate) and KDA (per-row gate) modes
with head_size 64 and 128. Unsupported configurations (head_size
32, non-contiguous tensors) gracefully fall back to CPU.

Performance: Qwen3.5-0.8B Q4_K_M on M4 Max
  tg128: 170 -> 213 t/s (+25%)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* metal : validate contiguity of all input tensors in supports_op

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* metal : add algorithm equivalence comment for GDA decay path

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* cont : unslop + optimize

* cont : clean-up

---------

Co-authored-by: Paul Flynn <paul@arkavo.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>

* CUDA: AR gated delta net improvements (#20391)

* Add FastDiv to gated_delta_net_cuda

* Shard columns across warps

This reduces register pressure (avoids spill for S_v = 128) and gives
the warp-scheduler more CTAs to schedule (thus hiding data-access
latencies).

* Remove unneded include in gated_delta_net.cu

* Improve comments

* Apply code-formating

* Make sharding HIP-compatible

1. Use ggml_cuda_get_physical_warp_size() to determine warp size flexibly
2. Add test with partial warp to test sum reduction on CUDA

* Remove fastdiv_s64, as we can treat neqk1 and rq3 as uint32_t

* Rename variables

* Enable GDN also for prefill, move TODO for chunked_GDN

* Actually remove the TODO from 2068908975

* Get warp size at runtime

warp_size is not known at compile time in hip host code.

* Don't expose ggml_cuda_get_physical_warp_size on host

---------

Co-authored-by: uvos <devnull@uvos.xyz>

* llama : refactor llm_build_delta_net_base API

---------

Co-authored-by: Aman Gupta <amangupta052@gmail.com>
Co-authored-by: Paul Flynn <paul@arkavo.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Oliver Simons <osimons@nvidia.com>
Co-authored-by: uvos <devnull@uvos.xyz>
2026-03-11 22:46:40 +02:00
Sigbjørn Skjæret
f90bd1dd84 llama : whitespace cleanup (#20422) 2026-03-11 21:18:29 +01:00
Richard Davison
5eae9cb1d9 ggml : add NVFP4 quantization type support (#19769)
* WIP: add NVFP4 quantization support

* tests

* improve NVFP4 dot product implementation performance and fix bad super call

* typo

* Use nvfp4 kvalues

* vulkan : fix NVFP4 shader compilation by including kvalues_mxfp4 lookup table

* vulkan and perf fixes

* wip

* Fix metal

* fix vulkan

* Rename threshold & fix wrong scale

* Fix MOE

* Shelf backend implementations (CUDA, Metal, Vulkan, arch-specific SIMD)

Remove NVFP4 support from GPU backends and architecture-specific
optimized dot products. These should be added in separate PRs so
backend specialists can review them independently.

Reverted files:
- ggml-cuda: common.cuh, convert.cu, mmq.cu/cuh, mmvq.cu, vecdotq.cuh,
  quantize.cu/cuh, mma.cuh, ggml-cuda.cu, fattn-tile.cuh
- ggml-metal: ggml-metal.metal, ggml-metal-device.cpp, ggml-metal-impl.h,
  ggml-metal-ops.cpp
- ggml-vulkan: ggml-vulkan.cpp, all vulkan-shaders/*
- ggml-cpu arch: arm/quants.c, x86/quants.c, powerpc/quants.c, s390/quants.c

Core NVFP4 support (type definition, CPU fallback dot product,
quantization, dequantization, conversion) is retained.

* Fix arch-fallback.h: add NVFP4 generic fallback for all platforms

After shelving backend-specific SIMD implementations, the generic
CPU dot product needs to be aliased on ARM, x86, PowerPC, and s390
platforms that previously relied on arch-specific versions.

* quantize: add NVFP4 as a quantization type option

* Fix ggml_fp32_to_ue4m3: handle subnormal values

Previously, values with ue4m3_exp <= 0 were clamped to 0, causing
all small scales to underflow. This made NVFP4 quantization via
llama-quantize produce garbage (PPL = 5.8M) since typical transformer
weights have amax/6.0 in the range 0.001-0.01, which falls in the
UE4M3 subnormal range.

Now subnormals are properly encoded as man * 2^-9 (exp=0, man=1..7),
matching the decode path in ggml_ue4m3_to_fp32.

Result: NVFP4 requantization now produces PPL = 15.25 (vs F16 = 14.33),
comparable to Q4_1 (PPL = 15.81) at slightly lower BPW (4.70 vs 5.15).

* Restore ARM NEON NVFP4 dot product implementation

Restores the optimized ggml_vec_dot_nvfp4_q8_0 for ARM NEON using
vqtbl1q_s8 lookup and ggml_vdotq_s32 dot products.

tg128 performance: 4.37 t/s (generic) -> 13.66 t/s (NEON) = 3.1x speedup

* Optimize ARM NEON NVFP4 dot product: LUT + vpaddq + vfmaq

- Add ue4m3_scale_lut[128] to ggml-common.h replacing branch-heavy
  ggml_ue4m3_to_fp32() in the hot loop
- Use vpaddq_s32 for pairwise int32 reduction instead of vaddvq_s32
- Accumulate with vfmaq_f32 into float32x4_t vector accumulators

tg128: 8.1 -> 31.0 t/s (3.8x speedup, 77% of Q4_1 speed)

* ARM NEON NVFP4: rearrange q8 to match nibble layout

Alternative approach: rearrange q8 data to match the NVFP4 lo/hi
nibble layout instead of rearranging the looked-up NVFP4 values.
Eliminates vcombine_s8(vget_low, vget_low) shuffles.

Performance is equivalent (~18.5 t/s) - the bottleneck is the 2x
block overhead from QK=16 vs QK=32, not the shuffle instructions.

* CPU only backend 64 super-block layout

* cleanup

* Remove unused LUT

* int

* exclude NVFP4 from unsupported ops in metal build

* remove quantization for now

* store scales as native UE4M3, preserve original model bits when possible

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* correct comment

* format

* reduce duplication and cleanup

* Address comments

* move detection to prepare_tensors

* Use math instead of const

* Move

* fix comment

* Shelf quantize tests

* Rebase and move check

* cleanup

* lint

* Update gguf-py/gguf/scripts/gguf_convert_endian.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Use fallback quant config

* Simplify

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* organize

* Refactor

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* add quantize_nvfp4 (required for test_quants.py)

* add quantize_nvfp4 (required for test_quants.py)

* add quantize_nvfp4 (required for test_quants.py)

* fix return type

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-03-11 21:02:54 +01:00
Georgi Gerganov
3ca19b0e9f benches : add nemotron super (#20420) 2026-03-11 21:39:40 +02:00
Daniel Bevenius
eaf1d7930c llama : add support for Nemotron 3 Super (#20411)
* llama : add support for Nemotron 3 Super

This commit adds support for the Nemotron 3 Super model (120B.A12B)
enabling this model to be converted to GGUF format and run in llama.cpp.

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Matt Clayton <156335168+mattjcly@users.noreply.github.com>
2026-03-11 19:27:53 +01:00
Georgi Gerganov
76ea1c1c46 metal : fix capture_compute counter logic (#20410) 2026-03-11 18:38:22 +02:00
Aman Gupta
bd1ec818e9 compare-llama-bench: check remotes as well (#20406) 2026-03-12 00:14:42 +08:00
Georgi Gerganov
b541241104 metal : fix q5_k mul_mv register spill (#20399) 2026-03-11 16:25:27 +02:00
Georgi Gerganov
c363256839 metal : add env var to trigger graph capture (#20398) 2026-03-11 16:25:10 +02:00
Neo Zhang
ecac98ee53 [SYCL] Update SYCL.md for binary package for Windows (#20401)
* add download binary package

* update prefix
2026-03-11 22:21:22 +08:00
Ruben Ortlam
182acfe5c5 ci: disable coopmat on ubuntu-24-cmake-vulkan job (#20294) 2026-03-11 14:12:29 +01:00
Aldehir Rojas
b5fe4559ae common/parser: use nlohmann::ordered_json to preserve parameter order (#20385) 2026-03-11 10:26:51 +01:00
Piotr Wilkin (ilintar)
acb7c79069 common/parser: handle reasoning budget (#20297)
* v1

* Finished!

* Handle CLI

* Reasoning sampler

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Less explosive terminology :)

* Add utf-8 case and tests

* common : migrate reasoning budget sampler to common

* cont : clean up

* cont : expose state and allow passing as initial state

* cont : remove unused imports

* cont : update state machine doc string

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Co-authored-by: Alde Rojas <hello@alde.dev>
2026-03-11 10:26:12 +01:00
uvos
5f91b1d5d5 ggml-cuda: gdn use shared mem for HIP (#20366)
Suggested-by: Aman Gupta <amangupta052@gmail.com>
2026-03-11 13:06:19 +08:00
uvos
9ef7523ee9 cuda/hip: fix loop unrolling in ssm-conv (#20369) 2026-03-11 13:04:32 +08:00
Pascal
00de615345 Fix agentic mcp image single model (#20339)
* webui: fix MCP image attachments dropped during the agentic loop in single-model mode

* chore: update webui build output
2026-03-11 05:31:33 +01:00
Alessandro de Oliveira Faria (A.K.A.CABELO)
e1a399992b vendor : update cpp-httplib to 0.37.0 (#20207) 2026-03-11 11:03:53 +08:00
Alessandro de Oliveira Faria (A.K.A.CABELO)
4f2f0a163d vendor : update miniaudio to 0.11.25 (#20209) 2026-03-11 11:01:56 +08:00
Neo Zhang
0cec84f999 fix op rope, add rope_back (#20293) 2026-03-11 09:53:34 +08:00
Neo Zhang
b2e1427c9b fix for failed UT case: ACC, L2_NORM, UPSCALE, fused_glu, unary (#20283) 2026-03-11 09:53:05 +08:00
Vinicios Lugli
4d99d45084 model : qwen3vl reranker text support (#20332)
* model : fix qwen3vl reranker support

* Remove CLS_OUT

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-03-10 23:40:14 +01:00
ddh0
10e5b148b0 llama-quant : correct n_attention_wv usage (#20357)
* llama-quant : correct `n_attention_wv` usage

In #19770, I introduced a regression in the way the
`quantize_state_impl` counter values were initialized. I was
incrementing and using `n_attention_wv` in the same loop, when it should
have been fixed by the time we're deciding tensor types in
`llama_tensor_get_type_impl` (for `use_more_bits`).

I never observed a difference in any of [my
tests](https://github.com/ggml-org/llama.cpp/pull/19770#issuecomment-4000424712)
- it was only after @bartowski kindly pointed this out that I realized
it was incorrect. (Thanks!)

* simplify
2026-03-10 21:43:29 +02:00
Georgi Gerganov
90b2731894 ggml : bump RPC version (#20330) 2026-03-10 21:36:57 +02:00
Reese Levine
aa2d278a11 ggml webgpu: faster normal quant and some k-quant matrix operations, better shader parameter handling (#20173)
* K quant speedup (#20)

* Basic JIT compilation for mul_mat, get_rows, and scale (#17)

* scale jit working

* preliminary working jit for getrows and mulmat, needs refining

* simplified mul_mat preprocessing switch statement

* get_rows fixes, mul_mat refinement

* formatted + last edits

* removed some extraneous prints

* fixed get_rows, fixed workgroup dispatch in mul_mat. no gibberish

* small fix

* some changes, working

* get_rows and mul_mat jit fixed and working

* Update formatting

* formatting

* Add header

---------

Co-authored-by: Neha Abbas <nehaabbas@ReeseLevines-MacBook-Pro.local>
Co-authored-by: Reese Levine <reeselevine1@gmail.com>

* Start work on all-encompassing shader library

* refactor argmax, set_rows

* Refactor all but flashattention, mat mul

* no gibberish, all k quants added, merged

* vec memory fix

* q6_k matching metal on my machine, tests passing

* Set tile size for q6_k separately

* Separate out fast shaders

---------

Co-authored-by: neha-ha <137219201+neha-ha@users.noreply.github.com>

* Move towards writeBuffer for params

* Move away from multiple buffers for set_rows errors, remove host buffer for parameter buffers, minor cleanups

* Remove extra file

* Formatting

---------

Co-authored-by: neha-ha <137219201+neha-ha@users.noreply.github.com>
2026-03-10 09:14:27 -07:00
Piotr Wilkin (ilintar)
6c770d16ca Reduce level of content parser warning message to avoid log spam on non-debug verbosity (#20347) 2026-03-10 15:21:51 +01:00
Ray Xu
8d880ac012 examples : fix empty items in json_schema_to_grammar.py [no ci] (#19968)
* Fix logic for retrieving schema items in `json_schema_to_grammar.py`

If `schema['items']` is `{}` and `'prefixItems'` is not in `schema`, then, as `{}` is Falsy, the original code here will raise an error.

I think if `schema['items']` is `{}`, then items should just be `{}`

* Apply suggestion from @CISC

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Add tests for arrays with empty items

Add two unit tests to `tests/test-json-schema-to-grammar.cpp` that validate handling of arrays when 'items' is an empty schema and when 'prefixItems' is present alongside an empty 'items'. Both tests expect the same generated grammar, ensuring the JSON Schema->grammar conversion treats an empty 'items' schema (and the presence of 'prefixItems') correctly and covering this edge case.

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-03-10 14:38:18 +01:00
a3894281
0f1e9d14cc docs: update CPU backend ops to mark POOL_1D as supported (#20304) 2026-03-10 21:31:24 +08:00
Georgi Gerganov
1274fbee9e models : fix assert in mamba2 (cont) (#20335)
* models : fix assert in mamba2 (cont)

* cont : add n_group mod

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-03-10 15:00:08 +02:00
Georgi Gerganov
a7b3dee7a5 server : make 2 checkpoints near the end of the prompt (#20288)
* server : make 2 checkpoints near the end of the prompt

* cont : adjust checkpoints
2026-03-10 14:28:23 +02:00
Sigbjørn Skjæret
ec947d2b16 common : fix incorrect uses of stoul (#20313) 2026-03-10 11:40:26 +01:00
Charles Xu
0cd4f4720b kleidiai : support for concurrent sme and neon kernel execution (#20070) 2026-03-10 09:25:25 +02:00
Taimur Ahmad
af237f3026 ggml-cpu: add RVV repack GEMM and GEMV for quantization types (#19121)
* ggml-cpu: add rvv ggml_quantize_mat_4x8 for q8_0

Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>

* ggml-cpu: add rvv repacking for iq4_nl

* ggml-cpu: add generic impl for iq4_nl gemm/gemv

* ggml-cpu: add rvv repacking for q8_0

* ggml-cpu: refactor; add rvv repacking for q4_0, q4_K

* ggml-cpu: refactor; add rvv repacking for q2_K

Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>

* ggml-cpu: refactor rvv repack

---------

Co-authored-by: Rehan Qasim <rehan.qasim@10xengineers.ai>
2026-03-10 08:49:52 +02:00
Julian Pscheid
1a5631beaa metal: handle command buffer failures gracefully in synchronize (#20306)
Replace GGML_ABORT("fatal error") in ggml_metal_synchronize() with
error flag + return. This aligns synchronize error handling with
graph_compute, which already returns GGML_STATUS_FAILED for the same
condition.

When a command buffer fails (e.g., iOS GPU access revocation during
backgrounding, macOS eGPU disconnect, OOM), the backend enters an
error state instead of killing the host process. Subsequent
graph_compute calls return GGML_STATUS_FAILED immediately. Recovery
requires recreating the backend.

Failed extra command buffers are properly released on the error path
to avoid Metal object leaks.
2026-03-10 08:32:24 +02:00
ddh0
1dab5f5a44 llama-quant : fail early on missing imatrix, refactor type selection, code cleanup (#19770)
* quantize : imatrix-fail early + code cleanup

* fix manual override printing

it's in the preliminary loop now, so needs to be on its own line

* revert header changes per ggerganov

* remove old #includes

* clarify naming

rename `tensor_quantization` to `tensor_typo_option` to describe its
functionality

* fix per barto
2026-03-10 08:16:05 +02:00
Aldehir Rojas
c96f608d98 common: consolidate PEG string parsers (#20263)
* common : consolidate PEG string parsers
* cont : fix json_string_content()
2026-03-10 00:29:21 +01:00
Xuan-Son Nguyen
0842b9b465 model: fix step3.5 n_rot (#20318) 2026-03-09 23:42:24 +01:00
Xuan-Son Nguyen
59db9a357d llama: dynamic head_dim and n_rot for SWA (#20301)
* llama: dynamic head_dim and n_rot for SWA

* also add gguf_writer wrappers

* fix build

* build_rope_shift arg reorder
2026-03-09 22:22:39 +01:00
Evan Huus
23fbfcb1ad server: Parse port numbers from MCP server URLs in CORS proxy (#20208)
* Parse port numbers from MCP server URLs

* Pass scheme to http proxy for determining whether to use SSL

* Fix download on non-standard port and re-add port to logging

* add test

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2026-03-09 17:47:54 +01:00
Paul Flynn
e22cd0aa15 metal : extend mul_mv_ext to BF16, Q2_K, Q3_K (#20250)
Enable mul_mv_ext small-batch kernels (BS 2-8) for BF16, Q2_K,
and Q3_K quantization types. These types previously fell through
to the slower single-row mul_mv path.

BF16 uses the float4 dequantize path (like F16). Q2_K and Q3_K
use the float4x4 K-quant path (like Q4_K/Q5_K/Q6_K).

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 16:48:12 +02:00
Georgi Gerganov
96cfc4992c server : fix checkpoints n_tokens calculation (#20287) 2026-03-09 16:47:06 +02:00
Georgi Gerganov
ed0007aa32 metal : add upscale (#20284) 2026-03-09 16:45:11 +02:00
Georgi Gerganov
344ee2a38a server : warn swa-full is not supported for non-SWA models (#20291) 2026-03-09 16:44:25 +02:00
Georgi Gerganov
d6e1556499 server : fix off-by-1 in server_tokens::size_up_to_pos() (#20279)
* server : fix off-by-1 in server_tokens::size_up_to_pos()

* cont : fix typo [no ci]
2026-03-09 16:43:38 +02:00
Piotr Wilkin (ilintar)
f76565db92 common: map developer role to system (#20215)
* Map developer role to system
* Simplify
2026-03-09 14:25:11 +01:00
Georgi Gerganov
43e1cbd6c1 models : fix assert in mamba2 graph (#20270) 2026-03-09 13:15:15 +02:00
Georgi Gerganov
107d599952 server : add kill switch when server is stuck (#20277) 2026-03-09 10:33:12 +02:00
Aman Gupta
e8bbc736cb ggml-cuda: disable gdn for musa (#20278) 2026-03-09 16:15:36 +08:00
ddh0
b518195101 llama-quant : left-align tensor names in output (#20117) 2026-03-09 09:28:41 +02:00
Aman Gupta
e2763a6723 contributing: limit open PRs for new contributors to 1 (#20036) 2026-03-09 15:05:34 +08:00
Bertay Eren
0beb8db3a0 ggml-vulkan: add SGN operator, auto-generate Vulkan.csv and ops.md (#20219) 2026-03-09 07:24:16 +01:00
Ruben Ortlam
b2f460bd3c vulkan: skip zero size tensors in backend copies (#20233) 2026-03-09 07:23:45 +01:00
Michael Huang
5f4cdac385 cuda : display total and free VRAM capacity during device initialization (#20185) 2026-03-09 12:45:43 +08:00
Aaron Teo
ae87863dc1 llama-bench: introduce -hf and -hff flags & use --mmap 1 by default (#20211) 2026-03-09 09:05:44 +08:00
Piotr Wilkin (ilintar)
97c64fbdbd PEG parser for LFM2 (#20251)
* PEG parser for LFM2

* Simplify using python_value()
2026-03-09 01:11:22 +01:00
Georgi Gerganov
d417bc43dd server : do not create checkpoints right after mtmd chunks (#20232) 2026-03-08 22:16:46 +02:00
Sigbjørn Skjæret
35bee031e1 graph : remove redundant scale_w parameter (#20235) 2026-03-08 18:58:28 +01:00
Aldehir Rojas
451ef08432 common : gracefully handle incomplete output (#20191)
* common : handle incomplete UTF-8 at end of input in PEG parser

* cont : if reached end prematurely, emit needs_more_input to propagate partial output

* cont: refactor peg parse context to add lenient flag

* cont : remove partial flag, keep lenient flag
2026-03-08 17:17:02 +01:00
Piotr Wilkin (ilintar)
9b24886f78 Fix compile bug (#20203)
* Fix compile bug

* Update common/chat-auto-parser-helpers.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-03-08 17:15:49 +01:00
Piotr Wilkin (ilintar)
62b8143ad2 Fix structured outputs (#20223)
* Fix structured outputs

* Update common/chat-auto-parser-generator.cpp

Co-authored-by: Aldehir Rojas <hello@alde.dev>

---------

Co-authored-by: Aldehir Rojas <hello@alde.dev>
2026-03-08 17:14:43 +01:00
GiantPrince
d088d5b74f ggml-vulkan: Add ELU op support (#20183)
* ggml-Vulkan: add ELU support

* ggml-Vulkan: remove extra spaces and variables

* ggml-Vulkan: fix format issue

* ggml-Vulkan: fix format issue

* fix whitespace issue

* Update Vulkan.csv and ops.md
2026-03-08 12:38:17 +01:00
Jeff Bolz
cd18a50ea5 vulkan: Fix data races in coopmat1 mul_mat(_id) (#20084)
* vulkan: Fix data races in coopmat1 mul_mat(_id)

Add barriers between coopmat store and regular loads. We sort of got away with
this because it was the same subgroup accessing the values, but it's still a
race and may not work.

* switch to subgroup control barriers
2026-03-08 12:33:48 +01:00
Johannes Gäßler
a976ff081b llama: end-to-end tests (#19802)
* tests: add end-to-end tests per model architecture

* fixup for rebase

* fix use-after-free in llama-model-loader.cpp

* fix CI

* fix WebGPU

* fix CI

* disable CI for macOS-latest-cmake-arm64

* use expert_weights_scale only if != 0.0f

* comments
2026-03-08 12:30:21 +01:00
Christopher Maher
a95047979a readme : update infra list (#20212) 2026-03-08 12:42:28 +02:00
Piotr Wilkin (ilintar)
b283f6d5b3 Revert to OAI-compatible args (#20213)
* Revert to OAI-compatible args

* Apply workaround::func_args_not_string
2026-03-08 11:33:03 +01:00
decahedron1
ff52ee964d server : correct index on finish in OAI completion streams (#20226) 2026-03-08 10:08:57 +01:00
Neo Zhang
213c4a0b81 [SYCL] supprt Flash Attention for fp32/fp16/Q4/Q5/Q8 (#20190)
* support flash-attention for fp32/fp16/Q4/Q5/Q8

* rm warning

* update for JIT
2026-03-08 12:00:07 +08:00
Aman Gupta
c5a778891b ggml: add GATED_DELTA_NET op (#19504)
* ggml: add GATED_DELTA_NET op

* remove the transpose

* add KDA

* add qwen35 dense

* llama : check for fused gated delta net backend support

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-03-07 15:41:10 +08:00
lhez
6fce5c6a7d opencl: add l2_norm (#20160) 2026-03-06 18:03:05 -08:00
Piotr Wilkin (ilintar)
c024d85908 Autoparser: True streaming (#20177)
* Relax atomicity constraint for nicer, more pleasant, True Streaming parsing

* Whitespace

* Remove redundant atomics
2026-03-07 01:55:33 +01:00
421 changed files with 38916 additions and 27522 deletions

138
.devops/openvino.Dockerfile Normal file
View File

@@ -0,0 +1,138 @@
ARG OPENVINO_VERSION_MAJOR=2026.0
ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
ARG UBUNTU_VERSION=24.04
# Optional proxy build arguments - empty by default
ARG http_proxy=
ARG https_proxy=
## Build Image
FROM ubuntu:${UBUNTU_VERSION} AS build
# Pass proxy args to build stage
ARG http_proxy
ARG https_proxy
RUN apt-get update && \
apt-get install -y --no-install-recommends \
ca-certificates \
gnupg \
wget \
git \
cmake \
ninja-build \
build-essential \
libtbb12 \
libssl-dev \
ocl-icd-opencl-dev \
opencl-headers \
opencl-clhpp-headers \
intel-opencl-icd && \
rm -rf /var/lib/apt/lists/*
# Install OpenVINO for Ubuntu 24.04
ARG OPENVINO_VERSION_MAJOR
ARG OPENVINO_VERSION_FULL
RUN mkdir -p /opt/intel && \
wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
cd - && \
ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
ENV OpenVINO_DIR=/opt/intel/openvino
WORKDIR /app
COPY . .
# Build Stage
RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON && \
cmake --build build/ReleaseOV -j$(nproc)"
# Copy all necessary libraries
RUN mkdir -p /app/lib && \
find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \
find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \
find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \;
# Create runtime directories and copy binaries
RUN mkdir -p /app/full \
&& cp build/ReleaseOV/bin/* /app/full/ \
&& cp *.py /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh
## Base Runtime Image
FROM ubuntu:${UBUNTU_VERSION} AS base
# Pass proxy args to runtime stage
ARG http_proxy
ARG https_proxy
RUN apt-get update \
&& apt-get install -y libgomp1 libtbb12 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete
COPY --from=build /app/lib/ /app/
### Full (all binaries)
FROM base AS full
ARG http_proxy
ARG https_proxy
COPY --from=build /app/full /app/
WORKDIR /app
RUN apt-get update && \
apt-get install -y --no-install-recommends \
git \
python3 \
python3-venv \
python3-pip && \
python3 -m venv /ov-venv && \
/ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
/ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete
ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app/
WORKDIR /app
ENTRYPOINT [ "/app/llama-cli" ]
### Server, Server only
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app/
WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

View File

@@ -53,10 +53,11 @@ RUN apt-get update \
&& apt-get install -y \
build-essential \
git \
python3 \
python3-dev \
python3.13 \
python3.13-dev \
python3-pip \
python3-wheel \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.13 100 \
&& pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \

View File

@@ -0,0 +1,25 @@
name: "Linux - Setup OpenVINO Toolkit"
description: "Setup OpenVINO Toolkit for Linux"
inputs:
path:
description: "Installation path"
required: true
version_major:
description: "OpenVINO major version (e.g., 2025.3)"
required: true
version_full:
description: "OpenVINO full version (e.g., 2025.3.0.19807.44526285f24)"
required: true
runs:
using: "composite"
steps:
- name: Setup OpenVINO Toolkit
id: setup
uses: ./.github/actions/unarchive-tar
with:
url: https://storage.openvinotoolkit.org/repositories/openvino/packages/${{ inputs.version_major }}/linux/openvino_toolkit_ubuntu24_${{ inputs.version_full }}_x86_64.tgz
path: ${{ inputs.path }}
type: z
strip: 1

View File

@@ -63,6 +63,34 @@ jobs:
path: ./spacemit_toolchain
version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
ubuntu-24-openvino-cache:
runs-on: ubuntu-24.04
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Setup Cache
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
windows-2022-rocm-cache:
runs-on: windows-2022

View File

@@ -5,7 +5,7 @@ on:
jobs:
linux:
runs-on: ubuntu-24.04
runs-on: ubuntu-slim
steps:
- uses: actions/checkout@v6
with:
@@ -14,7 +14,7 @@ jobs:
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y build-essential tcl
sudo apt install -y build-essential tcl cmake
- name: Build
run: |

View File

@@ -142,7 +142,7 @@ jobs:
# cmake --build build --config Release -j $(nproc)
debian-13-loongarch64-cpu-cross:
runs-on: ubuntu-24.04
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
steps:
@@ -197,7 +197,7 @@ jobs:
cmake --build build --config Release -j $(nproc)
debian-13-loongarch64-vulkan-cross:
runs-on: ubuntu-24.04
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
steps:

250
.github/workflows/build-self-hosted.yml vendored Normal file
View File

@@ -0,0 +1,250 @@
name: CI (self-hosted)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-self-hosted.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cu',
'**/*.cuh',
'**/*.swift',
'**/*.m',
'**/*.metal',
'**/*.comp',
'**/*.glsl',
'**/*.wgsl'
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-self-hosted.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cu',
'**/*.cuh',
'**/*.swift',
'**/*.m',
'**/*.metal',
'**/*.comp',
'**/*.glsl',
'**/*.wgsl'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ggml-ci-nvidia-cuda:
runs-on: [self-hosted, Linux, NVIDIA]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
nvidia-smi
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-nvidia-vulkan-cm:
runs-on: [self-hosted, Linux, NVIDIA]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-nvidia-vulkan-cm2:
runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-cpu-amx:
runs-on: [self-hosted, Linux, CPU, AMX]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
# ggml-ci-amd-vulkan:
# runs-on: [self-hosted, Linux, AMD]
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
# - name: Test
# id: ggml-ci
# run: |
# vulkaninfo --summary
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
# ggml-ci-amd-rocm:
# runs-on: [self-hosted, Linux, AMD]
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
# - name: Test
# id: ggml-ci
# run: |
# amd-smi static
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-mac-metal:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-mac-webgpu:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Dawn Dependency
id: dawn-depends
run: |
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
curl -L -o artifact.zip \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
mkdir dawn
unzip artifact.zip
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
- name: Test
id: ggml-ci
run: |
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-mac-vulkan:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-linux-intel-vulkan:
runs-on: [self-hosted, Linux, Intel]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
persist-credentials: false
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-intel-openvino-gpu-low-perf:
runs-on: [self-hosted, Linux, Intel, OpenVINO]
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Use OpenVINO Toolkit Cache
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenVINO dependencies
run: |
cd ./openvino_toolkit
chmod +x ./install_dependencies/install_openvino_dependencies.sh
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
- name: Test
id: ggml-ci
run: |
source ./openvino_toolkit/setupvars.sh
GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

View File

@@ -93,7 +93,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ctest -L main -E "test-llama-archs" --verbose --timeout 900
macOS-latest-cmake-x64:
runs-on: macos-15-intel
@@ -317,8 +317,8 @@ jobs:
cd build
ctest -L main --verbose --timeout 900
ubuntu-latest-llguidance:
runs-on: ubuntu-latest
ubuntu-24-llguidance:
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
steps:
- name: Clone
@@ -345,8 +345,8 @@ jobs:
cd build
ctest -L main --verbose --timeout 900
ubuntu-latest-cmake-rpc:
runs-on: ubuntu-latest
ubuntu-24-cmake-rpc:
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
continue-on-error: true
@@ -355,12 +355,6 @@ jobs:
id: checkout
uses: actions/checkout@v6
# - name: ccache
# uses: ggml-org/ccache-action@v1.2.16
# with:
# key: ubuntu-latest-cmake-rpc
# evict-old-files: 1d
- name: Dependencies
id: depends
run: |
@@ -381,20 +375,13 @@ jobs:
ctest -L main --verbose
ubuntu-24-cmake-vulkan-deb:
runs-on: ubuntu-24.04
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-24-cmake-vulkan-deb
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
@@ -469,6 +456,7 @@ jobs:
cd build
export GGML_VK_VISIBLE_DEVICES=0
export GGML_VK_DISABLE_F16=1
export GGML_VK_DISABLE_COOPMAT=1
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 4800
@@ -541,23 +529,16 @@ jobs:
run: |
cd build
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 3600
ctest -L main --verbose --timeout 900
ubuntu-24-wasm-webgpu:
runs-on: ubuntu-24.04
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-latest-wasm-webgpu
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Install Emscripten
run: |
git clone https://github.com/emscripten-core/emsdk.git
@@ -743,6 +724,83 @@ jobs:
-DGGML_SYCL_F16=ON
cmake --build build --config Release -j $(nproc)
ubuntu-24-cmake-openvino:
name: ubuntu-24-cmake-openvino-${{ matrix.openvino_device }}
strategy:
matrix:
include:
- variant: cpu
runner: '"ubuntu-24.04"'
openvino_device: "CPU"
- variant: gpu
runner: '["self-hosted","Linux","X64","Intel"]'
openvino_device: "GPU"
runs-on: ${{ fromJSON(matrix.runner) }}
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-24-cmake-openvino-${{ matrix.variant }}-no-preset-v1
evict-old-files: 1d
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
- name: Use OpenVINO Toolkit Cache
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenVINO dependencies
run: |
cd ./openvino_toolkit
chmod +x ./install_dependencies/install_openvino_dependencies.sh
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
- name: Build
id: cmake_build
run: |
source ./openvino_toolkit/setupvars.sh
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: Test
id: cmake_test
# TODO: fix and re-enable the `test-llama-archs` test below
run: |
cd ${{ github.workspace }}
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
export GGML_OPENVINO_DEVICE=GPU
fi
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
build-linux-cross:
uses: ./.github/workflows/build-linux-cross.yml
@@ -1588,144 +1646,6 @@ jobs:
run: |
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
ggml-ci-x64-nvidia-cuda:
runs-on: [self-hosted, Linux, X64, NVIDIA]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
nvidia-smi
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-x64-nvidia-vulkan-cm:
runs-on: [self-hosted, Linux, X64, NVIDIA]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-x64-nvidia-vulkan-cm2:
runs-on: [self-hosted, Linux, X64, NVIDIA, COOPMAT2]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-x64-cpu-amx:
runs-on: [self-hosted, Linux, X64, CPU, AMX]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
# ggml-ci-x64-amd-vulkan:
# runs-on: [self-hosted, Linux, X64, AMD]
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
# - name: Test
# id: ggml-ci
# run: |
# vulkaninfo --summary
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
# ggml-ci-x64-amd-rocm:
# runs-on: [self-hosted, Linux, X64, AMD]
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v6
# - name: Test
# id: ggml-ci
# run: |
# amd-smi static
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-mac-metal:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-mac-webgpu:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Dawn Dependency
id: dawn-depends
run: |
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
curl -L -o artifact.zip \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
mkdir dawn
unzip artifact.zip
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
- name: Test
id: ggml-ci
run: |
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-mac-vulkan:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-arm64-cpu-kleidiai:
runs-on: ubuntu-22.04-arm

View File

@@ -47,6 +47,7 @@ jobs:
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
- { tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
- { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
steps:
- name: Check out the repo
uses: actions/checkout@v6

View File

@@ -231,6 +231,86 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
name: llama-bin-ubuntu-vulkan-x64.tar.gz
ubuntu-24-openvino:
runs-on: ubuntu-24.04
outputs:
openvino_version: ${{ steps.openvino_version.outputs.value }}
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
steps:
- name: Set OpenVINO version output
id: openvino_version
run: echo "value=${{ env.OPENVINO_VERSION_MAJOR }}" >> $GITHUB_OUTPUT
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-24-cmake-openvino-release-no-preset-v1
evict-old-files: 1d
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
sudo apt install ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
- name: Use OpenVINO Toolkit Cache
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenVINO dependencies
run: |
cd ./openvino_toolkit
chmod +x ./install_dependencies/install_openvino_dependencies.sh
echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
- name: Build
id: cmake_build
run: |
source ./openvino_toolkit/setupvars.sh
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
cmake --build build/ReleaseOV --config Release -j $(nproc)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
run: |
cp LICENSE ./build/ReleaseOV/bin/
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/ReleaseOV/bin .
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
windows-cpu:
runs-on: windows-2025
@@ -872,7 +952,7 @@ jobs:
permissions:
contents: write # for creating release
runs-on: ubuntu-latest
runs-on: ubuntu-slim
needs:
- windows
@@ -883,6 +963,7 @@ jobs:
- ubuntu-22-rocm
- ubuntu-22-cpu
- ubuntu-22-vulkan
- ubuntu-24-openvino
- macOS-arm64
- macOS-x64
- ios-xcode-build
@@ -967,6 +1048,7 @@ jobs:
- [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
- [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
**Windows:**
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)

View File

@@ -1,4 +1,4 @@
name: Server-Metal
name: Server (self-hosted)
on:
workflow_dispatch: # allows manual triggering
@@ -14,7 +14,7 @@ on:
push:
branches:
- master
paths: ['.github/workflows/server-metal.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
paths: ['.github/workflows/server-self-hosted.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
env:
LLAMA_LOG_COLORS: 1
@@ -28,7 +28,7 @@ concurrency:
jobs:
server-metal:
runs-on: [self-hosted, macOS, ARM64]
runs-on: [self-hosted, llama-server, macOS, ARM64]
name: server-metal (${{ matrix.wf_name }})
strategy:
@@ -71,3 +71,42 @@ jobs:
pip install -r requirements.txt
export ${{ matrix.extra_args }}
pytest -v -x -m "not slow"
server-cuda:
runs-on: [self-hosted, llama-server, Linux, NVIDIA]
name: server-cuda (${{ matrix.wf_name }})
strategy:
matrix:
build_type: [Release]
wf_name: ["GPUx1"]
include:
- build_type: Release
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
wf_name: "GPUx1, backend-sampling"
fail-fast: false
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Build
id: cmake_build
run: |
cmake -B build -DGGML_SCHED_NO_REALLOC=ON
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
run: |
cd tools/server/tests
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
export ${{ matrix.extra_args }}
pytest -v -x -m "not slow"

View File

@@ -29,7 +29,7 @@ concurrency:
jobs:
webui-check:
name: WebUI Checks
runs-on: ubuntu-latest
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
continue-on-error: true
steps:
- name: Checkout code

5
.gitignore vendored
View File

@@ -124,6 +124,11 @@ poetry.toml
# Scripts
!/scripts/install-oneapi.bat
# Generated by scripts
/hellaswag_val_full.txt
/winogrande-debiased-eval.csv
/wikitext-2-raw/
# Test models for lora adapters
/lora-tests

View File

@@ -74,6 +74,7 @@
/ggml/src/ggml-virtgpu/ @kpouget
/ggml/src/ggml-webgpu/ @reeselevine
/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
/ggml/src/ggml-openvino/ @cavusmustafa @wine99
/ggml/src/ggml.c @ggerganov
/ggml/src/ggml.cpp @ggerganov
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky

View File

@@ -30,15 +30,21 @@ Before submitting your PR:
- Search for existing PRs to prevent duplicating efforts
- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
- Test your changes:
- Execute [the full CI locally on your machine](ci/README.md) before publishing
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Execute [the full CI locally on your machine](ci/README.md) before publishing
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Create separate PRs for each feature or fix:
- Avoid combining unrelated changes in a single PR
- For intricate features, consider opening a feature request first to discuss and align expectations
- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
- Avoid combining unrelated changes in a single PR
- For intricate features, consider opening a feature request first to discuss and align expectations
- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
- In particular, adding new data types (extension of the `ggml_type` enum) carries with it a disproportionate maintenance burden. As such, to add a new quantization type you will need to meet the following *additional* criteria *at minimum*:
- convert a small model to GGUF using the new type and upload it to HuggingFace
- provide [perplexity](https://github.com/ggml-org/llama.cpp/tree/master/tools/perplexity) comparisons to FP16/BF16 (whichever is the native precision) as well as to types of similar size
- provide KL divergence data calculated vs. the FP16/BF16 (whichever is the native precision) version for both the new type as well as types of similar size
- provide [performance data](https://github.com/ggml-org/llama.cpp/tree/master/tools/llama-bench) for the new type in comparison to types of similar size on pure CPU
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If you are a new contributor, limit your open PRs to 1.
After submitting your PR:
- Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability

View File

@@ -259,6 +259,8 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
- [llmaz](https://github.com/InftyAI/llmaz) - ☸️ Easy, advanced inference platform for large language models on Kubernetes.
- [LLMKube](https://github.com/defilantech/llmkube) - Kubernetes operator for llama.cpp with multi-GPU and Apple Silicon Metal support
</details>
<details>
@@ -277,6 +279,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [BLAS](docs/build.md#blas-build) | All |
| [BLIS](docs/backend/BLIS.md) | All |
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
| [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs |
| [MUSA](docs/build.md#musa) | Moore Threads GPU |
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
| [HIP](docs/build.md#hip) | AMD GPU |

View File

@@ -0,0 +1,72 @@
# NVIDIA DGX Spark
## System info
```bash
uname --all
Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux
g++ --version
g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
nvidia-smi
Fri Mar 6 11:39:45 2026
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |
+-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GB10 On | 0000000F:01:00.0 Off | N/A |
| N/A 52C P0 13W / N/A | Not Supported | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
```
## ggml-org/nemotron-3-super-120b-GGUF
Model: https://huggingface.co/ggml-org/nemotron-3-super-120b-GGUF
- `llama-batched-bench`
main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = 99, n_threads = 20, n_threads_batch = 20
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 512 | 32 | 1 | 544 | 1.094 | 468.05 | 1.621 | 19.74 | 2.715 | 200.37 |
| 512 | 32 | 2 | 1088 | 1.463 | 700.16 | 2.437 | 26.26 | 3.900 | 279.01 |
| 512 | 32 | 4 | 2176 | 2.647 | 773.76 | 4.043 | 31.66 | 6.689 | 325.29 |
| 512 | 32 | 8 | 4352 | 5.291 | 774.14 | 6.151 | 41.62 | 11.442 | 380.37 |
| 512 | 32 | 16 | 8704 | 10.603 | 772.62 | 10.385 | 49.30 | 20.987 | 414.72 |
| 512 | 32 | 32 | 17408 | 21.231 | 771.69 | 18.235 | 56.16 | 39.466 | 441.09 |
| 4096 | 32 | 1 | 4128 | 5.340 | 767.05 | 1.616 | 19.81 | 6.956 | 593.47 |
| 4096 | 32 | 2 | 8256 | 10.673 | 767.55 | 2.454 | 26.08 | 13.127 | 628.94 |
| 4096 | 32 | 4 | 16512 | 21.348 | 767.46 | 4.072 | 31.44 | 25.420 | 649.57 |
| 4096 | 32 | 8 | 33024 | 42.714 | 767.15 | 6.277 | 40.78 | 48.991 | 674.08 |
| 4096 | 32 | 16 | 66048 | 85.385 | 767.54 | 10.596 | 48.32 | 95.981 | 688.14 |
| 4096 | 32 | 32 | 132096 | 170.819 | 767.32 | 18.619 | 55.00 | 189.437 | 697.31 |
| 8192 | 32 | 1 | 8224 | 10.690 | 766.32 | 1.619 | 19.76 | 12.310 | 668.10 |
| 8192 | 32 | 2 | 16448 | 21.382 | 766.24 | 2.467 | 25.94 | 23.850 | 689.65 |
| 8192 | 32 | 4 | 32896 | 42.782 | 765.92 | 4.098 | 31.23 | 46.881 | 701.69 |
| 8192 | 32 | 8 | 65792 | 85.582 | 765.77 | 6.368 | 40.20 | 91.951 | 715.52 |
| 8192 | 32 | 16 | 131584 | 171.066 | 766.21 | 10.774 | 47.52 | 181.840 | 723.62 |
| 8192 | 32 | 32 | 263168 | 342.140 | 766.19 | 18.969 | 53.98 | 361.109 | 728.78 |
- `llama-bench`
| model | size | params | backend | n_ubatch | fa | test | t/s |
| ----------------------- | ---------: | ---------: | ---------- | -------: | -: | --------------: | -------------------: |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 | 768.84 ± 0.90 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 | 19.94 ± 0.16 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 @ d4096 | 764.51 ± 0.50 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d4096 | 19.95 ± 0.18 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 @ d8192 | 759.53 ± 0.71 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d8192 | 19.83 ± 0.18 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 @ d16384 | 747.98 ± 1.58 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d16384 | 19.84 ± 0.18 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | pp2048 @ d32768 | 724.40 ± 2.70 |
| nemotron 120B.A12B Q4_K | 65.10 GiB | 120.67 B | CUDA | 2048 | 1 | tg32 @ d32768 | 19.45 ± 0.18 |
build: 04a65daab (8268)

View File

@@ -25,6 +25,9 @@
# # with KLEIDIAI support
# GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with OPENVINO support
# GG_BUILD_OPENVINO=1 GG_BUILD_LOW_PERF=1 GGML_OPENVINO_DEVICE=CPU bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
@@ -46,6 +49,7 @@ cd $sd/../
SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
CTEST_EXTRA=""
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
@@ -165,6 +169,18 @@ if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
-DBUILD_SHARED_LIBS=OFF"
fi
if [ ! -z ${GG_BUILD_OPENVINO} ]; then
if [ -z ${OpenVINO_DIR} ]; then
echo "OpenVINO_DIR not found, please install OpenVINO via archives and enable it by:"
echo "source /opt/intel/openvino/setupvars.sh"
exit 1
fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON"
# TODO: fix and re-enable the `test-llama-archs` test below
CTEST_EXTRA="-E test-llama-archs"
fi
## helpers
# download a file if it does not exist or if it is outdated
@@ -222,7 +238,7 @@ function gg_run_ctest_debug {
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ) 2>&1 | tee -a $OUT/${ci}-ctest.log
(time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
}
@@ -254,9 +270,9 @@ function gg_run_ctest_release {
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure -L 'main|python' ) 2>&1 | tee -a $OUT/${ci}-ctest.log
(time ctest --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
else
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
(time ctest --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
fi
set +e

View File

@@ -81,6 +81,8 @@ add_library(${TARGET} STATIC
preset.cpp
preset.h
regex-partial.cpp
reasoning-budget.cpp
reasoning-budget.h
regex-partial.h
sampling.cpp
sampling.h

View File

@@ -732,23 +732,28 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-completion",
"llama-convert-llama2c-to-ggml",
"llama-cvector-generator",
"llama-debug",
"llama-diffusion-cli",
"llama-embedding",
"llama-eval-callback",
"llama-export-lora",
"llama-finetune",
"llama-fit-params",
"llama-gemma3-cli",
"llama-gen-docs",
"llama-gguf",
"llama-gguf-hash",
"llama-gguf-split",
"llama-gritlm",
"llama-idle",
"llama-imatrix",
"llama-infill",
"llama-mtmd-cli",
"llama-llava-clip-quantize-cli",
"llama-llava-cli",
"llama-lookahead",
"llama-lookup",
"llama-lookup-create",
"llama-lookup-merge",
"llama-lookup-stats",
"llama-minicpmv-cli",
"llama-mtmd-cli",
"llama-parallel",
"llama-passkey",
"llama-perplexity",
@@ -2427,11 +2432,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
);
}
if (split_arg.size() == 1) {
std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoull(split_arg[0]) * 1024*1024);
return;
}
for (size_t i = 0; i < split_arg.size(); i++) {
params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
params.fit_params_target[i] = std::stoull(split_arg[i]) * 1024*1024;
}
}
).set_env("LLAMA_ARG_FIT_TARGET"));
@@ -2666,7 +2671,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.out_file = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
add_opt(common_arg(
{"-ofreq", "--output-frequency"}, "N",
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -2913,6 +2919,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
auto parsed = json::parse(value);
for (const auto & item : parsed.items()) {
if (item.key() == "enable_thinking") {
LOG_WRN("Setting 'enable_thinking' via --chat-template-kwargs is deprecated. "
"Use --reasoning on / --reasoning off instead.\n");
}
params.default_template_kwargs[item.key()] = item.value().dump();
}
}
@@ -3048,14 +3058,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.reasoning_format = common_reasoning_format_from_name(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
add_opt(common_arg(
{"-rea", "--reasoning"}, "[on|off|auto]",
"Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))",
[](common_params & params, const std::string & value) {
if (is_truthy(value)) {
params.enable_reasoning = 1;
params.default_template_kwargs["enable_thinking"] = "true";
} else if (is_falsey(value)) {
params.enable_reasoning = 0;
params.default_template_kwargs["enable_thinking"] = "false";
} else if (is_autoy(value)) {
params.enable_reasoning = -1;
} else {
throw std::invalid_argument(
string_format("error: unknown value for --reasoning: '%s'\n", value.c_str()));
}
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_REASONING"));
add_opt(common_arg(
{"--reasoning-budget"}, "N",
"controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
"token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
[](common_params & params, int value) {
if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
if (value < -1) { throw std::invalid_argument("invalid value"); }
params.reasoning_budget = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
add_opt(common_arg(
{"--reasoning-budget-message"}, "MESSAGE",
"message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
[](common_params & params, const std::string & value) {
params.reasoning_budget_message = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
@@ -3607,6 +3642,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
add_opt(common_arg(
{"--check"},
string_format("check rather than generate results (default: %s)", params.check ? "true" : "false"),
[](common_params & params) {
params.check = true;
}
).set_examples({LLAMA_EXAMPLE_RESULTS}));
add_opt(common_arg(
{"--save-logits"},
string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),

View File

@@ -1,7 +1,9 @@
#include "chat-auto-parser.h"
#include "chat-peg-parser.h"
#include "chat.h"
#include "common.h"
#include "json-schema-to-grammar.h"
#include "log.h"
#include "nlohmann/json.hpp"
#include <stdexcept>
@@ -51,13 +53,15 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
bool has_tools =
autoparser.tools.format.mode != tool_format::NONE && inputs.tools.is_array() && !inputs.tools.empty();
std::string trigger_marker = !autoparser.tools.format.section_start.empty() ? autoparser.tools.format.section_start :
autoparser.tools.format.per_call_start;
bool include_grammar =
has_tools && ((inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO && !trigger_marker.empty()) ||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED);
autoparser.tools.format.per_call_start;
bool has_response_format = !inputs.json_schema.empty() && inputs.json_schema.is_object();
bool include_grammar = has_response_format || (has_tools &&
((inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO && !trigger_marker.empty()) ||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
if (include_grammar) {
data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar_lazy = !has_response_format && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
@@ -68,7 +72,7 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
});
// Set grammar triggers based on tool section markers (fall back to per-call markers)
if (data.grammar_lazy) { // only do triggers on lazy grammar
if (data.grammar_lazy) {
data.grammar_triggers = {
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, trigger_marker }
};
@@ -87,7 +91,7 @@ common_peg_arena autoparser::build_parser(const templates_params & inputs) const
// pre-register a json-string rule that accepts both quote styles. This must happen
// before any call to p.json() so that all JSON parsing inherits the flexible rule.
if (tools.format.uses_python_dicts) {
p.rule("json-string", [&]() { return p.choice({ p.double_quoted_string(), p.single_quoted_string() }); });
p.rule("json-string", p.quoted_string());
}
parser_build_context ctx(p, inputs);
@@ -104,8 +108,11 @@ common_peg_arena autoparser::build_parser(const templates_params & inputs) const
bool has_response_format = inputs.json_schema.is_object() && !inputs.json_schema.empty();
if (has_response_format) {
return ctx.reasoning_parser + p.space() +
p.content(p.schema(p.json(), "response-format", inputs.json_schema)) + p.end();
auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
return ctx.reasoning_parser + p.space() + p.choice({
p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"),
response_format
}) + p.end();
}
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
@@ -129,7 +136,9 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
if (thinking_forced_open || thinking_forced_closed) {
// Thinking is forced open OR forced closed with enable_thinking=true
// In both cases, expect only the closing tag (opening was in template)
return p.reasoning(p.until(end)) + end;
// However, since we might have incorrectly detected the open/close pattern,
// we admit an optional starting marker
return p.optional(p.literal(start)) + p.reasoning(p.until(end)) + end;
}
if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
// Standard tag-based reasoning OR tools-only mode (reasoning appears with tools)
@@ -174,7 +183,10 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
case tool_format::TAG_WITH_TAGGED:
return build_tool_parser_tag_tagged(ctx);
default:
GGML_ABORT("Unable to create tool parser");
LOG_ERR("[ERROR] Template seems to support tool calls, but failed to determine tool format. Tool calling will not work properly. "
"Check for a fixed template for your model in the models/templates directory of your llama.cpp installation or "
"report an issue at https://github.com/ggml-org/llama.cpp/issues\n");
return ctx.p.eps();
}
}
@@ -239,8 +251,6 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context
if (!function.close.empty()) {
func_parser = func_parser + function.close;
}
func_parser = p.atomic(func_parser);
tool_choice |= p.rule("tool-" + name, func_parser);
});
@@ -357,13 +367,31 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
// Build call_id parser based on position (if supported)
common_peg_parser call_id_section = p.eps();
bool have_call_id = false;
if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
!call_id.suffix.empty()) {
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix))) + call_id.suffix;
have_call_id = true;
call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix)) + call_id.suffix);
}
auto func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
call_id_section + p.space() + args_seq;
bool matched_atomic = false;
common_peg_parser func_parser = p.eps();
if (!function.name_suffix.empty()) {
func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
call_id_section + p.space() + args_seq;
matched_atomic = true;
} else if (have_call_id) {
func_parser = p.atomic(p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
call_id_section) + p.space() + args_seq;
matched_atomic = true;
} else if (!arguments.name_prefix.empty() && properties.size() > 0) {
func_parser = p.atomic(p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
call_id_section + p.space() + p.peek(p.literal(arguments.name_prefix))) + args_seq;
matched_atomic = true;
} else {
func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
call_id_section + p.space() + args_seq;
}
if (!function.close.empty()) {
func_parser = func_parser + p.space() + p.tool_close(p.literal(function.close));
@@ -377,8 +405,10 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
func_parser =
func_parser + p.tool_close(p.space()); // force this to process tool closing callbacks in mapper
}
if (!matched_atomic) {
func_parser = p.atomic(func_parser);
}
func_parser = p.atomic(func_parser);
tool_choice |= p.rule("tool-" + name, func_parser);
});

View File

@@ -162,7 +162,7 @@ diff_split calculate_diff_split(const std::string & left, const std::string & ri
right_fully_consumed = true;
}
auto eat_segment = [](std::string & str, segment & seg) -> std::string { return str.append(seg.value); };
auto eat_segment = [](std::string str, const segment & seg) -> std::string { return std::move(str) + seg.value; };
bool can_have_text_suffix = left_end->type == segment_type::TEXT && right_end->type == segment_type::TEXT;
bool can_have_text_prefix = right_start->type == segment_type::TEXT && left_start->type == segment_type::TEXT;

View File

@@ -6,7 +6,7 @@
#include <nlohmann/json.hpp>
using json = nlohmann::ordered_json;
using ordered_json = nlohmann::ordered_json;
static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
int count = 0;
@@ -68,7 +68,7 @@ static int json_brace_depth(const std::string & s) {
// JSON-escape a string and return the inner content (without surrounding quotes).
static std::string escape_json_string_inner(const std::string & s) {
std::string escaped = json(s).dump();
std::string escaped = ordered_json(s).dump();
if (escaped.size() >= 2 && escaped.front() == '"' && escaped.back() == '"') {
return escaped.substr(1, escaped.size() - 2);
}
@@ -167,8 +167,8 @@ void tag_based_peg_mapper::from_ast(const common_peg_ast_arena & arena, const co
});
}
tagged_parse_result tagged_peg_parser::parse_and_extract(const std::string & input, bool is_partial) const {
common_peg_parse_context ctx(input, is_partial);
tagged_parse_result tagged_peg_parser::parse_and_extract(const std::string & input, common_peg_parse_flags extra_flags) const {
common_peg_parse_context ctx(input, flags | extra_flags);
auto parse_result = arena.parse(ctx);
tag_based_peg_mapper mapper;
@@ -179,11 +179,10 @@ tagged_parse_result tagged_peg_parser::parse_and_extract(const std::string & inp
tagged_parse_result tagged_peg_parser::parse_anywhere_and_extract(const std::string & input) const {
if (input.empty()) {
return parse_and_extract(input, false);
return parse_and_extract(input);
}
for (size_t i = 0; i < input.size(); i++) {
common_peg_parse_context ctx(input, false);
ctx.debug = debug;
common_peg_parse_context ctx(input, flags);
auto parse_result = arena.parse(ctx, i);
if (parse_result.success() || i == input.size() - 1) {
tag_based_peg_mapper mapper;
@@ -310,7 +309,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
if (arg_count > 0) {
arg_entry = ",";
}
arg_entry += json(trim(node.text)).dump() + ":";
arg_entry += ordered_json(trim(node.text)).dump() + ":";
++arg_count;
auto & target = args_target();
@@ -344,7 +343,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
// Try to parse as JSON value (number, bool, null, object, array)
try {
json parsed = json::parse(value_content);
ordered_json parsed = ordered_json::parse(value_content);
if (parsed.is_string()) {
// Don't add closing quote yet (added by arg_close) for monotonic streaming
std::string escaped = parsed.dump();
@@ -409,7 +408,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
common_peg_parser common_chat_peg_builder::standard_constructed_tools(
const std::map<std::string, std::string> & markers,
const nlohmann::json & tools,
const ordered_json & tools,
bool parallel_tool_calls,
bool force_tool_calls) {
if (!tools.is_array() || tools.empty()) {
@@ -440,7 +439,7 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
}
const auto & function = tool_def.at("function");
std::string name = function.at("name");
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
// Build argument parsers
auto args = eps();
@@ -477,6 +476,74 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
return force_tool_calls ? section : optional(section);
}
// Python-style tool calls: name(arg1="value1", arg2=123)
// Used only by LFM2 for now, so we don't merge it into autoparser
common_peg_parser common_chat_peg_builder::python_style_tool_calls(
const ordered_json & tools,
bool parallel_tool_calls) {
if (!tools.is_array() || tools.empty()) {
return eps();
}
auto tool_choices = choice();
for (const auto & tool_def : tools) {
if (!tool_def.contains("function")) {
continue;
}
const auto & function = tool_def.at("function");
std::string name = function.at("name");
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
auto args = eps();
if (params.contains("properties") && !params["properties"].empty()) {
auto arg_choice = choice();
for (const auto & el : params["properties"].items()) {
const std::string & prop_name = el.key();
const auto & prop_def = el.value();
bool is_string_type = (prop_def.contains("type") && prop_def["type"] == "string");
auto arg_name_parser = literal(prop_name);
common_peg_parser arg_value_parser = eps();
auto string_value_parser = choice({
literal("\"") + tool_arg_string_value(string_content('"')) + literal("\""),
literal("'") + tool_arg_string_value(string_content('\'')) + literal("'")
});
if (is_string_type) {
arg_value_parser = string_value_parser;
} else {
arg_value_parser = tool_arg_value(python_value());
}
// Full argument: name="value" or name=value
auto arg_rule = tool_arg(
tool_arg_open(eps()) +
tool_arg_name(arg_name_parser) +
literal("=") +
arg_value_parser +
tool_arg_close(eps())
);
arg_choice |= arg_rule;
}
args = arg_choice + zero_or_more("," + space() + arg_choice);
}
auto tool_parser = tool(tool_open(tool_name(literal(name)) + literal("(")) +
space() + tool_args(args) + space() + tool_close(literal(")"))
);
tool_choices |= rule("tool-" + name, tool_parser);
}
if (parallel_tool_calls) {
return "[" + space() + tool_choices + zero_or_more("," + space() + tool_choices) + space() + "]";
}
return "[" + space() + tool_choices + space() + "]";
}
// Helper: Parse dot notation key into prefix and field name
static std::pair<std::string, std::string> parse_key_spec(const std::string & key) {
auto dot_pos = key.find('.');
@@ -488,11 +555,11 @@ static std::pair<std::string, std::string> parse_key_spec(const std::string & ke
// Mode 1: function_is_key — parse {"function_name": {...}}
common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
const nlohmann::json & tools,
const std::string & args_key,
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key) {
const ordered_json & tools,
const std::string & args_key,
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key) {
auto tool_choices = choice();
@@ -502,7 +569,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
}
const auto & function = tool_def.at("function");
std::string name = function.at("name");
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
// Build inner object fields
std::vector<common_peg_parser> inner_fields;
@@ -510,7 +577,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
if (!call_id_key.empty()) {
auto id_parser = atomic(
literal("\"" + call_id_key + "\"") + space() + literal(":") + space() +
literal("\"") + tool_id(json_string_content()) + literal("\"")
literal("\"") + tool_id(string_content('"')) + literal("\"")
);
inner_fields.push_back(optional(id_parser + space() + optional(literal(",") + space())));
}
@@ -519,7 +586,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
auto gen_id_parser = atomic(
literal("\"" + gen_call_id_key + "\"") + space() + literal(":") + space() +
choice({
literal("\"") + tool_id(json_string_content()) + literal("\""),
literal("\"") + tool_id(string_content('"')) + literal("\""),
tool_id(json_number())
})
);
@@ -567,11 +634,11 @@ common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
// Mode 2: Nested keys (dot notation like "function.name")
common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
const nlohmann::json & tools,
const std::string & effective_name_key,
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key) {
const ordered_json & tools,
const std::string & effective_name_key,
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key) {
auto tool_choices = choice();
@@ -588,7 +655,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
}
const auto & function = tool_def.at("function");
std::string name = function.at("name");
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
auto nested_name = literal("\"" + nested_name_field + "\"") + space() + literal(":") + space() +
literal("\"") + tool_name(literal(name)) + literal("\"");
@@ -608,7 +675,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
if (id_spec.first.empty()) {
auto id_parser = atomic(
literal("\"" + call_id_key + "\"") + space() + literal(":") + space() +
literal("\"") + tool_id(json_string_content()) + literal("\"")
literal("\"") + tool_id(string_content('"')) + literal("\"")
);
tool_parser_body = tool_parser_body + optional(id_parser + space() + literal(",") + space());
}
@@ -620,7 +687,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
auto gen_id_parser = atomic(
literal("\"" + gen_call_id_key + "\"") + space() + literal(":") + space() +
choice({
literal("\"") + tool_id(json_string_content()) + literal("\""),
literal("\"") + tool_id(string_content('"')) + literal("\""),
tool_id(json_number())
})
);
@@ -639,7 +706,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
// Mode 3: Flat keys with optional ID fields and parameter ordering
common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
const nlohmann::json & tools,
const ordered_json & tools,
const std::string & effective_name_key,
const std::string & effective_args_key,
const std::string & call_id_key,
@@ -656,7 +723,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
}
const auto & function = tool_def.at("function");
std::string name = function.at("name");
nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();
ordered_json params = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
auto tool_name_ = name_key_parser + space() + literal(":") + space() +
literal("\"") + tool_name(literal(name)) + literal("\"");
@@ -669,7 +736,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
id_parser = atomic(
literal("\"" + call_id_key + "\"") + space() + literal(":") + space() +
choice({
literal("\"") + tool_id(json_string_content()) + literal("\""),
literal("\"") + tool_id(string_content('"')) + literal("\""),
tool_id(json_number())
})
);
@@ -680,7 +747,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
gen_id_parser = atomic(
literal("\"" + gen_call_id_key + "\"") + space() + literal(":") + space() +
choice({
literal("\"") + tool_id(json_string_content()) + literal("\""),
literal("\"") + tool_id(string_content('"')) + literal("\""),
tool_id(json_number())
})
);
@@ -724,7 +791,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
common_peg_parser common_chat_peg_builder::standard_json_tools(
const std::string & section_start,
const std::string & section_end,
const nlohmann::json & tools,
const ordered_json & tools,
bool parallel_tool_calls,
bool force_tool_calls,
const std::string & name_key,

View File

@@ -94,7 +94,7 @@ class common_chat_peg_builder : public common_peg_parser_builder {
// parameters_order: order in which JSON fields should be parsed
common_peg_parser standard_json_tools(const std::string & section_start,
const std::string & section_end,
const nlohmann::json & tools,
const nlohmann::ordered_json & tools,
bool parallel_tool_calls,
bool force_tool_calls,
const std::string & name_key = "",
@@ -108,25 +108,30 @@ class common_chat_peg_builder : public common_peg_parser_builder {
// Legacy-compatible helper for building XML/tagged style tool calls
// Used by tests and manual parsers
common_peg_parser standard_constructed_tools(const std::map<std::string, std::string> & markers,
const nlohmann::json & tools,
const nlohmann::ordered_json & tools,
bool parallel_tool_calls,
bool force_tool_calls);
// Helper for Python-style function call format: name(arg1="value1", arg2=123)
// Used by LFM2 and similar templates
common_peg_parser python_style_tool_calls(const nlohmann::ordered_json & tools,
bool parallel_tool_calls);
private:
// Implementation helpers for standard_json_tools — one per JSON tool call layout mode
common_peg_parser build_json_tools_function_is_key(const nlohmann::json & tools,
const std::string & args_key,
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key);
common_peg_parser build_json_tools_function_is_key(const nlohmann::ordered_json & tools,
const std::string & args_key,
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key);
common_peg_parser build_json_tools_nested_keys(const nlohmann::json & tools,
const std::string & effective_name_key,
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key);
common_peg_parser build_json_tools_nested_keys(const nlohmann::ordered_json & tools,
const std::string & effective_name_key,
const std::string & effective_args_key,
const std::string & call_id_key,
const std::string & gen_call_id_key);
common_peg_parser build_json_tools_flat_keys(const nlohmann::json & tools,
common_peg_parser build_json_tools_flat_keys(const nlohmann::ordered_json & tools,
const std::string & effective_name_key,
const std::string & effective_args_key,
const std::string & call_id_key,
@@ -155,19 +160,19 @@ struct tagged_parse_result {
struct tagged_peg_parser {
common_peg_arena arena;
bool debug = false;
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE;
tagged_peg_parser & withDebug() {
debug = true;
flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
return *this;
}
tagged_peg_parser & withoutDebug() {
debug = false;
flags = flags & ~COMMON_PEG_PARSE_FLAG_DEBUG;
return *this;
}
tagged_parse_result parse_and_extract(const std::string & input, bool is_partial = false) const;
tagged_parse_result parse_and_extract(const std::string & input, common_peg_parse_flags extra_flags = COMMON_PEG_PARSE_FLAG_NONE) const;
tagged_parse_result parse_anywhere_and_extract(const std::string & input) const;
};

View File

@@ -129,7 +129,7 @@ json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
{"type", "function"},
{"function", {
{"name", tool_call.name},
{"arguments", json::parse(tool_call.arguments)},
{"arguments", json(tool_call.arguments)},
}},
};
if (!tool_call.id.empty()) {
@@ -857,7 +857,9 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = true;
data.supports_thinking = true;
data.supports_thinking = true;
data.thinking_start_tag = "[THINK]";
data.thinking_end_tag = "[/THINK]";
data.prompt = common_chat_template_direct_apply(tmpl, inputs, /* messages_override = */ adjusted_messages);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.preserved_tokens = {
@@ -1165,9 +1167,11 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
const autoparser::templates_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.thinking_start_tag = "<think>";
data.thinking_end_tag = "</think>";
data.preserved_tokens = {
"<|tool_calls_section_begin|>",
"<|tool_calls_section_end|>",
@@ -1274,8 +1278,166 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
return data;
}
// LFM2 format:
// - Reasoning: <think>{reasoning}</think> (optional, only if enable_thinking is true)
// - Content: text after reasoning (optional)
// - Tool calls: <|tool_call_start|>[function_name(arg1="value1", arg2="value2")]<|tool_call_end|>
// Tool calls can appear multiple times (parallel tool calls)
static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl,
const autoparser::templates_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
"<|tool_list_start|>",
"<|tool_list_end|>",
"<|tool_call_start|>",
"<|tool_call_end|>",
"<think>",
"</think>",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
const std::string TOOL_CALL_START = "<|tool_call_start|>";
const std::string TOOL_CALL_END = "<|tool_call_end|>";
const std::string THINK_START = "<think>";
const std::string THINK_END = "</think>";
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto end = p.end();
auto reasoning = p.eps();
if (extract_reasoning && inputs.enable_thinking) {
reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
}
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
return reasoning + p.content(p.rest()) + end;
}
auto tool_calls = p.rule("tool-calls",
p.trigger_rule("tool-call", p.literal(TOOL_CALL_START) +
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) +
p.literal(TOOL_CALL_END)
)
);
auto content = p.content(p.until(TOOL_CALL_START));
return reasoning + content + tool_calls + end;
});
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.at("parameters");
builder.resolve_refs(schema);
});
parser.build_grammar(builder, data.grammar_lazy);
});
data.grammar_triggers = {
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START }
};
}
return data;
}
// Build chat-format parameters for the GigaChatV3 template family.
// GigaChatV3 separates messages with "<|message_sep|>\n\n" and roles with
// "<|role_sep|>\n"; tool calls are emitted as a single JSON object after a
// "function call" role header. This constructs a PEG parser for the model
// output and, when tools are available, a (possibly lazy) grammar that is
// triggered by the tool-call prefix.
static common_chat_params common_chat_params_init_gigachat_v3(
const common_chat_template & tmpl,
const autoparser::templates_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
// NOTE(review): thinking is unconditionally disabled for this template —
// presumably GigaChatV3 has no reasoning tags; confirm against the template.
data.supports_thinking = false;
// Keep the separator token sequences intact during tokenization/streaming.
data.preserved_tokens = {
"<|message_sep|>\n\n",
"<|role_sep|>\n",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
// Grammar is only needed when tools exist and tool calling is not disabled;
// may still be cleared below if the parser falls back to content-only.
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
// Prefix the model emits right before a tool call: message separator,
// then the "function call" role header.
auto tool_call_start_prefix = "<|message_sep|>\n\nfunction call<|role_sep|>\n";
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
// Build a choice of all available tools
auto tool_choice = p.choice();
for (const auto & tool : inputs.tools) {
const auto & function = tool.at("function");
std::string name = function.at("name");
const auto & schema = function.at("parameters");
// Each tool call is {"name": "<tool>", "arguments": <schema-checked JSON>}.
auto tool_name = p.json_member("name", "\"" + p.tool_name(p.literal(name)) + "\"");
auto tool_args = p.json_member("arguments", p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema)));
auto tool_open = p.tool_open(p.literal("{") << tool_name);
tool_choice |= p.rule("tool-" + name, tool_open << "," << tool_args << "}");
}
// Define the tool call structure
auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
auto max_calls = 1; // parallel toolcalls are not supported
auto tool_call = p.rule("tool-call", p.literal(tool_call_start_prefix) + tool_choice);
auto tool_calls = p.trigger_rule("tool-call-root", p.repeat(tool_call, /* min = */ min_calls, /* max = */ max_calls));
// Free-form content runs until the message separator, then optional tool calls.
return p.content(p.until("<|message_sep|>\n\n")) << tool_calls;
}
// Content only parser
// NOTE: include_grammar is captured by reference — clearing it here
// suppresses the grammar block below when no tool parsing is set up.
include_grammar = false;
return p.content(p.rest());
});
data.parser = parser.save();
if (include_grammar) {
// Lazy grammar: only constrain output after the trigger word appears
// (AUTO tool choice); REQUIRED applies the grammar from the start.
data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.at("parameters");
// Inline any $ref entries so the schema is self-contained for GBNF.
builder.resolve_refs(schema);
});
parser.build_grammar(builder, data.grammar_lazy);
});
// The tool-call prefix is the word that activates the lazy grammar.
data.grammar_triggers = {
{COMMON_GRAMMAR_TRIGGER_TYPE_WORD, tool_call_start_prefix}
};
}
return data;
}
namespace workaround {
// Normalize the OpenAI-style "developer" role to "system" in-place.
// Templates that predate the "developer" role only recognize "system",
// so every matching message has its role rewritten before the template
// is applied. Messages without a "role" field are left untouched.
static void map_developer_role_to_system(json & messages) {
for (auto & msg : messages) {
if (msg.contains("role") && msg["role"] == "developer") {
msg["role"] = "system";
}
}
}
// if first message is system and template does not support it, merge it with next message
static void system_message_not_supported(json & messages) {
if (!messages.empty() && messages.front().at("role") == "system") {
@@ -1353,6 +1515,12 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
params.add_bos = tmpls->add_bos;
params.add_eos = tmpls->add_eos;
if (src.find("<|channel|>") == std::string::npos) {
// map developer to system for all models except for GPT-OSS
workaround::map_developer_role_to_system(params.messages);
}
workaround::func_args_not_string(params.messages);
if (!tmpl.original_caps().supports_system_role) {
workaround::system_message_not_supported(params.messages);
}
@@ -1420,12 +1588,39 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
return common_chat_params_init_kimi_k2(tmpl, params);
}
// LFM2 - uses <|tool_list_start|>/<|tool_list_end|> markers and <|tool_call_start|>[name(args)]<|tool_call_end|> format
// Detection: template has "<|tool_list_start|>" and "<|tool_list_end|>" markers
if (src.find("<|tool_list_start|>") != std::string::npos &&
src.find("<|tool_list_end|>") != std::string::npos) {
LOG_DBG("Using specialized template: LFM2\n");
return common_chat_params_init_lfm2(tmpl, params);
}
// GigaChatV3 format detection
if (src.find("<|role_sep|>") != std::string::npos &&
src.find("<|message_sep|>") != std::string::npos &&
src.find("<|function_call|>") == std::string::npos
) {
LOG_DBG("Using specialized template: GigaChatV3\n");
return common_chat_params_init_gigachat_v3(tmpl, params);
}
try {
LOG_DBG("Using differential autoparser\n");
struct autoparser::autoparser autoparser;
autoparser.analyze_template(tmpl);
auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
if (auto_params.supports_thinking) {
auto_params.thinking_start_tag = autoparser.reasoning.start;
auto_params.thinking_end_tag = autoparser.reasoning.end;
// FORCED_OPEN and FORCED_CLOSED both put <think> in the generation prompt
// (FORCED_CLOSED forces empty <think></think> when thinking is disabled,
// but forces <think> open when thinking is enabled)
auto_params.thinking_forced_open =
autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_OPEN ||
autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_CLOSED;
}
return auto_params;
} catch (const std::exception & e) {
throw std::invalid_argument(std::string("Unable to generate parser for this template. Automatic parser generation failed: ") + e.what());
@@ -1519,14 +1714,18 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
build_chat_peg_parser([](common_chat_peg_builder & p) { return p.content(p.rest()) + p.end(); }) :
src_parser;
if (src_parser.empty()) {
LOG_WRN("No parser definition detected, assuming pure content parser.");
if (src_parser.empty()) {
LOG_DBG("No parser definition detected, assuming pure content parser.");
}
LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), input.c_str());
common_peg_parse_context ctx(input, is_partial);
ctx.debug = params.debug;
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
if (params.debug) {
flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
}
common_peg_parse_context ctx(input, flags);
auto result = parser.parse(ctx);
if (result.fail()) {
@@ -1539,7 +1738,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
auto mapper = common_chat_peg_mapper(msg);
mapper.from_ast(ctx.ast, result);
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "\nAST for partial parse (fail):\n%s\n", ctx.ast.dump().c_str());
fflush(stderr);
}
@@ -1555,7 +1754,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
auto mapper = common_chat_peg_mapper(msg);
mapper.from_ast(ctx.ast, result);
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "\nAST for %s parse:\n%s\n", is_partial ? "partial" : "full", ctx.ast.dump().c_str());
fflush(stderr);
}

View File

@@ -213,6 +213,8 @@ struct common_chat_params {
bool grammar_lazy = false;
bool thinking_forced_open = false;
bool supports_thinking = false;
std::string thinking_start_tag; // e.g., "<think>"
std::string thinking_end_tag; // e.g., "</think>"
std::vector<common_grammar_trigger> grammar_triggers;
std::vector<std::string> preserved_tokens;
std::vector<std::string> additional_stops;

View File

@@ -104,6 +104,8 @@ enum llama_example {
LLAMA_EXAMPLE_DIFFUSION,
LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_FIT_PARAMS,
LLAMA_EXAMPLE_RESULTS,
LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
LLAMA_EXAMPLE_COUNT,
};
@@ -234,6 +236,14 @@ struct common_params_sampling {
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
// reasoning budget sampler parameters
// these are populated by the server/CLI based on chat template params
int32_t reasoning_budget_tokens = -1; // -1 = disabled, >= 0 = token budget
bool reasoning_budget_activate_immediately = false;
std::vector<llama_token> reasoning_budget_start; // start tag token sequence
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
bool backend_sampling = false;
bool has_logit_bias() const {
@@ -456,6 +466,8 @@ struct common_params {
bool kl_divergence = false; // compute KL divergence
bool check = false; // check rather than generate results for llama-results
bool usage = false; // print usage
bool completion = false; // print source-able completion script
bool use_color = false; // use color to distinguish generations and inputs
@@ -533,7 +545,9 @@ struct common_params {
bool use_jinja = true; // NOLINT
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
int reasoning_budget = -1;
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time
@@ -913,7 +927,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
// MoE utils
//
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate|gate_up)_(ch|)exps";
inline std::string llm_ffn_exps_block_regex(int idx) {
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);

View File

@@ -7,6 +7,7 @@ struct common_http_url {
std::string user;
std::string password;
std::string host;
int port;
std::string path;
};
@@ -47,6 +48,20 @@ static common_http_url common_http_parse_url(const std::string & url) {
parts.host = rest;
parts.path = "/";
}
auto colon_pos = parts.host.find(':');
if (colon_pos != std::string::npos) {
parts.port = std::stoi(parts.host.substr(colon_pos + 1));
parts.host = parts.host.substr(0, colon_pos);
} else if (parts.scheme == "http") {
parts.port = 80;
} else if (parts.scheme == "https") {
parts.port = 443;
} else {
throw std::runtime_error("unsupported URL scheme: " + parts.scheme);
}
return parts;
}
@@ -68,7 +83,7 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
}
#endif
httplib::Client cli(parts.scheme + "://" + parts.host);
httplib::Client cli(parts.scheme + "://" + parts.host + ":" + std::to_string(parts.port));
if (!parts.user.empty()) {
cli.set_basic_auth(parts.user, parts.password);

View File

@@ -790,7 +790,7 @@ public:
} else if (target.is_array()) {
size_t sel_index;
try {
sel_index = std::stoul(sel);
sel_index = std::stoull(sel);
} catch (const std::invalid_argument & e) {
sel_index = target.size();
}

View File

@@ -349,7 +349,7 @@ struct parser_executor {
auto pos = start_pos;
for (auto i = 0u; i < p.literal.size(); ++i) {
if (pos >= ctx.input.size()) {
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@@ -364,7 +364,7 @@ struct parser_executor {
}
common_peg_parse_result operator()(const common_peg_sequence_parser & p) {
if (ctx.debug) {
if (ctx.is_debug()) {
LOG_DBG("%sSEQ start at %zu '%s' (%zu children)\n", debug_indent().c_str(), start_pos,
debug_input_snippet(start_pos).c_str(), p.children.size());
}
@@ -375,26 +375,19 @@ struct parser_executor {
for (size_t i = 0; i < p.children.size(); i++) {
const auto & child_id = p.children[i];
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sSEQ child %zu: %s\n", debug_indent().c_str(), i, arena.dump(child_id).c_str());
}
auto result = arena.parse(child_id, ctx, pos);
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sSEQ child %zu: %s at %zu->%zu\n", debug_indent().c_str(), i,
common_peg_parse_result_type_name(result.type), result.start, result.end);
}
if (result.fail()) {
ctx.parse_depth--;
if (ctx.is_partial && result.end >= ctx.input.size()) {
if (ctx.debug) {
fprintf(stderr, "%sSEQ -> NEED_MORE (child failed at end)\n", debug_indent().c_str());
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end,
std::move(nodes));
}
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sSEQ -> FAIL\n", debug_indent().c_str());
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, result.end);
@@ -406,7 +399,7 @@ struct parser_executor {
if (result.need_more_input()) {
ctx.parse_depth--;
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sSEQ -> NEED_MORE\n", debug_indent().c_str());
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
@@ -416,14 +409,14 @@ struct parser_executor {
}
ctx.parse_depth--;
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sSEQ -> SUCCESS at %zu->%zu\n", debug_indent().c_str(), start_pos, pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
}
common_peg_parse_result operator()(const common_peg_choice_parser & p) {
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sCHOICE start at %zu '%s' (%zu options)\n", debug_indent().c_str(), start_pos,
debug_input_snippet(start_pos).c_str(), p.children.size());
}
@@ -432,17 +425,17 @@ struct parser_executor {
auto pos = start_pos;
for (size_t i = 0; i < p.children.size(); i++) {
const auto & child_id = p.children[i];
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sCHOICE option %zu: %s\n", debug_indent().c_str(), i, arena.dump(child_id).c_str());
}
auto result = arena.parse(child_id, ctx, pos);
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sCHOICE option %zu: %s\n", debug_indent().c_str(), i,
common_peg_parse_result_type_name(result.type));
}
if (!result.fail()) {
ctx.parse_depth--;
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sCHOICE -> %s (option %zu)\n", debug_indent().c_str(),
common_peg_parse_result_type_name(result.type), i);
}
@@ -451,14 +444,14 @@ struct parser_executor {
}
ctx.parse_depth--;
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sCHOICE -> FAIL (no options matched)\n", debug_indent().c_str());
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
}
common_peg_parse_result operator()(const common_peg_repetition_parser & p) {
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sREPEAT start at %zu '%s' (min=%d, max=%d)\n", debug_indent().c_str(), start_pos,
debug_input_snippet(start_pos).c_str(), p.min_count, p.max_count);
}
@@ -471,7 +464,7 @@ struct parser_executor {
// Try to match up to max_count times (or unlimited if max_count is -1)
while (p.max_count == -1 || match_count < p.max_count) {
if (pos >= ctx.input.size()) {
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sREPEAT: at end of input, count=%d\n", debug_indent().c_str(), match_count);
}
break;
@@ -479,7 +472,7 @@ struct parser_executor {
auto result = arena.parse(p.child, ctx, pos);
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sREPEAT iter %d: %s at %zu->%zu, nodes=%zu\n", debug_indent().c_str(), match_count,
common_peg_parse_result_type_name(result.type), result.start, result.end, result.nodes.size());
fprintf(stderr, "%sREPEAT CHILD: %s\n", debug_indent().c_str(), arena.dump(p.child).c_str());
@@ -488,7 +481,7 @@ struct parser_executor {
if (result.success()) {
// Prevent infinite loop on empty matches
if (result.end == pos) {
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%s REPEAT: empty match, stopping\n", debug_indent().c_str());
}
break;
@@ -509,7 +502,7 @@ struct parser_executor {
}
ctx.parse_depth--;
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sREPEAT -> NEED_MORE (count=%d, nodes=%zu)\n", debug_indent().c_str(),
match_count, nodes.size());
}
@@ -517,7 +510,7 @@ struct parser_executor {
}
// Child failed - stop trying
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sREPEAT: child failed, stopping\n", debug_indent().c_str());
}
break;
@@ -526,14 +519,14 @@ struct parser_executor {
// Check if we got enough matches
if (p.min_count > 0 && match_count < p.min_count) {
ctx.parse_depth--;
if (pos >= ctx.input.size() && ctx.is_partial) {
if (ctx.debug) {
if (pos >= ctx.input.size() && ctx.is_lenient()) {
if (ctx.is_debug()) {
fprintf(stderr, "%sREPEAT -> NEED_MORE (not enough matches: %d < %d)\n", debug_indent().c_str(),
match_count, p.min_count);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos, std::move(nodes));
}
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sREPEAT -> FAIL (not enough matches: %d < %d)\n", debug_indent().c_str(), match_count,
p.min_count);
}
@@ -541,7 +534,7 @@ struct parser_executor {
}
ctx.parse_depth--;
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sREPEAT -> SUCCESS (count=%d, nodes=%zu)\n", debug_indent().c_str(), match_count,
nodes.size());
}
@@ -576,7 +569,7 @@ struct parser_executor {
auto result = common_parse_utf8_codepoint(ctx.input, start_pos);
if (result.status == utf8_parse_result::INCOMPLETE) {
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
@@ -615,7 +608,7 @@ struct parser_executor {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
}
// Not enough matches yet
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@@ -656,7 +649,7 @@ struct parser_executor {
// Check if we got enough matches
if (match_count < p.min_count) {
if (pos >= ctx.input.size() && ctx.is_partial) {
if (pos >= ctx.input.size() && ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
@@ -665,32 +658,23 @@ struct parser_executor {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
}
static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos) {
static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos, const char delimiter) {
++pos; // consume '\'
if (pos >= ctx.input.size()) {
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
}
switch (ctx.input[pos]) {
case '"':
case '\'':
case '\\':
case '/':
case 'b':
case 'f':
case 'n':
case 'r':
case 't':
++pos;
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
case 'u':
return handle_unicode_escape(ctx, start, pos);
default:
// Invalid escape sequence
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
char c = ctx.input[pos];
if (c == delimiter || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || c == 'r' || c == 't') {
++pos;
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
} else if (c == 'u') {
return handle_unicode_escape(ctx, start, pos);
} else {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
}
}
@@ -698,7 +682,7 @@ struct parser_executor {
++pos; // consume 'u'
for (int i = 0; i < 4; ++i) {
if (pos >= ctx.input.size()) {
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
@@ -711,20 +695,20 @@ struct parser_executor {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
}
common_peg_parse_result operator()(const common_peg_json_string_parser & /* p */) {
common_peg_parse_result operator()(const common_peg_string_parser & p) {
auto pos = start_pos;
// Parse string content (without quotes)
while (pos < ctx.input.size()) {
char c = ctx.input[pos];
if (c == '"') {
// Found closing quote - success (don't consume it)
if (c == p.delimiter) {
// Found closing delimiter - success (don't consume it)
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
}
if (c == '\\') {
auto result = handle_escape_sequence(ctx, start_pos, pos);
auto result = handle_escape_sequence(ctx, start_pos, pos, p.delimiter);
if (!result.success()) {
return result;
}
@@ -732,7 +716,7 @@ struct parser_executor {
auto utf8_result = common_parse_utf8_codepoint(ctx.input, pos);
if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@@ -747,49 +731,7 @@ struct parser_executor {
}
// Reached end without finding closing quote
if (!ctx.is_partial) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
}
common_peg_parse_result operator()(const common_peg_python_dict_string_parser & /* p */) {
auto pos = start_pos;
// Parse string content (without quotes)
while (pos < ctx.input.size()) {
char c = ctx.input[pos];
if (c == '\'') {
// Found closing quote - success (don't consume it)
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
}
if (c == '\\') {
auto result = handle_escape_sequence(ctx, start_pos, pos);
if (!result.success()) {
return result;
}
} else {
auto utf8_result = common_parse_utf8_codepoint(ctx.input, pos);
if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
if (!ctx.is_partial) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
}
if (utf8_result.status == utf8_parse_result::INVALID) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
}
pos += utf8_result.bytes_consumed;
}
}
// Reached end without finding closing quote
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@@ -807,7 +749,7 @@ struct parser_executor {
if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
// Incomplete UTF-8 sequence
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
// Input is complete but UTF-8 is incomplete = malformed
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
}
@@ -837,7 +779,7 @@ struct parser_executor {
last_valid_pos = pos;
}
if (last_valid_pos == ctx.input.size() && ctx.is_partial) {
if (last_valid_pos == ctx.input.size() && ctx.is_lenient()) {
// Reached the end of a partial stream, there might still be more input that we need to consume.
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos);
}
@@ -876,7 +818,7 @@ struct parser_executor {
common_peg_parse_result operator()(const common_peg_tag_parser & p) {
// Parse the child
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sTAG: %s\n", debug_indent().c_str(), p.tag.c_str());
}
auto result = arena.parse(p.child, ctx, start_pos);
@@ -995,8 +937,7 @@ void common_peg_arena::resolve_refs() {
std::is_same_v<T, common_peg_ref_parser> ||
std::is_same_v<T, common_peg_until_parser> ||
std::is_same_v<T, common_peg_literal_parser> ||
std::is_same_v<T, common_peg_json_string_parser> ||
std::is_same_v<T, common_peg_python_dict_string_parser> ||
std::is_same_v<T, common_peg_string_parser> ||
std::is_same_v<T, common_peg_chars_parser> ||
std::is_same_v<T, common_peg_any_parser> ||
std::is_same_v<T, common_peg_space_parser>) {
@@ -1072,10 +1013,8 @@ std::string common_peg_arena::dump_impl(common_peg_parser_id
return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", unbounded)";
}
return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
} else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
return "JsonString()";
} else if constexpr (std::is_same_v<T, common_peg_python_dict_string_parser>) {
return "PythonDictString()";
} else if constexpr (std::is_same_v<T, common_peg_string_parser>) {
return "String(" + std::string(1, p.delimiter) + ")";
} else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
return "Until(" + string_join(p.delimiters, " | ") + ")";
} else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
@@ -1288,47 +1227,25 @@ common_peg_arena common_peg_parser_builder::build() {
// String primitives
common_peg_parser common_peg_parser_builder::json_string_content() {
return wrap(arena_.add_parser(common_peg_json_string_parser{}));
}
common_peg_parser common_peg_parser_builder::single_quoted_string_content() {
return wrap(arena_.add_parser(common_peg_python_dict_string_parser{}));
common_peg_parser common_peg_parser_builder::string_content(char delimiter) {
return wrap(arena_.add_parser(common_peg_string_parser{delimiter}));
}
common_peg_parser common_peg_parser_builder::double_quoted_string() {
return rule("dq-string",
[this]() { return sequence({ literal("\""), json_string_content(), literal("\""), space() }); });
}
common_peg_parser common_peg_parser_builder::single_quoted_string() {
return rule("sq-string",
[this]() { return sequence({ literal("'"), single_quoted_string_content(), literal("'"), space() }); });
}
common_peg_parser common_peg_parser_builder::flexible_string() {
return rule("flexible-string", [this]() { return choice({ double_quoted_string(), single_quoted_string() }); });
}
// Generic helpers for object/array structure
common_peg_parser common_peg_parser_builder::generic_object(const std::string & name,
const common_peg_parser & string_parser,
const common_peg_parser & value_parser) {
return rule(name, [this, string_parser, value_parser]() {
auto ws = space();
auto member = sequence({ string_parser, ws, literal(":"), ws, value_parser });
auto members = sequence({ member, zero_or_more(sequence({ ws, literal(","), ws, member })) });
return sequence({ literal("{"), ws, choice({ literal("}"), sequence({ members, ws, literal("}") }) }) });
return rule("double-quoted-string", [this]() {
return sequence({literal("\""), string_content('"'), literal("\""), space()});
});
}
common_peg_parser common_peg_parser_builder::generic_array(const std::string & name,
const common_peg_parser & value_parser) {
return rule(name, [this, value_parser]() {
auto ws = space();
auto elements = sequence({ value_parser, zero_or_more(sequence({ literal(","), ws, value_parser })) });
return sequence({ literal("["), ws, choice({ literal("]"), sequence({ elements, ws, literal("]") }) }) });
common_peg_parser common_peg_parser_builder::single_quoted_string() {
return rule("single-quoted-string", [this]() {
return sequence({literal("'"), string_content('\''), literal("'"), space()});
});
}
common_peg_parser common_peg_parser_builder::quoted_string() {
return rule("quoted-string", [this]() {
return choice({double_quoted_string(), single_quoted_string()});
});
}
@@ -1351,7 +1268,7 @@ common_peg_parser common_peg_parser_builder::json_number() {
common_peg_parser common_peg_parser_builder::json_string() {
return rule("json-string", [this]() {
return sequence({literal("\""), json_string_content(), literal("\""), space()});
return sequence({literal("\""), string_content('"'), literal("\""), space()});
});
}
@@ -1368,11 +1285,36 @@ common_peg_parser common_peg_parser_builder::json_null() {
}
common_peg_parser common_peg_parser_builder::json_object() {
return generic_object("json-object", json_string(), json());
return rule("json-object", [this]() {
auto ws = space();
auto member = sequence({json_string(), ws, literal(":"), ws, json()});
auto members = sequence({member, zero_or_more(sequence({ws, literal(","), ws, member}))});
return sequence({
literal("{"),
ws,
choice({
literal("}"),
sequence({members, ws, literal("}")})
}),
ws
});
});
}
common_peg_parser common_peg_parser_builder::json_array() {
return generic_array("json-array", json());
return rule("json-array", [this]() {
auto ws = space();
auto elements = sequence({json(), zero_or_more(sequence({literal(","), ws, json()}))});
return sequence({
literal("["),
ws,
choice({
literal("]"),
sequence({elements, ws, literal("]")})
}),
ws
});
});
}
common_peg_parser common_peg_parser_builder::json() {
@@ -1389,7 +1331,9 @@ common_peg_parser common_peg_parser_builder::json() {
}
common_peg_parser common_peg_parser_builder::python_string() {
return rule("python-string", [this]() { return choice({ double_quoted_string(), single_quoted_string() }); });
return rule("python-string", [this]() {
return choice({double_quoted_string(), single_quoted_string()});
});
}
common_peg_parser common_peg_parser_builder::python_number() {
@@ -1397,24 +1341,63 @@ common_peg_parser common_peg_parser_builder::python_number() {
}
common_peg_parser common_peg_parser_builder::python_bool() {
return rule("python-bool", [this]() { return sequence({ choice({ literal("True"), literal("False") }), space() }); });
return rule("python-bool", [this]() {
return sequence({
choice({literal("True"), literal("False")}),
space()
});
});
}
common_peg_parser common_peg_parser_builder::python_null() {
return rule("python-none", [this]() { return sequence({ literal("None"), space() }); });
return rule("python-none", [this]() {
return sequence({literal("None"), space()});
});
}
common_peg_parser common_peg_parser_builder::python_dict() {
return generic_object("python-dict", python_string(), python_value());
return rule("python-dict", [this]() {
auto ws = space();
auto member = sequence({python_string(), ws, literal(":"), ws, python_value()});
auto members = sequence({member, zero_or_more(sequence({ws, literal(","), ws, member}))});
return sequence({
literal("{"),
ws,
choice({
literal("}"),
sequence({members, ws, literal("}")})
}),
ws
});
});
}
common_peg_parser common_peg_parser_builder::python_array() {
return generic_array("python-array", python_value());
return rule("python-array", [this]() {
auto ws = space();
auto elements = sequence({python_value(), zero_or_more(sequence({literal(","), ws, python_value()}))});
return sequence({
literal("["),
ws,
choice({
literal("]"),
sequence({elements, ws, literal("]")})
}),
ws
});
});
}
common_peg_parser common_peg_parser_builder::python_value() {
return rule("python-value", [this]() {
return choice({ python_dict(), python_array(), python_string(), python_number(), python_bool(), python_null() });
return choice({
python_dict(),
python_array(),
python_string(),
python_number(),
python_bool(),
python_null()
});
});
}
@@ -1535,8 +1518,7 @@ static std::unordered_set<std::string> collect_reachable_rules(
std::is_same_v<T, common_peg_chars_parser> ||
std::is_same_v<T, common_peg_space_parser> ||
std::is_same_v<T, common_peg_any_parser> ||
std::is_same_v<T, common_peg_json_string_parser> ||
std::is_same_v<T, common_peg_python_dict_string_parser>) {
std::is_same_v<T, common_peg_string_parser>) {
// These parsers do not have any children
} else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
for (auto child : p.children) {
@@ -1672,10 +1654,9 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
return result + "{" + std::to_string(p.min_count) + "}";
}
return result + "{" + std::to_string(p.min_count) + "," + std::to_string(p.max_count) + "}";
} else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
return R"(( [^"\\] | "\\" ( ["\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)";
} else if constexpr (std::is_same_v<T, common_peg_python_dict_string_parser>) {
return R"(( [^"\\] | "\\" ( ["\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)";
} else if constexpr (std::is_same_v<T, common_peg_string_parser>) {
const std::string delim(1, p.delimiter);
return R"(( [^)" + delim + R"(\\] | "\\" ( [)" + delim + R"(\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)";
} else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
if (p.delimiters.empty()) {
return ".*";
@@ -1805,10 +1786,8 @@ static nlohmann::json serialize_parser_variant(const common_peg_parser_variant &
{"min_count", p.min_count},
{"max_count", p.max_count}
};
} else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
return json{{"type", "json_string"}};
} else if constexpr (std::is_same_v<T, common_peg_python_dict_string_parser>) {
return json{{ "type", "python_dict_string" }};
} else if constexpr (std::is_same_v<T, common_peg_string_parser>) {
return json{{"type", "string"}, {"delimiter", std::string(1, p.delimiter)}};
} else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
return json{{"type", "until"}, {"delimiters", p.delimiters}};
} else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
@@ -1935,11 +1914,15 @@ static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json
}
return parser;
}
if (type == "json_string") {
return common_peg_json_string_parser{};
}
if (type == "python_dict_string") {
return common_peg_python_dict_string_parser{};
if (type == "string") {
if (!j.contains("delimiter")) {
throw std::runtime_error("string parser missing delimiter field.");
}
std::string delimiter = j["delimiter"];
if (delimiter.empty()) {
throw std::runtime_error("string parser delimiter is empty.");
}
return common_peg_string_parser{delimiter[0]};
}
if (type == "until") {
if (!j.contains("delimiters") || !j["delimiters"].is_array()) {

View File

@@ -139,22 +139,43 @@ struct common_peg_parse_result {
bool success() const { return type == COMMON_PEG_PARSE_RESULT_SUCCESS; }
};
enum common_peg_parse_flags {
COMMON_PEG_PARSE_FLAG_NONE = 0,
COMMON_PEG_PARSE_FLAG_LENIENT = 1 << 0,
COMMON_PEG_PARSE_FLAG_DEBUG = 1 << 1,
};
inline common_peg_parse_flags operator|(common_peg_parse_flags a, common_peg_parse_flags b) {
return static_cast<common_peg_parse_flags>(int(a) | int(b));
}
inline common_peg_parse_flags & operator|=(common_peg_parse_flags & a, common_peg_parse_flags b) {
return a = a | b;
}
inline common_peg_parse_flags operator&(common_peg_parse_flags a, common_peg_parse_flags b) {
return static_cast<common_peg_parse_flags>(int(a) & int(b));
}
inline common_peg_parse_flags operator~(common_peg_parse_flags a) {
return static_cast<common_peg_parse_flags>(~int(a));
}
struct common_peg_parse_context {
std::string input;
bool is_partial;
bool debug = false; // Enable debug output for parser tracing
common_peg_parse_flags flags;
common_peg_ast_arena ast;
int parse_depth;
common_peg_parse_context()
: is_partial(false), parse_depth(0) {}
common_peg_parse_context(common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE)
: flags(flags), parse_depth(0) {}
common_peg_parse_context(const std::string & input)
: input(input), is_partial(false), parse_depth(0) {}
common_peg_parse_context(const std::string & input, common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE)
: input(input), flags(flags), parse_depth(0) {}
common_peg_parse_context(const std::string & input, bool is_partial)
: input(input), is_partial(is_partial), parse_depth(0) {}
bool is_lenient() const { return flags & COMMON_PEG_PARSE_FLAG_LENIENT; }
bool is_debug() const { return flags & COMMON_PEG_PARSE_FLAG_DEBUG; }
};
class common_peg_arena;
@@ -210,8 +231,9 @@ struct common_peg_chars_parser {
int max_count; // -1 for unbounded
};
struct common_peg_json_string_parser {};
struct common_peg_python_dict_string_parser {};
struct common_peg_string_parser {
char delimiter;
};
struct common_peg_until_parser {
std::vector<std::string> delimiters;
@@ -259,8 +281,7 @@ using common_peg_parser_variant = std::variant<
common_peg_any_parser,
common_peg_space_parser,
common_peg_chars_parser,
common_peg_json_string_parser,
common_peg_python_dict_string_parser,
common_peg_string_parser,
common_peg_until_parser,
common_peg_schema_parser,
common_peg_rule_parser,
@@ -319,10 +340,6 @@ class common_peg_parser_builder {
common_peg_parser wrap(common_peg_parser_id id) { return common_peg_parser(id, *this); }
common_peg_parser add(const common_peg_parser_variant & p) { return wrap(arena_.add_parser(p)); }
// Generic helpers for building object/array structures with configurable string/value parsers.
common_peg_parser generic_object(const std::string & name, const common_peg_parser & string_parser, const common_peg_parser & value_parser);
common_peg_parser generic_array(const std::string & name, const common_peg_parser & value_parser);
public:
common_peg_parser_builder();
@@ -423,13 +440,10 @@ class common_peg_parser_builder {
common_peg_parser single_quoted_string();
// Matches a string that accepts both double-quoted and single-quoted styles.
common_peg_parser flexible_string();
common_peg_parser quoted_string();
// Matches double-quoted string content without the surrounding quotes.
common_peg_parser json_string_content();
// Matches single-quoted string content without the surrounding quotes.
common_peg_parser single_quoted_string_content();
// Matches string content without the surrounding delimiter.
common_peg_parser string_content(char delimiter);
// Creates a complete JSON parser supporting objects, arrays, strings, numbers, booleans, and null.
// value -> object | array | string | number | true | false | null

219
common/reasoning-budget.cpp Normal file
View File

@@ -0,0 +1,219 @@
#include "reasoning-budget.h"
#include "common.h"
#include "unicode.h"
#include "log.h"
#include <cmath>
#include <cstdint>
#include <string>
#include <vector>
struct token_matcher {
std::vector<llama_token> tokens;
size_t pos = 0;
bool advance(llama_token token) {
if (tokens.empty()) {
return false;
}
if (token == tokens[pos]) {
pos++;
if (pos >= tokens.size()) {
pos = 0;
return true;
}
} else {
pos = 0;
if (token == tokens[0]) {
pos = 1;
}
}
return false;
}
void reset() { pos = 0; }
};
// Internal state of the reasoning-budget sampler (see common_reasoning_budget_init).
struct common_reasoning_budget_ctx {
    const llama_vocab * vocab; // used for UTF-8 boundary detection; may be nullptr

    token_matcher start_matcher;            // detects the sequence that opens a reasoning block
    token_matcher end_matcher;              // detects the sequence that closes it naturally
    std::vector<llama_token> forced_tokens; // sequence forced token-by-token once the budget expires

    int32_t budget;    // maximum tokens in reasoning block
    int32_t remaining; // tokens remaining in budget

    common_reasoning_budget_state state; // current state of the budget state machine

    // for forcing
    size_t force_pos; // next position in forced_tokens to force
};
// llama_sampler interface: human-readable name of this sampler.
static const char * common_reasoning_budget_name(const struct llama_sampler * /*smpl*/) {
    static constexpr const char * k_name = "reasoning-budget";
    return k_name;
}
// llama_sampler interface: observe the token that was actually sampled and
// advance the budget state machine.
//
// Transitions driven here:
//   IDLE         -> COUNTING (or FORCING if budget <= 0) when the start sequence completes
//   COUNTING     -> DONE when the end sequence occurs naturally
//   COUNTING     -> FORCING (or WAITING_UTF8) when the budget runs out
//   WAITING_UTF8 -> FORCING once the output ends on a complete UTF-8 sequence
static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_token token) {
    auto * ctx = (common_reasoning_budget_ctx *) smpl->ctx;

    switch (ctx->state) {
        case REASONING_BUDGET_IDLE:
            {
                // Watch for the start sequence; activate counting when it completes.
                if (ctx->start_matcher.advance(token)) {
                    ctx->state = REASONING_BUDGET_COUNTING;
                    ctx->remaining = ctx->budget;
                    LOG_INF("reasoning-budget: activated, budget=%d tokens\n", ctx->budget);
                    // A non-positive budget allows no reasoning tokens at all —
                    // begin forcing the end sequence immediately.
                    if (ctx->remaining <= 0) {
                        ctx->state = REASONING_BUDGET_FORCING;
                        ctx->force_pos = 0;
                        LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
                    }
                }
                break;
            }
        case REASONING_BUDGET_COUNTING:
        case REASONING_BUDGET_WAITING_UTF8:
            {
                // A naturally generated end sequence deactivates the sampler for good.
                if (ctx->end_matcher.advance(token)) {
                    ctx->state = REASONING_BUDGET_DONE;
                    LOG_INF("reasoning-budget: deactivated (natural end)\n");
                    break;
                }

                // Check whether this token's text ends on a complete UTF-8
                // sequence; without a vocab we cannot detokenize, so assume yes.
                bool utf8_complete = true;
                if (ctx->vocab != nullptr) {
                    const std::string piece = common_token_to_piece(ctx->vocab, token, false);
                    utf8_complete = common_utf8_is_complete(piece);
                }

                if (ctx->state == REASONING_BUDGET_WAITING_UTF8) {
                    // Budget already exhausted — only waiting for the output to
                    // reach a UTF-8 boundary before forcing the end sequence.
                    if (utf8_complete) {
                        ctx->state = REASONING_BUDGET_FORCING;
                        ctx->force_pos = 0;
                        ctx->end_matcher.reset();
                        LOG_INF("reasoning-budget: UTF-8 complete, now forcing end sequence\n");
                    }
                } else if (ctx->state == REASONING_BUDGET_COUNTING) {
                    ctx->remaining--;
                    if (ctx->remaining <= 0) {
                        if (utf8_complete) {
                            ctx->state = REASONING_BUDGET_FORCING;
                            ctx->force_pos = 0;
                            ctx->end_matcher.reset();
                            LOG_INF("reasoning-budget: budget exhausted, forcing end sequence\n");
                        } else {
                            // Don't cut the stream in the middle of a multi-byte character.
                            ctx->state = REASONING_BUDGET_WAITING_UTF8;
                            ctx->end_matcher.reset();
                            LOG_INF("reasoning-budget: budget exhausted, waiting for UTF-8 completion\n");
                        }
                    }
                }
                break;
            }
        case REASONING_BUDGET_FORCING:
            // force_pos is advanced in apply(), not here.
            // This ensures the first forced token isn't skipped when the sampler
            // is initialized directly in FORCING state (e.g. COUNTING + budget=0)
            break;
        case REASONING_BUDGET_DONE:
            break;
    }
}
static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (common_reasoning_budget_ctx *) smpl->ctx;
if (ctx->state != REASONING_BUDGET_FORCING) {
// passthrough — don't modify logits
return;
}
if (ctx->force_pos >= ctx->forced_tokens.size()) {
return;
}
const llama_token forced = ctx->forced_tokens[ctx->force_pos];
// set all logits to -inf except the forced token
for (size_t i = 0; i < cur_p->size; i++) {
if (cur_p->data[i].id != forced) {
cur_p->data[i].logit = -INFINITY;
}
}
// advance to next forced token (done here rather than in accept so that
// the first forced token isn't skipped when starting in FORCING state)
ctx->force_pos++;
if (ctx->force_pos >= ctx->forced_tokens.size()) {
ctx->state = REASONING_BUDGET_DONE;
LOG_INF("reasoning-budget: forced sequence complete, done\n");
}
}
// llama_sampler interface: return the sampler to a pristine IDLE state with
// the full budget available and all match/force progress cleared.
static void common_reasoning_budget_reset(struct llama_sampler * smpl) {
    auto * ctx = (common_reasoning_budget_ctx *) smpl->ctx;

    ctx->start_matcher.reset();
    ctx->end_matcher.reset();

    ctx->state     = REASONING_BUDGET_IDLE;
    ctx->remaining = ctx->budget;
    ctx->force_pos = 0;
}
static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx;
return common_reasoning_budget_init(
ctx->vocab,
ctx->start_matcher.tokens,
ctx->end_matcher.tokens,
ctx->forced_tokens,
ctx->budget,
ctx->state);
}
static void common_reasoning_budget_free(struct llama_sampler * smpl) {
delete (common_reasoning_budget_ctx *) smpl->ctx;
}
// v-table wiring the reasoning-budget callbacks into the llama_sampler
// interface; the backend (device-side) sampling hooks are not implemented.
static struct llama_sampler_i common_reasoning_budget_i = {
    /* .name = */ common_reasoning_budget_name,
    /* .accept = */ common_reasoning_budget_accept,
    /* .apply = */ common_reasoning_budget_apply,
    /* .reset = */ common_reasoning_budget_reset,
    /* .clone = */ common_reasoning_budget_clone,
    /* .free = */ common_reasoning_budget_free,
    /* .backend_init = */ nullptr,
    /* .backend_accept = */ nullptr,
    /* .backend_apply = */ nullptr,
    /* .backend_set_input = */ nullptr,
};
struct llama_sampler * common_reasoning_budget_init(
const struct llama_vocab * vocab,
const std::vector<llama_token> & start_tokens,
const std::vector<llama_token> & end_tokens,
const std::vector<llama_token> & forced_tokens,
int32_t budget,
common_reasoning_budget_state initial_state) {
// promote COUNTING with budget <= 0 to FORCING
if (initial_state == REASONING_BUDGET_COUNTING && budget <= 0) {
initial_state = REASONING_BUDGET_FORCING;
}
return llama_sampler_init(
/* .iface = */ &common_reasoning_budget_i,
/* .ctx = */ new common_reasoning_budget_ctx {
/* .vocab = */ vocab,
/* .start_matcher = */ { start_tokens, 0 },
/* .end_matcher = */ { end_tokens, 0 },
/* .forced_tokens = */ forced_tokens,
/* .budget = */ budget,
/* .remaining = */ budget,
/* .state = */ initial_state,
/* .force_pos = */ 0,
}
);
}

41
common/reasoning-budget.h Normal file
View File

@@ -0,0 +1,41 @@
#pragma once
#include "llama.h"
#include <cstdint>
#include <vector>
// States of the reasoning-budget sampler's state machine
// (see common_reasoning_budget_init below for the transition diagram).
enum common_reasoning_budget_state {
    REASONING_BUDGET_IDLE,         // waiting for start sequence
    REASONING_BUDGET_COUNTING,     // counting down tokens
    REASONING_BUDGET_FORCING,      // forcing budget message + end sequence
    REASONING_BUDGET_WAITING_UTF8, // budget exhausted, waiting for UTF-8 completion
    REASONING_BUDGET_DONE,         // passthrough forever
};
// Creates a reasoning budget sampler that limits token generation inside a
// reasoning block (e.g. between <think> and </think>).
//
// State machine: IDLE -> COUNTING -> [WAITING_UTF8] -> FORCING -> DONE
//                (WAITING_UTF8 is entered only when the budget expires in the
//                middle of a multi-byte UTF-8 sequence)
// IDLE: passthrough, watching for start_tokens sequence
// COUNTING: counting down remaining tokens, watching for natural end_tokens
// WAITING_UTF8: budget exhausted, allowing tokens to complete a UTF-8 sequence
// FORCING: forces forced_tokens token-by-token (all other logits -> -inf)
// DONE: passthrough forever
//
// Parameters:
// vocab - vocabulary (used for UTF-8 boundary detection; can be nullptr)
// start_tokens - token sequence that activates counting
// end_tokens - token sequence for natural deactivation
// forced_tokens - token sequence forced when budget expires
// budget - max tokens allowed in the reasoning block
// initial_state - initial state of the sampler (e.g. IDLE or COUNTING)
// note: COUNTING with budget <= 0 is promoted to FORCING
//
struct llama_sampler * common_reasoning_budget_init(
const struct llama_vocab * vocab,
const std::vector<llama_token> & start_tokens,
const std::vector<llama_token> & end_tokens,
const std::vector<llama_token> & forced_tokens,
int32_t budget,
common_reasoning_budget_state initial_state);

View File

@@ -2,6 +2,7 @@
#include "common.h"
#include "log.h"
#include "reasoning-budget.h"
#include <algorithm>
#include <cmath>
@@ -250,6 +251,17 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
}
}
// reasoning budget sampler — added first so it can force tokens before other samplers
if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) {
samplers.push_back(common_reasoning_budget_init(
vocab,
params.reasoning_budget_start,
params.reasoning_budget_end,
params.reasoning_budget_forced,
params.reasoning_budget_tokens,
params.reasoning_budget_activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE));
}
if (params.has_logit_bias()) {
samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
}

View File

@@ -1,8 +1,10 @@
#include "unicode.h"
#include <algorithm>
#include <cassert>
#include <stdexcept>
#include <vector>
#include <string>
#include <vector>
// implementation adopted from src/unicode.cpp
@@ -67,6 +69,20 @@ utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t off
return utf8_parse_result(utf8_parse_result::INVALID);
}
// Returns true when `s` ends on a complete UTF-8 sequence, i.e. the final
// character is not truncated. An empty string counts as complete; a tail made
// entirely of continuation bytes (no lead byte within the last 4 bytes) is
// reported as incomplete.
bool common_utf8_is_complete(const std::string & s) {
    if (s.empty()) {
        return true;
    }

    // scan backwards over at most 4 bytes (the maximum UTF-8 sequence length)
    const size_t tail = std::min<size_t>(4, s.size());
    for (size_t i = 1; i <= tail; ++i) {
        const unsigned char c = static_cast<unsigned char>(s[s.size() - i]);
        if ((c & 0xC0) == 0x80) {
            continue; // continuation byte — keep looking for the lead byte
        }
        // found the lead byte; derive how many bytes its sequence requires
        size_t expected = 1;
        if (c >= 0xF0) {
            expected = 4;
        } else if (c >= 0xE0) {
            expected = 3;
        } else if (c >= 0xC0) {
            expected = 2;
        }
        // complete iff all required bytes are already present
        return i >= expected;
    }

    // no lead byte found in the scanned tail — treat as incomplete
    return false;
}
std::string common_unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
std::string result;
for (size_t i = 0; i < cps.size(); ++i) {

View File

@@ -20,6 +20,9 @@ struct utf8_parse_result {
// Returns 0 for invalid first bytes
size_t common_utf8_sequence_length(unsigned char first_byte);
// Check if a string ends with a complete UTF-8 sequence.
bool common_utf8_is_complete(const std::string & s);
// Parse a single UTF-8 codepoint from input
utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t offset);

View File

@@ -144,6 +144,7 @@ class ModelBase:
self.metadata_override = metadata_override
self.model_name = model_name
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
self._is_nvfp4 = False
# Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
@@ -271,6 +272,9 @@ class ModelBase:
return tensors
def dequant_model(self):
if self._is_nvfp4:
return # NVFP4 weights are repacked in _generate_nvfp4_tensors
tensors_to_remove: list[str] = []
new_tensors: dict[str, Callable[[], Tensor]] = {}
@@ -516,6 +520,13 @@ class ModelBase:
raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# skip NVFP4 auxiliary tensors (handled in _generate_nvfp4_tensors)
if self._is_nvfp4:
if name.endswith((".weight_scale", ".weight_scale_2", ".input_scale", ".k_scale", ".v_scale")):
return []
if name.endswith(".weight") and name.replace(".weight", ".weight_scale") in self.model_tensors:
return []
new_name = self.map_tensor_name(name)
# Handle gate/up expert tensor fusion if enabled
@@ -551,9 +562,135 @@ class ModelBase:
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
return ()
@staticmethod
def _nvfp4_pack(weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]:
"""Repack NVFP4 ModelOpt tensors into ggml super-block layout.
Preserves original E4M3 scale bits as UE4M3 (strip sign bit).
The per-tensor scale2 factor is stored as a separate tensor and applied at inference time via ggml_mul().
Returns (raw_data, logical_shape)."""
out_features = weight.shape[0]
n_blocks = scale.shape[1]
# Unpack ModelOpt nibble-packed weights
w = weight.reshape(out_features, n_blocks, 8)
vals = torch.stack([w & 0x0F, w >> 4], dim=-1).reshape(out_features, n_blocks, 16)
# Preserve original E4M3 scale bits as UE4M3 (strip sign bit)
d_ue = scale.view(torch.uint8).numpy().reshape(out_features, n_blocks) & 0x7F
qs = (vals[:, :, :8] | (vals[:, :, 8:] << 4)).to(torch.uint8).numpy()
# Pack into super-blocks: [4 UE4M3 scales, 32 qs bytes] = 36 bytes per 64 elements
n_super = n_blocks // 4
d_grouped = d_ue.reshape(out_features, n_super, 4)
qs_grouped = qs.reshape(out_features, n_super, 4, 8).reshape(out_features, n_super, 32)
raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36)
return raw, [out_features, n_super * 64]
@staticmethod
def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
def _repack_nvfp4(self, new_name: str, weight: Tensor, scale: Tensor, scale2: Tensor):
raw, shape = self._nvfp4_pack(weight, scale)
logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
# Emit per-tensor scale2 as a separate F32 tensor when non-trivial
if not self._nvfp4_scale2_is_trivial(scale2):
scale2_f32 = scale2.float().numpy().flatten()
scale_name = new_name.replace(".weight", ".scale")
logger.info(f" + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
self.gguf_writer.add_tensor(scale_name, scale2_f32)
def _generate_nvfp4_tensors(self):
# Per-layer expert merging to avoid holding all experts in memory
expert_blocks: dict[tuple[int, str], list[tuple[int, np.ndarray]]] = {}
expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
expert_shapes: dict[tuple[int, str], list[int]] = {}
n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
for name in list(self.model_tensors.keys()):
if not name.endswith(".weight"):
continue
scale_name = name.replace(".weight", ".weight_scale")
scale2_name = name.replace(".weight", ".weight_scale_2")
if scale_name not in self.model_tensors:
continue
# Force eager materialization of lazy tensors
weight = LazyTorchTensor.to_eager(self.model_tensors[name]())
scale = LazyTorchTensor.to_eager(self.model_tensors[scale_name]())
scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))())
# Check if this is a per-expert tensor
m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name)
if m:
expert_id = int(m.group(1))
proj_type = m.group(2)
bid_m = re.search(r'\.layers\.(\d+)\.', name)
bid = int(bid_m.group(1)) if bid_m else 0
key = (bid, proj_type)
raw, shape = self._nvfp4_pack(weight, scale)
if key not in expert_blocks:
expert_blocks[key] = []
expert_scales[key] = []
expert_shapes[key] = shape
expert_blocks[key].append((expert_id, raw.copy()))
# Collect per-expert scale2 (scalar per expert)
expert_scales[key].append((expert_id, float(scale2.float().sum())))
# Flush when all experts for this (layer, proj) are collected
if n_experts > 0 and len(expert_blocks[key]) >= n_experts:
self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_shapes, bid, proj_type)
else:
new_name = self.map_tensor_name(name)
self._repack_nvfp4(new_name, weight, scale, scale2)
# Flush any remaining experts (fallback if n_experts was unknown)
for (bid, proj_type) in list(expert_blocks.keys()):
self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_shapes, bid, proj_type)
def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_shapes, bid, proj_type):
experts = expert_blocks.pop(key)
scales = expert_scales.pop(key)
shape = expert_shapes.pop(key)
experts.sort(key=lambda x: x[0])
merged = np.stack([e[1] for e in experts], axis=0)
merged_name = f"model.layers.{bid}.mlp.experts.{proj_type}.weight"
new_name = self.map_tensor_name(merged_name)
logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4")
self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
# Emit per-expert scale2 tensor if any expert has non-trivial scale2
scales.sort(key=lambda x: x[0])
scale_vals = np.array([s[1] for s in scales], dtype=np.float32)
if not np.allclose(scale_vals, 1.0, atol=1e-6):
scale_name = new_name.replace(".weight", ".scale")
logger.info(f" + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
self.gguf_writer.add_tensor(scale_name, scale_vals)
del experts, merged
def prepare_tensors(self):
# detect NVFP4 quantization (ModelOpt format)
quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
quant_config_file = self.dir_model / "hf_quant_config.json"
if not quant_algo and quant_config_file.is_file():
with open(quant_config_file, "r", encoding="utf-8") as f:
quant_algo = (json.load(f).get("quantization") or {}).get("quant_algo")
self._is_nvfp4 = quant_algo == "NVFP4"
self.dequant_model()
# NVFP4 weights are repacked and written directly to gguf_writer
if self._is_nvfp4:
self._generate_nvfp4_tensors()
# Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
if self.tensor_map.mapping:
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
@@ -2057,6 +2194,8 @@ class GPTNeoXModel(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
assert n_head is not None
assert n_embed is not None
if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
# Map bloom-style qkv_linear to gpt-style qkv_linear
@@ -2094,6 +2233,8 @@ class BloomModel(TextModel):
def set_gguf_parameters(self):
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
assert n_head is not None
assert n_embed is not None
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
self.gguf_writer.add_embedding_length(n_embed)
self.gguf_writer.add_feed_forward_length(4 * n_embed)
@@ -2106,6 +2247,8 @@ class BloomModel(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
assert n_head is not None
assert n_embed is not None
name = re.sub(r'transformer\.', '', name)
@@ -3716,6 +3859,7 @@ class LLaDAModel(TextModel):
if (rope_dim := hparams.get("head_dim")) is None:
n_heads = hparams.get("num_attention_heads", hparams.get("n_heads"))
assert n_heads is not None
rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads
self.gguf_writer.add_rope_dimension_count(rope_dim)
@@ -3747,6 +3891,7 @@ class LLaDAModel(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads"))
assert n_head is not None
n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads"))
if self.undo_permute:
@@ -4303,6 +4448,14 @@ class Qwen2MoeModel(TextModel):
# process the experts separately
name = name.replace("language_model.", "") # InternVL
# NVFP4 expert weights are handled in _generate_nvfp4_tensors
if self._is_nvfp4 and "experts" in name:
if name.endswith((".weight", ".weight_scale", ".weight_scale_2", ".input_scale")):
if name.endswith(".weight") and name.replace(".weight", ".weight_scale") in self.model_tensors:
return
if not name.endswith(".weight"):
return
# handle aggregated expert tensors
# GGUF stores dimensions reversed from PyTorch, so:
# PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A}
@@ -4390,15 +4543,31 @@ class Qwen3Model(Qwen2Model):
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
self.origin_hf_arch = hparams.get('architectures', [None])[0]
# a bit hacky, but currently the only way to detect if this is a rerank model
# ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
if self._is_qwen3_reranker():
self._find_rerank_config()
def _is_qwen3_reranker(self) -> bool:
readme_path = self.dir_model / "README.md"
readme_text = ""
if readme_path.exists():
with readme_path.open("r", encoding="utf-8") as f:
readme_text = f.read()
if "# Qwen3-Reranker" in readme_text:
self._find_rerank_config()
name_hints = [
str(self.dir_model.name),
str(self.hparams.get("_name_or_path", "")),
str(self.hparams.get("model_type", "")),
str(self.origin_hf_arch or ""),
]
name_hints = [hint.lower() for hint in name_hints if hint]
if "# qwen3-reranker" in readme_text.lower() or "# qwen3-vl-reranker" in readme_text.lower():
return True
if any("qwen3-reranker" in hint or "qwen3-vl-reranker" in hint for hint in name_hints):
return True
return "sequenceclassification" in (self.origin_hf_arch or "").lower()
def set_vocab(self):
# deal with intern-s1-mini
@@ -4901,7 +5070,7 @@ class Phi2Model(TextModel):
self.gguf_writer.add_add_bos_token(False)
@ModelBase.register("Phi3ForCausalLM")
@ModelBase.register("Phi3ForCausalLM", "Phi4ForCausalLMV")
class Phi3MiniModel(TextModel):
model_arch = gguf.MODEL_ARCH.PHI3
@@ -5076,6 +5245,129 @@ class Phi3MiniModel(TextModel):
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith(("model.vision_tower.", "vision_tower.", "model.mm_projector.", "mm_projector.")):
return
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Phi4ForCausalLMV")
class Phi4VisionMmprojModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
self.vision_total_layers = int(self.find_vparam(self.n_block_keys))
if self.vision_total_layers < 2:
raise ValueError(
f"Phi-4 vision mmproj conversion requires at least 2 vision layers, got {self.vision_total_layers}"
)
# Phi-4 uses SigLIP2 hidden_states[-2], so export one fewer encoder block and
# drop post-layernorm/head weights. This makes the GGUF runtime output match
# the feature map consumed by the patched siglip.cpp Phi-4 projector path.
self.vision_export_layers = self.vision_total_layers - 1
self.vision_last_layer_idx = self.vision_total_layers - 1
for key in self.n_block_keys:
if key in self.hparams_vision:
self.hparams_vision[key] = self.vision_export_layers
break
self.block_count = self.vision_export_layers
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
patch_size = self.preprocessor_config.get("patch_size")
if patch_size is None:
raise KeyError("Phi-4 vision mmproj conversion requires patch_size in preprocessor_config.json")
self.hparams_vision["patch_size"] = patch_size
pos_emb_name = next(
(
name for name in self.model_tensors
if name.endswith("vision_model.embeddings.position_embedding.weight")
),
None,
)
if pos_emb_name is None:
raise KeyError("Phi-4 vision mmproj conversion could not find position_embedding.weight")
pos_emb_shape = self.model_tensors[pos_emb_name]().shape
base_grid_tokens = int(pos_emb_shape[0])
grid_side = math.isqrt(base_grid_tokens)
if grid_side * grid_side != base_grid_tokens:
raise ValueError(f"Unexpected Phi-4 position embedding shape: {tuple(pos_emb_shape)}")
self.hparams_vision["image_size"] = grid_side * patch_size
min_num_patches = self.preprocessor_config.get("min_num_patches", self.global_config.get("min_num_patches"))
max_num_patches = self.preprocessor_config.get("max_num_patches", self.global_config.get("max_num_patches"))
if min_num_patches is None or max_num_patches is None:
raise KeyError("Phi-4 vision mmproj conversion requires min_num_patches and max_num_patches")
self.min_pixels = int(min_num_patches) * patch_size * patch_size
self.max_pixels = int(max_num_patches) * patch_size * patch_size
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_vision is not None
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PHI4)
self.gguf_writer.add_vision_min_pixels(self.min_pixels)
self.gguf_writer.add_vision_max_pixels(self.max_pixels)
self.gguf_writer.add_vision_use_gelu(True)
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith(("model.vision_tower.vision_tower.", "vision_tower.")):
if ".vision_model.head." in name:
return
new_name = name.replace("model.vision_tower.vision_tower.", "vision_tower.")
if ".vision_model.post_layernorm." in new_name:
return
if bid is not None and bid == self.vision_last_layer_idx:
return
if new_name.endswith("vision_model.embeddings.patch_embedding.weight"):
assert self.hparams_vision is not None
if data_torch.ndim != 2:
raise ValueError(f"Unexpected Phi-4 patch embedding shape: {tuple(data_torch.shape)}")
patch_area = self.hparams_vision["patch_size"] ** 2
in_features = data_torch.shape[1]
if in_features % patch_area != 0:
raise ValueError(
f"Phi-4 patch embedding input dim {in_features} is not divisible by patch area {patch_area}"
)
num_channels = in_features // patch_area
patch_size = self.hparams_vision["patch_size"]
data_torch = data_torch.view(data_torch.shape[0], patch_size, patch_size, num_channels)
data_torch = data_torch.permute(0, 3, 1, 2)
yield from super().modify_tensors(data_torch, new_name, bid)
return
if name.startswith(("model.mm_projector.", "mm_projector.")):
local_name = name
local_name = local_name.replace("model.mm_projector.", "")
local_name = local_name.replace("mm_projector.", "")
if not (local_name.startswith("0.") or local_name.startswith("2.")):
return
suffix = ".bias" if local_name.endswith(".bias") else ".weight"
mm_idx = int(local_name.split(".", maxsplit=1)[0])
yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_idx, suffix=suffix), data_torch)
return
return
@ModelBase.register("PhiMoEForCausalLM")
class PhiMoeModel(Phi3MiniModel):
@@ -9201,7 +9493,9 @@ class ChatGLMModel(TextModel):
def set_gguf_parameters(self):
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
assert n_embed is not None
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
assert n_head is not None
n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
self.gguf_writer.add_embedding_length(n_embed)
@@ -9727,20 +10021,35 @@ class NemotronHModel(GraniteHybridModel):
# M: Mamba2, *: Attention, -: MLP
# MoE:
# M: Mamba2, *: Attention, E: Expert
hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
self._ssm_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "M"]
self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == ("E" if self.is_moe else "-")]
pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type")
if pattern is None:
self._ssm_layers = []
self._mlp_layers = []
elif isinstance(pattern, str):
self._ssm_layers = [i for i, val in enumerate(pattern) if val == "M"]
self._mlp_layers = [i for i, val in enumerate(pattern) if val == ("E" if self.is_moe else "-")]
else:
self._ssm_layers = [i for i, val in enumerate(pattern) if val == "mamba"]
self._mlp_layers = [i for i, val in enumerate(pattern) if val == "moe"]
def get_attn_layers(self):
hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
assert len(hybrid_override_pattern) == self.block_count, "Mismatch between hybrid override and num_hidden_layers!"
return [i for i, val in enumerate(hybrid_override_pattern) if val == "*"]
pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type")
if pattern is None:
return []
assert len(pattern) == self.block_count, f"Mismatch between pattern ({len(pattern)}) and block_count ({self.block_count})!"
if isinstance(pattern, str):
return [i for i, val in enumerate(pattern) if val == "*"]
return [i for i, val in enumerate(pattern) if val == "attention"]
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_key_length(self.head_dim)
self.gguf_writer.add_value_length(self.head_dim)
head_dim = self.head_dim
if head_dim is None:
raise ValueError("Could not find the attention head dim in config")
self.gguf_writer.add_key_length(head_dim)
self.gguf_writer.add_value_length(head_dim)
# Set feed_forward_length
# NOTE: This will trigger an override warning. This is preferable to
@@ -9768,6 +10077,9 @@ class NemotronHModel(GraniteHybridModel):
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
if (latent_size := self.hparams.get("moe_latent_size")) is not None:
self.gguf_writer.add_moe_latent_size(latent_size)
def set_vocab(self):
super().set_vocab()
@@ -9787,6 +10099,13 @@ class NemotronHModel(GraniteHybridModel):
name = name[len("language_model."):]
if self.is_moe and bid is not None:
# Skip Multi-Token Prediction (MTP) tensors. These are used
# for speculative decoding but we don't include them in this model
# conversion. See https://github.com/ggml-org/llama.cpp/pull/18886
if name.startswith("mtp."):
logger.info(f"gguf: Skipping MTP (Speculative) layer: {name}")
return
if name.endswith("mixer.gate.e_score_correction_bias"):
new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
yield from ModelBase.modify_tensors(self, data_torch, new_name, bid)

343
docs/backend/OPENVINO.md Normal file
View File

@@ -0,0 +1,343 @@
# OpenVINO Backend for llama.cpp
[OpenVINO](https://docs.openvino.ai/) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge.
This document describes the [OpenVINO backend for llama.cpp](../../src/ggml-openvino), which enables hardware-accelerated inference on **Intel® CPUs, GPUs, and NPUs** while remaining compatible with the existing **GGUF model ecosystem**. The backend translates GGML compute graphs into OpenVINO graphs and leverages graph compilation, kernel fusion, and device-specific optimizations to improve inference performance on supported Intel hardware.
The OpenVINO backend is implemented in `ggml/src/ggml-openvino` and provides a translation layer for core GGML operations. The OpenVINO backend replaces the standard GGML graph execution path with Intel's OpenVINO inference engine. This approach allows the same GGUF model file to run on Intel CPUs, Intel GPUs (integrated and discrete), and Intel NPUs without changes to the model or the rest of the llama.cpp stack. When a `ggml_cgraph` is dispatched to OpenVINO backend, it:
- Walks the GGML graph and identifies inputs, outputs, weights, and KV cache tensors.
- Translates the GGML operations into an `ov::Model` using OpenVINO's frontend API.
- Compiles and caches the model for the target device.
- Binds GGML tensor memory to OpenVINO inference tensors and runs inference.
## Supported Devices
OpenVINO backend supports the following hardware:
- Intel CPUs
- Intel GPUs (integrated and discrete)
- Intel NPUs
Although OpenVINO supports a wide range of [Intel hardware](https://docs.openvino.ai/2026/about-openvino/release-notes-openvino/system-requirements.html), the llama.cpp OpenVINO backend has been validated specifically on AI PCs such as the Intel® Core™ Ultra Series 1 and Series 2.
## Supported Model Precisions
- `FP16`
- `BF16` (on Intel Xeon)
- `Q8_0`
- `Q4_0`
- `Q4_1`
- `Q4_K`
- `Q4_K_M`
- `Q5_K` (converted to Q8_0_C at runtime)
- `Q6_K` (converted to Q8_0_C at runtime)
> [!NOTE]
> Accuracy validation and performance optimizations for quantized models are a work in progress.
## Quantization Support Details
### CPU and GPU
- **`Q4_0`, `Q4_1`, `Q4_K_M`, `Q6_K` models are supported**
- `Q5_K` and `Q6_K` tensors are converted to `Q8_0_C`
### NPU
- **Primary supported quantization scheme is `Q4_0`**
- `Q6_K` tensors are requantized to `Q4_0_128` in general. For embedding weights, `Q6_K` tensors are requantized to `Q8_0_C` except for the token embedding matrix which is dequantized to fp16
### Additional Notes
- Both `Q4_0` and `Q4_1` models use `Q6_K` for the token embedding tensor and the final matmul weight tensor (often the same tensor)
- `Q4_0` models may produce some `Q4_1` tensors if an imatrix is provided during quantization using `llama-quantize`
- `Q4_K_M` models may include both `Q6_K` and `Q5_K` tensors (observed in Phi-3)
## Validated Models
The following models have been validated for functionality on Intel® Core™ Ultra Series 1 and Series 2:
- [Llama-3.2-1B-Instruct-GGUF](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/)
- [Llama-3.1-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF)
- [microsoft/Phi-3-mini-4k-instruct-gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf)
- [Qwen/Qwen2.5-1.5B-Instruct-GGUF](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF)
- [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B-GGUF)
- [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-S-1B-sft-gguf)
- [tencent/Hunyuan-7B-Instruct](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF)
- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF)
- [bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF)
## Build Instructions
### Prerequisites
- Linux or Windows system with Intel hardware (CPU, GPU, or NPU)
- **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html).
- **Linux:**
- Git, CMake, and Ninja software tools are needed for building.
```bash
sudo apt-get update
sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
```
- OpenCL
```bash
sudo apt install ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
```
- **Windows:**
- Download and install [Microsoft Visual Studio 2022 Build Tools](https://aka.ms/vs/17/release/vs_BuildTools.exe). During installation, select the **"Desktop development with C++"** workload.
- Install required tools:
```powershell
# Windows PowerShell
winget install Git.Git
winget install GNU.Wget
winget install Ninja-build.Ninja
```
- Install **OpenCL** using **vcpkg**:
```powershell
# Windows PowerShell
cd C:\
git clone https://github.com/microsoft/vcpkg
cd vcpkg
.\bootstrap-vcpkg.bat
.\vcpkg install opencl
# Optional but recommended: Integrate vcpkg with Visual Studio / CMake:
.\vcpkg integrate install
```
### 1. Install OpenVINO Runtime
- Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-windows.html)
- **Linux:**
<details>
<summary>📦 Click to expand OpenVINO installation from an archive file on Ubuntu</summary>
<br>
```bash
wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh
chmod +x install-openvino-from-archive.sh
./install-openvino-from-archive.sh
```
Verify OpenVINO is initialized properly:
```bash
echo $OpenVINO_DIR
```
</details>
### 2. Build llama.cpp with OpenVINO Backend
Clone the OpenVINO-enabled llama.cpp fork and build it:
```bash
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```
- **Linux:**
```bash
source /opt/intel/openvino/setupvars.sh
cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON
cmake --build build/ReleaseOV --parallel
```
- **Windows:**
```cmd
# x64 Native Tools Command Prompt for VS 2022
"C:\Program Files (x86)\Intel\openvino_2026.0\setupvars.bat"
cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DLLAMA_CURL=OFF -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
cmake --build build\ReleaseOV --parallel
```
> [!NOTE]
> Use `x64 Native Tools Command Prompt` for Windows build. After building, you could use either `cmd` or `PowerShell` to run the OpenVINO backend.
### 3. Download Sample Model
Download models for testing:
```bash
# Linux
mkdir -p ~/models/
wget https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf \
-O ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
# Windows PowerShell
mkdir C:\models
Invoke-WebRequest -Uri https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -OutFile C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf
# Windows Command Line
mkdir C:\models
curl -L https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -o C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf
```
### 4. Run Inference with OpenVINO Backend
When using the OpenVINO backend, the first inference token may have slightly higher latency due to on-the-fly conversion to the OpenVINO graph. Subsequent tokens and runs will be faster.
```bash
# If device is unset or unavailable, defaults to CPU.
# If the system has multiple GPUs, use GPU.0 or GPU.1 to explicitly target a specific GPU.
# Linux
export GGML_OPENVINO_DEVICE=GPU
# To run llama-simple:
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
# To run in chat mode:
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
# Windows Command Line
set GGML_OPENVINO_DEVICE=GPU
# Windows PowerShell
$env:GGML_OPENVINO_DEVICE = "GPU"
# To run llama-simple
build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
# To run in chat mode:
build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
```
> [!NOTE]
> On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html) for more details.
### Docker Build
You can build and run llama.cpp with OpenVINO backend using Docker.
```bash
# Build the base runtime image with compiled shared libraries and minimal dependencies.
docker build -t llama-openvino:base -f .devops/openvino.Dockerfile .
# Build the complete image with all binaries, Python tools, gguf-py library, and model conversion utilities.
docker build --target=full -t llama-openvino:full -f .devops/openvino.Dockerfile .
# Build a minimal CLI-only image containing just the llama-cli executable.
docker build --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .
# Builds a server-only image with llama-server executable, health check endpoint, and REST API support.
docker build --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile .
# If you are behind a proxy:
docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .
```
Run llama.cpp with OpenVINO backend Docker container.
Save sample models in `~/models` as [shown above](#3-download-sample-model). It will be mounted to the container in the examples below.
```bash
# Run Docker container
docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
# With Intel GPU access (iGPU or dGPU)
docker run --rm -it -v ~/models:/models \
--device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
# With Intel NPU access
docker run --rm -it --env GGML_OPENVINO_DEVICE=NPU -v ~/models:/models \
--device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
```
Run Llama.cpp Server with OpenVINO Backend:
```bash
# Run the Server Docker container
docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
# In a NEW terminal, test the server with curl
# If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost
export NO_PROXY=localhost,127.0.0.1
# Test health endpoint
curl -f http://localhost:8080/health
# Test with a simple prompt
curl -X POST "http://localhost:8080/v1/chat/completions" -H "Content-Type: application/json" \
-d '{"messages":[{"role":"user","content":"Write a poem about OpenVINO"}],"max_tokens":100}' | jq .
```
## Runtime Configuration
The OpenVINO backend can be configured using the following environment variables at runtime to control device selection, caching, debugging, and profiling behavior.
### Configuration Options
| Variable | Default | Description |
|-----------------------------------|------------|-------------------------------------------------------------------------------------------------------------|
| `GGML_OPENVINO_DEVICE` | `CPU` | Specify the target device (CPU, GPU, NPU). On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html). When set to **NPU**, static compilation mode is enabled for optimal performance. |
| `GGML_OPENVINO_CACHE_DIR` | `not set` | Directory for OpenVINO model caching (recommended: `/tmp/ov_cache`). Enables model caching when set. **Not supported on NPU devices.** |
| `GGML_OPENVINO_PREFILL_CHUNK_SIZE`| `256` | Token chunk size for **NPU** prefill. |
| `GGML_OPENVINO_STATEFUL_EXECUTION`| `0` | Enable stateful KV cache for better performance. Recommended on CPU and GPU. |
| `GGML_OPENVINO_PROFILING` | `0` | Enable execution-time profiling. |
| `GGML_OPENVINO_DUMP_CGRAPH` | `0` | Dump the GGML compute graph to `cgraph_ov.txt`. |
| `GGML_OPENVINO_DUMP_IR` | `0` | Serialize OpenVINO IR files with timestamps. |
| `GGML_OPENVINO_DEBUG_INPUT` | `0` | Enable input debugging and print input tensor info. |
| `GGML_OPENVINO_DEBUG_OUTPUT` | `0` | Enable output debugging and print output tensor info. |
| `GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS` | `0` | Print tensor address map once. |
> [!NOTE]
>`GGML_OPENVINO_STATEFUL_EXECUTION` is an **Experimental** feature to allow stateful execution for managing the KV cache internally inside the OpenVINO model, improving performance on CPUs and GPUs. Stateful execution is not effective on NPUs, and not all models currently support this feature. This feature is experimental and has been validated only with the llama-simple, llama-cli, llama-bench, and llama-run applications and is recommended to enable for the best performance. Other applications, such as llama-server and llama-perplexity, are not yet supported.
### Example Usage
#### GPU Inference with Profiling
```bash
# If the system has multiple GPUs, use GPU.0 or GPU.1 to explicitly target a specific GPU.
# Linux
export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
export GGML_OPENVINO_PROFILING=1
export GGML_OPENVINO_DEVICE=GPU
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
# Windows Command Line
set GGML_OPENVINO_CACHE_DIR=C:\tmp\ov_cache
set GGML_OPENVINO_PROFILING=1
set GGML_OPENVINO_DEVICE=GPU
# Windows PowerShell
$env:GGML_OPENVINO_CACHE_DIR = "C:\tmp\ov_cache"
$env:GGML_OPENVINO_PROFILING = "1"
$env:GGML_OPENVINO_DEVICE = "GPU"
build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
```
#### llama-bench
```bash
# -fa 1 is required when running llama-bench with the OpenVINO backend.
GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1
```
### NPU Notes
- Model caching is not yet supported
- Does not support llama-server -np > 1 (multiple parallel sequences)
- Only supports llama-perplexity -b 512 or smaller
## Llama.cpp Tools
The following tools work with the OpenVINO backend on CPU, GPU, NPU:
- llama-simple
- llama-run
- llama-cli
- llama-server
- llama-bench
- llama-perplexity
## Work in Progress
- Performance and memory optimizations
- Accuracy validation
- Broader quantization coverage
- Support for additional model architectures

View File

@@ -9,6 +9,7 @@
- [Linux](#linux)
- [Windows](#windows)
- [Environment Variable](#environment-variable)
- [Design Rule](#design-rule)
- [Known Issue](#known-issues)
- [Q&A](#qa)
- [TODO](#todo)
@@ -41,6 +42,9 @@ The following releases are verified and recommended:
## News
- 2026.03
- Support Flash-Attention: less memory usage, performance impact depends on LLM.
- 2026.02
- Remove support for Nvidia & AMD GPU, because the oneAPI plugin for Nvidia & AMD GPU is unavailable: download/installation channels are out of work. User can't build up the software for Nvidia & AMD GPU.
@@ -378,17 +382,27 @@ use 1 SYCL GPUs: [0] with Max compute units:512
## Windows
### I. Setup Environment
1. Install GPU driver
### Install GPU driver
Intel GPU drivers instructions guide and download page can be found here: [Get Intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
2. Install Visual Studio
### Option 1: download the binary package directly
Download the binary package for Windows from: https://github.com/ggml-org/llama.cpp/releases.
Extract the package to local folder, run the llama tools directly. Refer to [Run the inference](#iii-run-the-inference-1).
Note: the package includes the SYCL runtime and all dependent DLL files, so there is no need to install the oneAPI package or activate it.
### Option 2: build locally from the source code.
#### I. Setup environment
1. Install Visual Studio
If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/).
3. Install Intel® oneAPI Base toolkit
2. Install Intel® oneAPI Base toolkit
SYCL backend depends on:
- Intel® oneAPI DPC++/C++ compiler/running-time.
@@ -439,25 +453,25 @@ Output (example):
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
```
4. Install build tools
3. Install build tools
a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
### II. Build llama.cpp
#### II. Build llama.cpp
You can download the release package for Windows directly, which includes the binary files and the dependent oneAPI DLL files.
Choose one of following methods to build from source code.
#### 1. Script
##### Option 1: Script
```sh
.\examples\sycl\win-build-sycl.bat
```
#### 2. CMake
##### Option 2: CMake
On the oneAPI command line window, step into the llama.cpp main directory and run the following:
@@ -486,7 +500,7 @@ cmake --preset x64-windows-sycl-debug
cmake --build build-x64-windows-sycl-debug -j --target llama-completion
```
#### 3. Visual Studio
##### Option 3: Visual Studio
You have two options to use Visual Studio to build llama.cpp:
- As CMake Project using CMake presets.
@@ -496,7 +510,7 @@ You have two options to use Visual Studio to build llama.cpp:
All following commands are executed in PowerShell.
##### - Open as a CMake Project
###### - Open as a CMake Project
You can use Visual Studio to open the `llama.cpp` folder directly as a CMake project. Before compiling, select one of the SYCL CMake presets:
@@ -511,7 +525,7 @@ You can use Visual Studio to open the `llama.cpp` folder directly as a CMake pro
cmake --build build --config Release -j --target llama-completion
```
##### - Generating a Visual Studio Solution
###### - Generating a Visual Studio Solution
You can use Visual Studio solution to build and work on llama.cpp on Windows. You need to convert the CMake Project into a `.sln` file.
@@ -599,7 +613,7 @@ found 2 SYCL devices:
```
#### Choose level-zero devices
##### Choose level-zero devices
|Chosen Device ID|Setting|
|-|-|
@@ -607,7 +621,7 @@ found 2 SYCL devices:
|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"` or `set ONEAPI_DEVICE_SELECTOR="level_zero:*"`|
#### Execute
##### Execute
Choose one of following methods to run.
@@ -665,7 +679,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
## Environment Variable
#### Build
### Build
| Name | Value | Function |
|--------------------|---------------------------------------|---------------------------------------------|
@@ -680,23 +694,50 @@ use 1 SYCL GPUs: [0] with Max compute units:512
1. FP32 and FP16 have different performance impacts on LLMs. It is recommended to test both for better prompt processing performance with your models. You need to rebuild the code after changing `GGML_SYCL_F16=OFF/ON`.
#### Runtime
### Runtime
| Name | Value | Function |
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.|
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimized features for Intel GPUs. (Recommended to set to 1 for Intel devices older than Gen 10) |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|
## Design Rule
- Open to all contributors.
- All code change should be useful to user:
- Fix bug.
- Add new function.
- Improve the performance/usage.
- Make code be easy to maintain.
- ...
- Don't accept the codes of following cases:
- Break legacy function.
- Reduce the performance of existing use cases by default.
- Not completed work/the functionality cannot be demonstrated.
- Encourage using environment variables to enable/disable features.
- User can evaluate the feature without rebuild the code.
- Recommend the best features to users by enabling them by default.
- Design the code based on the published official releases of oneAPI packages: compiler, library, driver, OS kernel.
- Developers need to maintain the code they submit.
## Known Issues
- `Split-mode:[row]` is not supported.
- AOT (Ahead-of-Time) compilation is not enabled in the build.
- Good: build quickly, smaller size of binary file.
- Bad: The first startup is slow (JIT compilation), but subsequent performance is unaffected.
## Q&A
- Error: `error while loading shared libraries: libsycl.so: cannot open shared object file: No such file or directory`.
@@ -746,7 +787,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
```
### **GitHub contribution**:
Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.
Please add the `[SYCL]` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.
## TODO

View File

@@ -55,7 +55,8 @@ LLAMA_MAC_BUILD=$PWD/build/ggml-virtgpu-backend
cmake -S . -B $LLAMA_MAC_BUILD \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=ON \
-DGGML_REMOTINGBACKEND=ONLY \
-DGGML_VIRTGPU=ON \
-DGGML_VIRTGPU_BACKEND=ONLY \
-DGGML_METAL=ON
TARGETS="ggml-metal"
@@ -71,6 +72,7 @@ cmake --build $LLAMA_MAC_BUILD --parallel 8 --target $EXTRA_TARGETS
```bash
# Build virglrenderer with APIR support
mkdir virglrenderer
cd virglrenderer
git clone https://gitlab.freedesktop.org/kpouget/virglrenderer -b main-macos src
cd src
@@ -95,7 +97,7 @@ mkdir llama.cpp
git clone https://github.com/ggml-org/llama.cpp.git src
cd src
LLAMA_LINUX_BUILD=$PWD//build-virtgpu
LLAMA_LINUX_BUILD=$PWD/build-virtgpu
cmake -S . -B $LLAMA_LINUX_BUILD \
-DGGML_VIRTGPU=ON

View File

@@ -13,6 +13,21 @@ cd llama.cpp
The following sections describe how to build with different backends and options.
* [CPU Build](#cpu-build)
* [BLAS Build](#blas-build)
* [Metal Build](#metal-build)
* [SYCL](#sycl)
* [CUDA](#cuda)
* [MUSA](#musa)
* [HIP](#hip)
* [Vulkan](#vulkan)
* [CANN](#cann)
* [Arm® KleidiAI™](#arm-kleidiai)
* [OpenCL](#opencl)
* [Android](#android-1)
* [OpenVINO](#openvino)
* [Notes about GPU-accelerated backends](#notes-about-gpu-accelerated-backends)
## CPU Build
Build llama.cpp using `CMake`:
@@ -254,6 +269,14 @@ The environment variable [`CUDA_SCALE_LAUNCH_QUEUES`](https://docs.nvidia.com/cu
Consider setting `CUDA_SCALE_LAUNCH_QUEUES=4x`, which increases the CUDA command buffer to 4 times its default size. This optimization is particularly beneficial for **Multi-GPU setups with pipeline parallelism**, where it significantly improves prompt processing throughput by allowing more operations to be enqueued across GPUs.
#### GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F
Use `GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F` environment variable to use FP32 compute type on all GPUs in FP16 cuBLAS for preventing possible numerical overflows in exchange for slower prompt processing (small impact on RTX PRO/Datacenter products and significant on GeForce products).
#### GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F
Use `GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F` environment variable to force use FP16 compute type (instead of default FP32) in FP16 cuBLAS for V100, CDNA and RDNA4.
### Unified Memory
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
@@ -265,7 +288,7 @@ The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |
|-------------------------------|------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models. There may be issues with numerical overflows (except for CDNA and RDNA4) and memory use will be higher. Prompt processing may become faster on recent datacenter GPUs (the custom kernels were tuned primarily for RTX 3000/4000). |
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models. There may be issues with numerical overflows (except for V100, CDNA and RDNA4 which use FP32 compute type by default) and memory use will be higher. Prompt processing may become faster on recent datacenter GPUs (the custom kernels were tuned primarily for RTX 3000/4000). |
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
@@ -599,7 +622,13 @@ If KleidiAI is enabled, the output will contain a line similar to:
```
load_tensors: CPU_KLEIDIAI model buffer size = 3474.00 MiB
```
KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.
KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm, SVE, and SME. llama.cpp selects the most efficient kernels at runtime based on detected CPU capabilities.
On CPUs that support SME, SME microkernels are enabled automatically using runtime detection.
The environment variable GGML_KLEIDIAI_SME can be used to control SME behavior:
- Not set: enable SME automatically if supported and detected.
- 0: disable SME.
- <n> > 0: enable SME and assume <n> available SME units (override auto detection).
If SME is not supported by the CPU, SME microkernels are always disabled.
Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. -DGGML_METAL=OFF, or during run-time using the command line option `--device none`.
@@ -718,6 +747,14 @@ Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/m
To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)
## OpenVINO
[OpenVINO](https://docs.openvino.ai/) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware (CPUs, GPUs, and NPUs).
For build instructions and usage examples, refer to [OPENVINO.md](backend/OPENVINO.md).
---
## Notes about GPU-accelerated backends
The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.

View File

@@ -15,7 +15,7 @@ Legend:
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | WebGPU | ZenDNN | zDNN |
|-----------|------|------|------|------|------|------|------|------|------|------|------|
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | | ✅ | ❌ | ❌ | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -23,7 +23,7 @@ Legend:
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | | 🟡 | ✅ | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
@@ -31,22 +31,23 @@ Legend:
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | | 🟡 | ✅ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | ❌ | ❌ | ❌ |
| DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | ❌ | ❌ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | | ✅ | ❌ | ❌ |
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | | 🟡 | 🟡 | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@@ -54,7 +55,7 @@ Legend:
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | | 🟡 | ❌ | ❌ |
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@@ -63,7 +64,7 @@ Legend:
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | | ✅ | ✅ | ❌ | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
@@ -75,34 +76,34 @@ Legend:
| OUT_PROD | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
| PAD | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| POOL_1D | ❌ | ❌ | | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| POOL_1D | ❌ | ❌ | | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | | ❌ | ❌ |
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | | ❌ | ❌ |
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | | ✅ | ❌ | ❌ | ❌ |
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | | ✅ | ❌ | ❌ | ❌ |
| ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SET | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | 🟡 | | ❌ | ❌ | ❌ |
| SET | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | 🟡 | | ❌ | ❌ | ❌ |
| SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | | ✅ | ❌ | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | | 🟡 | ✅ | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | | 🟡 | ✅ | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | | 🟡 | ✅ | ❌ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | | ❌ | ❌ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@@ -116,5 +117,5 @@ Legend:
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | | ✅ | ❌ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | | ❌ | ❌ | ❌ |
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | | ✅ | ❌ | ❌ |

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -5023,20 +5023,20 @@
"WebGPU: WebGPU","ARGMAX","type=f32,ne=[1024,12,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","ARGMAX","type=f32,ne=[2000,10,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","ARGMAX","type=f32,ne=[5438,3,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,1]","support","0","no","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[2,1,1,1]","support","0","no","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,2,1,1]","support","0","no","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,2,1]","support","0","no","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,2]","support","0","no","WebGPU"
"WebGPU: WebGPU","REPEAT","type=i32,ne=[10,5,4,1],nr=[2,1,1,1]","support","0","no","WebGPU"
"WebGPU: WebGPU","REPEAT","type=i16,ne=[10,5,4,1],nr=[1,1,1,2]","support","0","no","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,1]","support","0","no","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[2,1,1,1]","support","0","no","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,2,1,1]","support","0","no","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,2,1]","support","0","no","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,2]","support","0","no","WebGPU"
"WebGPU: WebGPU","REPEAT","type=i32,ne=[10,5,4,3],nr=[2,1,1,1]","support","0","no","WebGPU"
"WebGPU: WebGPU","REPEAT","type=i16,ne=[10,5,4,3],nr=[1,1,1,2]","support","0","no","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,2,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,2,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","REPEAT","type=i32,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","REPEAT","type=i16,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,2,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,2,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","REPEAT","type=i32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","REPEAT","type=i16,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","WebGPU"
"WebGPU: WebGPU","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,1,1,1],v=0","support","0","no","WebGPU"
"WebGPU: WebGPU","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[2,1,1,1],v=0","support","0","no","WebGPU"
"WebGPU: WebGPU","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,2,1,1],v=0","support","0","no","WebGPU"
Can't render this file because it is too large.

View File

@@ -633,7 +633,7 @@ class SchemaConverter:
return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
items = schema.get('items') or schema['prefixItems']
items = schema.get('items', schema.get('prefixItems'))
if isinstance(items, list):
return self._add_rule(
rule_name,

View File

@@ -248,12 +248,14 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture")
option(GGML_OPENVINO "ggml: use OPENVINO" OFF)
option(GGML_OPENCL "ggml: use OpenCL" OFF)
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
"gmml: OpenCL API version to target")
"ggml: OpenCL API version to target")
option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml: quantize group size (32, 64, or 128)")
@@ -327,6 +329,7 @@ set(GGML_PUBLIC_HEADERS
include/ggml-vulkan.h
include/ggml-webgpu.h
include/ggml-zendnn.h
include/ggml-openvino.h
include/gguf.h)
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")

View File

@@ -0,0 +1,37 @@
#pragma once
#include "ggml-backend.h"
#include <cstring>
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_OPENVINO_NAME "OPENVINO"
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device);
GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend);
GGML_BACKEND_API bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer);
GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft);
GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft);
GGML_BACKEND_API size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer);
// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device);
GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void);
#ifdef __cplusplus
}
#endif

View File

@@ -8,7 +8,12 @@ extern "C" {
#define RPC_PROTO_MAJOR_VERSION 3
#define RPC_PROTO_MINOR_VERSION 6
#define RPC_PROTO_PATCH_VERSION 0
#define RPC_PROTO_PATCH_VERSION 1
#ifdef __cplusplus
static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
#endif
#define GGML_RPC_MAX_SERVERS 16
// backend API

View File

@@ -427,7 +427,8 @@ extern "C" {
// GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
GGML_TYPE_COUNT = 40,
GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale)
GGML_TYPE_COUNT = 41,
};
// precision
@@ -463,6 +464,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_NVFP4 = 26, // except 1d tensors
};
// available tensor operations:
@@ -556,6 +558,7 @@ extern "C" {
GGML_OP_GATED_LINEAR_ATTN,
GGML_OP_RWKV_WKV7,
GGML_OP_SOLVE_TRI,
GGML_OP_GATED_DELTA_NET,
GGML_OP_UNARY,
@@ -2463,6 +2466,17 @@ extern "C" {
bool lower,
bool uni);
// TODO: add ggml_gated_delta_net_set_bcast() to be able to configure Q, K broadcast type: tiled vs interleaved [TAG_GGML_GDN_BCAST]
// ref: https://github.com/ggml-org/llama.cpp/pull/19468#discussion_r2786394306
GGML_API struct ggml_tensor * ggml_gated_delta_net(
struct ggml_context * ctx,
struct ggml_tensor * q,
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * g,
struct ggml_tensor * beta,
struct ggml_tensor * state);
// custom operators
typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);

View File

@@ -460,6 +460,7 @@ ggml_add_backend(zDNN)
ggml_add_backend(OpenCL)
ggml_add_backend(Hexagon)
ggml_add_backend(ZenDNN)
ggml_add_backend(OPENVINO)
foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)

View File

@@ -82,6 +82,10 @@
#include "ggml-zendnn.h"
#endif
#ifdef GGML_USE_OPENVINO
#include "ggml-openvino.h"
#endif
namespace fs = std::filesystem;
static std::string path_str(const fs::path & path) {
@@ -154,6 +158,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
#ifdef GGML_USE_OPENVINO
register_backend(ggml_backend_openvino_reg());
#endif
#ifdef GGML_USE_CPU
register_backend(ggml_backend_cpu_reg());
#endif
@@ -557,6 +564,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("opencl", silent, dir_path);
ggml_backend_load_best("hexagon", silent, dir_path);
ggml_backend_load_best("musa", silent, dir_path);
ggml_backend_load_best("openvino", silent, dir_path);
ggml_backend_load_best("cpu", silent, dir_path);
// check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
const char * backend_path = std::getenv("GGML_BACKEND_PATH");

View File

@@ -1455,10 +1455,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
int split_backend_id = split->backend_id;
ggml_backend_t split_backend = sched->backends[split_backend_id];
if (sched->events[split_backend_id][sched->cur_copy] == NULL) {
ggml_backend_synchronize(split_backend);
}
// copy the input tensors to the split backend
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
@@ -1469,12 +1465,16 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
ggml_backend_tensor_copy(input, input_cpy);
} else {
// wait for the split backend to finish using the input before overwriting it
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
// when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
@@ -1578,10 +1578,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
}
if (sched->events[split_backend_id][sched->cur_copy] == NULL) {
ggml_backend_synchronize(split_backend);
}
if (!sched->callback_eval) {
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
if (ec != GGML_STATUS_SUCCESS) {

View File

@@ -102,6 +102,9 @@ typedef sycl::half2 ggml_half2;
#define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
#define QR_MXFP4 2
#define QI_NVFP4 (QK_NVFP4 / (4 * QR_NVFP4))
#define QR_NVFP4 2
#define QI5_0 (QK5_0 / (4 * QR5_0))
#define QR5_0 2
@@ -194,6 +197,14 @@ typedef struct {
} block_mxfp4;
static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");
#define QK_NVFP4 64
#define QK_NVFP4_SUB 16 // sub-block size for per-group scales
typedef struct {
uint8_t d[QK_NVFP4/QK_NVFP4_SUB]; // UE4M3 scales (4 bytes, one per 16-element sub-block)
uint8_t qs[QK_NVFP4/2]; // packed 4-bit E2M1 values (32 bytes)
} block_nvfp4;
static_assert(sizeof(block_nvfp4) == sizeof(uint8_t)*(QK_NVFP4/QK_NVFP4_SUB) + QK_NVFP4/2, "wrong nvfp4 block size/padding");
#define QK5_0 32
typedef struct {
ggml_half d; // delta

View File

@@ -15,6 +15,7 @@
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
@@ -79,6 +80,8 @@
#define ggml_gemm_mxfp4_8x8_q8_0_generic ggml_gemm_mxfp4_8x8_q8_0
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// quants.c
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
@@ -108,6 +111,7 @@
// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
@@ -155,6 +159,7 @@
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -194,16 +199,11 @@
#define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
#elif defined(__riscv)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x1_generic ggml_quantize_mat_q8_K_4x1
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
@@ -239,6 +239,7 @@
#elif defined(__s390x__)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
@@ -301,6 +302,7 @@
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8

View File

@@ -650,6 +650,90 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
*s = sumf;
}
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    // Dot product of one NVFP4-quantized row (vx) with one q8_0-quantized
    // row (vy); the scalar result is written to *s.
    //
    // NVFP4 layout (see block_nvfp4): 64 4-bit E2M1 values per super-block,
    // with one UE4M3 scale per 16-element sub-block (4 scales per block).
    // E2M1 code points are dequantized via the shared kvalues_mxfp4 table.
    //
    // n must be a multiple of QK_NVFP4 (64); only nrc == 1 is supported and
    // the bs/bx/by strides are unused.
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_NVFP4 == 0);

    const block_nvfp4 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    // Each NVFP4 super-block (64 elements) spans 2 q8_0 blocks
    const int nb = n / QK_NVFP4;

    float sumf = 0;

#if defined __ARM_NEON
    const int8x16_t values = vld1q_s8(kvalues_mxfp4);
    const uint8x16_t m4b = vdupq_n_u8(0x0f);

    float32x4_t acc = vdupq_n_f32(0.0f);

    for (int ib = 0; ib < nb; ++ib) {
        // 32 packed bytes -> 64 nibbles; byte si*8+j carries element j of
        // sub-block si in the lo nibble and element j+8 in the hi nibble
        // (matches the scalar path below)
        const uint8x16_t q4bits_0 = vld1q_u8(x[ib].qs);
        const uint8x16_t q4bits_1 = vld1q_u8(x[ib].qs + 16);

        // table-lookup dequantization of lo/hi nibbles to int8 E2M1 values
        const int8x16_t q4_lo_0 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_0, m4b));
        const int8x16_t q4_hi_0 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_0, 4));
        const int8x16_t q4_lo_1 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_1, m4b));
        const int8x16_t q4_hi_1 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_1, 4));

        // regroup the q8_0 halves so they line up with the lo/hi-nibble
        // element order produced above
        const int8x16_t q8_0a = vld1q_s8(y[2*ib].qs);
        const int8x16_t q8_0b = vld1q_s8(y[2*ib].qs + 16);
        const int8x16_t q8_lo_0 = vcombine_s8(vget_low_s8(q8_0a), vget_low_s8(q8_0b));
        const int8x16_t q8_hi_0 = vcombine_s8(vget_high_s8(q8_0a), vget_high_s8(q8_0b));
        const int8x16_t q8_1a = vld1q_s8(y[2*ib+1].qs);
        const int8x16_t q8_1b = vld1q_s8(y[2*ib+1].qs + 16);
        const int8x16_t q8_lo_1 = vcombine_s8(vget_low_s8(q8_1a), vget_low_s8(q8_1b));
        const int8x16_t q8_hi_1 = vcombine_s8(vget_high_s8(q8_1a), vget_high_s8(q8_1b));

        // per-lane i32 partial dots; in p0/p1 the lane pairs (0,1) and (2,3)
        // belong to consecutive 16-element sub-blocks
        const int32x4_t p0 = vaddq_s32(
            ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_0, q8_lo_0),
            ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_0, q8_hi_0));
        const int32x4_t p1 = vaddq_s32(
            ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_1, q8_lo_1),
            ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_1, q8_hi_1));

        // pairwise add -> one i32 sum per sub-block: {sb0, sb1, sb2, sb3}
        const int32x4_t sums = vpaddq_s32(p0, p1);

        // Decode 4 UE4M3 scales to f32 and multiply with q8 scales
        const float dy0 = GGML_CPU_FP16_TO_FP32(y[2*ib].d);
        const float dy1 = GGML_CPU_FP16_TO_FP32(y[2*ib+1].d);
        const float32x4_t nvsc = {
            ggml_ue4m3_to_fp32(x[ib].d[0]),
            ggml_ue4m3_to_fp32(x[ib].d[1]),
            ggml_ue4m3_to_fp32(x[ib].d[2]),
            ggml_ue4m3_to_fp32(x[ib].d[3])
        };
        // sub-blocks 0,1 live in q8 block 2*ib; sub-blocks 2,3 in 2*ib+1
        const float32x4_t scales = vmulq_f32(nvsc, (float32x4_t){dy0, dy0, dy1, dy1});

        acc = vfmaq_f32(acc, vcvtq_f32_s32(sums), scales);
    }

    sumf = vaddvq_f32(acc);
#else
    // Scalar reference path: walk the 4 sub-blocks of each super-block
    for (int ib = 0; ib < nb; ++ib) {
        for (int si = 0; si < 4; ++si) {
            const float d = ggml_ue4m3_to_fp32(x[ib].d[si]);
            const int q8b = si / 2;                  // which of the 2 q8_0 blocks
            const int q8o = (si % 2) * QK_NVFP4_SUB; // element offset within it
            const float dy = GGML_CPU_FP16_TO_FP32(y[2*ib + q8b].d);
            int sumi_lo = 0, sumi_hi = 0;
            for (int j = 0; j < QK_NVFP4_SUB/2; ++j) {
                // each byte packs element j (lo nibble) and j+8 (hi nibble)
                const uint8_t qv = x[ib].qs[si*(QK_NVFP4_SUB/2) + j];
                sumi_lo += y[2*ib + q8b].qs[q8o + j + 0] * kvalues_mxfp4[qv & 0xf];
                sumi_hi += y[2*ib + q8b].qs[q8o + j + QK_NVFP4_SUB/2] * kvalues_mxfp4[qv >> 4];
            }
            sumf += dy * d * (sumi_lo + sumi_hi);
        }
    }
#endif
    *s = sumf;
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -270,6 +270,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
},
[GGML_TYPE_NVFP4] = {
.from_float = quantize_row_nvfp4,
.vec_dot = ggml_vec_dot_nvfp4_q8_0,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
},
[GGML_TYPE_Q2_K] = {
.from_float = quantize_row_q2_K,
.vec_dot = ggml_vec_dot_q2_K_q8_K,
@@ -2021,6 +2027,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_solve_tri(params, tensor);
} break;
case GGML_OP_GATED_DELTA_NET:
{
ggml_compute_forward_gated_delta_net(params, tensor);
} break;
case GGML_OP_MAP_CUSTOM1:
{
ggml_compute_forward_map_custom1(params, tensor);
@@ -2200,6 +2210,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_COUNT_EQUAL:
case GGML_OP_SOLVE_TRI:
case GGML_OP_GATED_DELTA_NET:
{
n_tasks = n_threads;
} break;
@@ -2905,6 +2916,11 @@ struct ggml_cplan ggml_graph_plan(
{
cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
} break;
case GGML_OP_GATED_DELTA_NET:
{
const int64_t S_v = node->src[2]->ne[0];
cur = S_v * sizeof(float) * n_tasks;
} break;
case GGML_OP_COUNT:
{
GGML_ABORT("fatal error");

View File

@@ -520,7 +520,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
},
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
/* .required_cpu = */ CPU_FEATURE_I8MM,
/* .lhs_type = */ GGML_TYPE_F32,
/* .rhs_type = */ GGML_TYPE_Q4_0,
/* .op_type = */ GGML_TYPE_F32,
@@ -631,7 +631,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
/* .pack_func_ex = */ &rhs_pack_fn12<kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0>,
},
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
/* .required_cpu = */ CPU_FEATURE_I8MM,
/* .lhs_type = */ GGML_TYPE_F32,
/* .rhs_type = */ GGML_TYPE_Q4_0,
/* .op_type = */ GGML_TYPE_F32,
@@ -801,7 +801,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels_q8[] = {
/* .packed_stride_ex = */ &rhs_stride_fn4<kai_get_rhs_packed_stride_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
/* .pack_func_ex = */ &rhs_pack_scale_fn12<kai_run_rhs_pack_nxk_qsi8cxp_qsi8cx_neon>,
},
/* .required_cpu = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
/* .required_cpu = */ CPU_FEATURE_I8MM,
/* .lhs_type = */ GGML_TYPE_F32,
/* .rhs_type = */ GGML_TYPE_Q8_0,
/* .op_type = */ GGML_TYPE_F32,

File diff suppressed because it is too large Load Diff

View File

@@ -670,6 +670,7 @@ void ggml_compute_forward_add(
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
@@ -1119,6 +1120,7 @@ void ggml_compute_forward_add1(
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
@@ -1247,6 +1249,7 @@ void ggml_compute_forward_acc(
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
@@ -4334,6 +4337,7 @@ void ggml_compute_forward_out_prod(
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
@@ -4609,6 +4613,7 @@ void ggml_compute_forward_set(
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
@@ -4831,6 +4836,7 @@ void ggml_compute_forward_get_rows(
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
@@ -5555,6 +5561,7 @@ void ggml_compute_forward_clamp(
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
@@ -9617,7 +9624,7 @@ void ggml_compute_forward_win_unpart(
}
}
//gmml_compute_forward_unary
//ggml_compute_forward_unary
void ggml_compute_forward_unary(
const ggml_compute_params * params,
@@ -10380,6 +10387,195 @@ void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, s
}
}
// ggml_compute_forward_gated_delta_net
static void ggml_compute_forward_gated_delta_net_one_chunk(
    const ggml_compute_params * params,
    ggml_tensor * dst,
    int64_t ir0,
    int64_t ir1) {
    // Worker body: computes rows [ir0, ir1) of the gated delta-net
    // recurrence, where one row is one (head, sequence) pair.
    //
    // Per token t, with the per-head state S (S_v x S_v, stored transposed):
    //   S      = S * exp(g)            (decay; per-channel when kda)
    //   delta  = (v - S^T k) * beta    (delta rule)
    //   S     += k delta^T             (rank-1 update)
    //   out    = (S^T q) * scale
    ggml_tensor * src_q = dst->src[0];
    ggml_tensor * src_k = dst->src[1];
    ggml_tensor * src_v = dst->src[2];
    ggml_tensor * src_g = dst->src[3];      // gate: 1 or S_v channels per head
    ggml_tensor * src_beta = dst->src[4];   // scalar update strength per (head, token)
    ggml_tensor * src_state = dst->src[5];  // input recurrent state

    const int64_t S_v = src_v->ne[0];
    const int64_t H = src_v->ne[1];
    const int64_t n_tokens = src_v->ne[2];
    const int64_t n_seqs = src_v->ne[3];

    GGML_ASSERT(ggml_is_contiguous_rows(src_q));
    GGML_ASSERT(ggml_is_contiguous_rows(src_k));
    GGML_ASSERT(ggml_is_contiguous_rows(src_v));
    GGML_ASSERT(ggml_is_contiguous(src_g));
    GGML_ASSERT(ggml_is_contiguous(src_beta));
    GGML_ASSERT(ggml_is_contiguous(src_state));
    GGML_ASSERT(src_g->ne[0] == 1 || src_g->ne[0] == S_v);
    GGML_ASSERT(src_beta->ne[0] == 1);

    GGML_TENSOR_LOCALS(int64_t, neq, src_q, ne);
    GGML_TENSOR_LOCALS(size_t, nbq, src_q, nb);
    GGML_TENSOR_LOCALS(int64_t, nek, src_k, ne);
    GGML_TENSOR_LOCALS(size_t, nbk, src_k, nb);
    GGML_TENSOR_LOCALS(int64_t, nev, src_v, ne);
    GGML_TENSOR_LOCALS(size_t, nbv, src_v, nb);
    GGML_TENSOR_LOCALS(int64_t, neg, src_g, ne);
    GGML_TENSOR_LOCALS(size_t, nbg, src_g, nb);
    GGML_TENSOR_LOCALS(size_t, nbb, src_beta, nb);

    // key-dependent adaptive decay: gate has one channel per state dimension
    const bool kda = (neg0 == S_v);

    // scratch layout per thread: [delta(S_v)]
    const int64_t scratch_per_thread = S_v;
    const int ith = params->ith;
    // NOTE(review): this offsets each thread's scratch by a flat
    // CACHE_LINE_SIZE_F32 on top of ith * S_v, but the cplan sizing for
    // GGML_OP_GATED_DELTA_NET appears to reserve only S_v * n_tasks floats —
    // verify wdata is large enough for the last thread.
    float * delta = (float *)params->wdata + ith * scratch_per_thread + CACHE_LINE_SIZE_F32;

    // output layout: [attn_scores | new_states]
    // attn_scores: S_v * H * n_tokens * n_seqs floats
    // new_states: S_v * S_v * H * n_seqs floats
    const int64_t attn_score_elems = S_v * H * n_tokens * n_seqs;

    float * attn_out_base = (float *)dst->data;
    float * state_out_base = (float *)dst->data + attn_score_elems;
    const float * state_in_base = (const float *)src_state->data;

    //const int64_t rq1 = nev1 / neq1;
    //const int64_t rk1 = nev1 / nek1;
    // broadcast ratios in the sequence dimension (v may have more seqs than q/k)
    const int64_t rq3 = nev3 / neq3;
    const int64_t rk3 = nev3 / nek3;

    const float scale = 1.0f / sqrtf((float) S_v);

    for (int64_t ir = ir0; ir < ir1; ++ir) {
        const int64_t iv1 = ir % H; // head_index
        const int64_t iv3 = ir / H; // sequence

        // map the v-head/seq onto (possibly fewer) q and k heads/seqs
        // presumably GQA-style sharing — see TAG_GGML_GDN_BCAST
        const int64_t iq1 = iv1 % neq1;
        const int64_t ik1 = iv1 % nek1;
        const int64_t iq3 = iv3 / rq3;
        const int64_t ik3 = iv3 / rk3;

        float * s_out = state_out_base + (iv3 * H + iv1) * S_v * S_v;

        // copy input state into output buffer and operate in-place
        const float * s_in = state_in_base + (iv3 * H + iv1) * S_v * S_v;
        memcpy(s_out, s_in, S_v * S_v * sizeof(float));

        // attn output pointer for first token of this (head, seq)
        float * attn_data = attn_out_base + (iv3 * n_tokens * H + iv1) * S_v;

        for (int64_t t = 0; t < n_tokens; t++) {
            const float * q_d = (const float *)((const char *)src_q->data + iq3 * nbq3 + t * nbq2 + iq1 * nbq1);
            const float * k_d = (const float *)((const char *)src_k->data + ik3 * nbk3 + t * nbk2 + ik1 * nbk1);
            const float * v_d = (const float *)((const char *)src_v->data + iv3 * nbv3 + t * nbv2 + iv1 * nbv1);
            const float beta_val = *(const float *)((const char *)src_beta->data + iv3 * nbb3 + t * nbb2 + iv1 * nbb1);
            const float * g_d = (const float *)((const char *)src_g->data + iv3 * nbg3 + t * nbg2 + iv1 * nbg1);

            // state is stored transposed: s_out[j*S_v + i] = S[i][j]
            // so row j of s_out = column j of S (contiguous access)
            if (kda) {
                // precompute exp(g) into delta scratch (reused below)
                for (int64_t i = 0; i < S_v; ++i) {
                    delta[i] = expf(g_d[i]);
                }
                // S[i][:] *= exp(g[i]) => for each row j of M: M[j][i] *= exp(g[i])
                for (int64_t j = 0; j < S_v; ++j) {
                    ggml_vec_mul_f32(S_v, &s_out[j * S_v], &s_out[j * S_v], delta);
                }
            } else {
                // scalar gate: uniform decay of the whole state
                ggml_vec_scale_f32(S_v * S_v, s_out, expf(g_d[0]));
            }

            // delta[j] = sum_i S[i][j] * k[i] = dot(row j of M, k)
            for (int64_t j = 0; j < S_v; ++j) {
                float sum = 0.0f;
                ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, k_d, 0, 1);
                delta[j] = (v_d[j] - sum) * beta_val;
            }

            // outer product: S[i][j] += k[i] * delta[j] => M[j][i] += delta[j] * k[i]
            for (int64_t j = 0; j < S_v; ++j) {
                ggml_vec_mad_f32(S_v, &s_out[j * S_v], k_d, delta[j]);
            }

            // attn_out[j] = sum_i S[i][j] * q[i] = dot(row j of M, q)
            for (int64_t j = 0; j < S_v; ++j) {
                float sum = 0.0f;
                ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, q_d, 0, 1);
                attn_data[j] = sum * scale;
            }

            attn_data += S_v * H; // advance to next token
        }
    }
}
static void ggml_compute_forward_gated_delta_net_f32(
        const ggml_compute_params * params,
        ggml_tensor * dst) {
    ggml_tensor * V = dst->src[2];

    // total work items: one row per (head, sequence) pair
    const int64_t nr = V->ne[1] * V->ne[3];

    // dynamic work stealing interacts badly with NUMA placement
    const bool numa = ggml_is_numa();

    const int n_threads = params->nth;
    const int thread_id = params->ith;

    // aim for ~4 chunks per thread so faster threads can pick up extra work
    const int64_t n_chunks_target = (int64_t) n_threads * 4;
    const int64_t chunk_rows      = (nr + n_chunks_target - 1) / n_chunks_target;

    int64_t n_chunks = (nr + chunk_rows - 1) / chunk_rows;

    // fall back to exactly one chunk per thread when chunking buys nothing
    if (n_threads == 1 || n_chunks < n_threads || numa) {
        n_chunks = n_threads;
    }

    // thread 0 seeds the shared chunk counter; everyone syncs before consuming
    if (thread_id == 0) {
        ggml_threadpool_chunk_set(params->threadpool, n_threads);
    }
    ggml_barrier(params->threadpool);

    const int64_t rows_per_chunk = (nr + n_chunks - 1) / n_chunks;

    // each thread starts at its own index, then atomically claims further chunks
    for (int64_t chunk = thread_id; chunk < n_chunks;
         chunk = ggml_threadpool_chunk_add(params->threadpool, 1)) {
        const int64_t row_start = rows_per_chunk * chunk;
        const int64_t row_end   = MIN(row_start + rows_per_chunk, nr);
        ggml_compute_forward_gated_delta_net_one_chunk(params, dst, row_start, row_end);
    }
}
void ggml_compute_forward_gated_delta_net(
        const ggml_compute_params * params,
        ggml_tensor * dst) {
    // dispatch on the type of the first source tensor; only F32 is implemented
    const ggml_tensor * src0 = dst->src[0];

    if (src0->type == GGML_TYPE_F32) {
        ggml_compute_forward_gated_delta_net_f32(params, dst);
        return;
    }

    GGML_ABORT("fatal error");
}
// ggml_compute_forward_rwkv_wkv7
static void ggml_compute_forward_rwkv_wkv7_f32(

View File

@@ -102,6 +102,7 @@ void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, s
void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_gated_delta_net(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);

View File

@@ -50,6 +50,10 @@ void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, i
quantize_row_mxfp4_ref(x, y, k);
}
// Quantize k floats at x into NVFP4 blocks at y.
// Thin dispatch shim: delegates to the reference (scalar) implementation.
void quantize_row_nvfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_nvfp4_ref(x, y, k);
}
//
// 2-6 bit quantization in super-blocks
//
@@ -216,6 +220,42 @@ void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
*s = sumf;
}
// Generic (non-SIMD) dot product of an NVFP4 row with a q8_0 row.
//
// NVFP4 layout: each super-block covers QK_NVFP4 = 64 elements, split into
// 4 sub-blocks of QK_NVFP4_SUB = 16 elements, each with its own ue4m3 scale
// x[ib].d[s_idx]. One NVFP4 super-block therefore lines up with exactly two
// q8_0 blocks (2 * 32 values): sub-blocks 0-1 pair with q8 block 2*ib, and
// sub-blocks 2-3 with q8 block 2*ib+1.
//
// n must be a multiple of QK_NVFP4; the result is written to *s.
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_NVFP4 == 0);

    const block_nvfp4 * GGML_RESTRICT x = vx;
    const block_q8_0  * GGML_RESTRICT y = vy;

    const int nb = n / QK_NVFP4;

    float sumf = 0;
    for (int ib = 0; ib < nb; ++ib) {
        for (int s_idx = 0; s_idx < 4; ++s_idx) {
            // per-sub-block NVFP4 scale (ue4m3 encoded)
            const float d = ggml_ue4m3_to_fp32(x[ib].d[s_idx]);

            // which of the two q8_0 blocks this sub-block falls into,
            // and the element offset within it
            const int q8_block = s_idx / 2;
            const int q8_off   = (s_idx % 2) * QK_NVFP4_SUB;

            const float dy = GGML_CPU_FP16_TO_FP32(y[2*ib + q8_block].d);

            // each packed byte holds two fp4 codes: the low nibble maps to
            // element j, the high nibble to element j + QK_NVFP4_SUB/2;
            // both are decoded through the fp4 value table shared with mxfp4
            int sumi_lo = 0, sumi_hi = 0;
            for (int j = 0; j < QK_NVFP4_SUB/2; ++j) {
                const uint8_t qv = x[ib].qs[s_idx*(QK_NVFP4_SUB/2) + j];
                sumi_lo += y[2*ib + q8_block].qs[q8_off + j + 0]              * kvalues_mxfp4[qv & 0xf];
                sumi_hi += y[2*ib + q8_block].qs[q8_off + j + QK_NVFP4_SUB/2] * kvalues_mxfp4[qv >> 4];
            }

            // combine the integer partial sums with both sides' scales
            sumf += dy * d * (sumi_lo + sumi_hi);
        }
    }
    *s = sumf;
}
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_0;
const int nb = n / qk;

View File

@@ -20,6 +20,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_nvfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -42,6 +43,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -73,6 +75,7 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_nvfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

File diff suppressed because it is too large Load Diff

View File

@@ -28,13 +28,17 @@ template <int K, int N> struct block {
// control size
static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
static_assert(sizeof(block<4, 16>) == 16 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<4,16> size/padding");
static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
static_assert(sizeof(block<8, 16>) == 16 * sizeof(ggml_half) + QK8_0 * 16, "wrong block<8,16> size/padding");
using block_q4_0x4 = block<4, 4>;
using block_q4_0x8 = block<4, 8>;
using block_q4_0x16 = block<4, 16>;
using block_q8_0x4 = block<8, 4>;
using block_q8_0x8 = block<8, 8>;
using block_q8_0x16 = block<8, 16>;
struct block_q4_Kx8 {
ggml_half d[8]; // super-block scale for quantized scales
@@ -44,7 +48,14 @@ struct block_q4_Kx8 {
};
static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
struct block_q4_Kx16 {
ggml_half d[16]; // super-block scale for quantized scales
ggml_half dmin[16]; // super-block scale for quantized mins
uint8_t scales[192]; // scales and mins, quantized with 6 bits
uint8_t qs[2048]; // 4-bit quants
};
static_assert(sizeof(block_q4_Kx16) == sizeof(ggml_half) * 32 + K_SCALE_SIZE * 16 + QK_K * 8, "wrong q4_K block size/padding");
struct block_q2_Kx8 {
ggml_half d[8]; // super-block scale for quantized scales
ggml_half dmin[8]; // super-block scale for quantized mins
@@ -53,6 +64,13 @@ struct block_q2_Kx8 {
};
static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");
struct block_q2_Kx16 {
ggml_half d[16]; // Super-block scale for quantized scales
ggml_half dmin[16]; // Super-block scale for quantized mins
uint8_t scales[256]; // Sub-block scales (16 cols * 16 sub-blocks)
uint8_t qs[1024]; // Data (16 cols * 64 bytes per block)
};
static_assert(sizeof(block_q2_Kx16) == sizeof(ggml_half) * 32 + QK_K + QK_K * 4, "wrong q2_K block size/padding");
struct block_q5_Kx8 {
ggml_half d[8]; // super-block scale for quantized scales
@@ -97,6 +115,12 @@ struct block_iq4_nlx8 {
static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
struct block_iq4_nlx16 {
ggml_half d[16]; // deltas for 16 iq4_nl blocks
uint8_t qs[QK4_NL * 8]; // nibbles / quants for 16 iq4_nl blocks
};
static_assert(sizeof(block_iq4_nlx16) == 16 * sizeof(ggml_half) + QK4_NL * 8, "wrong iq4_nlx16 block size/padding");
struct block_mxfp4x4 {
uint8_t e[4];
uint8_t qs[QK_MXFP4 * 2];
@@ -109,7 +133,6 @@ struct block_mxfp4x8 {
};
static_assert(sizeof(block_mxfp4x8) == 8 + QK_MXFP4 * 4, "wrong mxfp4x8 block size/padding");
#if defined(__cplusplus)
extern "C" {
#endif
@@ -132,6 +155,8 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_mxfp4_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -146,10 +171,22 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_mxfp4_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_mxfp4_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#if defined __riscv_zvfh
void ggml_quantize_mat_q8_0_4x1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_quantize_mat_q8_K_4x1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_gemv_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q8_0_16x1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#endif
// Native implementations
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
@@ -170,6 +207,8 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@@ -184,10 +223,22 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#if defined __riscv_zvfh
void ggml_quantize_mat_q8_0_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_quantize_mat_q8_K_4x1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_gemv_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q8_0_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q2_K_16x1_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_16x1_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#endif
#if defined(__cplusplus)
} // extern "C"

View File

@@ -479,13 +479,51 @@ do { \
// F16 AVX512
// F16 AVX
#if defined(__AVX512FP16__)
#define GGML_F16_STEP 128
#define GGML_F16_EPR 32
#define GGML_F16x32 __m512h
#define GGML_F16x32_ZERO _mm512_setzero_ph()
#define GGML_F16x32_SET1(x) _mm512_set1_ph(__extension__(_Float16)(x))
#define GGML_F16x32_LOAD(x) _mm512_loadu_ph(x)
#define GGML_F16x32_STORE(x, y) _mm512_storeu_ph(x, y)
#define GGML_F16x32_FMA(a, b, c) _mm512_fmadd_ph(b, c, a)
#define GGML_F16x32_ADD _mm512_add_ph
#define GGML_F16x32_MUL _mm512_mul_ph
#define GGML_F16x32_REDUCE(res, x) \
do { \
int offset = GGML_F16_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ph(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ph(x[i], x[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
x[i] = _mm512_add_ph(x[i], x[offset+i]); \
} \
res = (ggml_float) _mm512_reduce_add_ph(x[0]); \
} while (0)
#define GGML_F16_VEC GGML_F16x32
#define GGML_F16_VEC_ZERO GGML_F16x32_ZERO
#define GGML_F16_VEC_SET1 GGML_F16x32_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x32_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x32_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F16x32_FMA
#define GGML_F16_VEC_ADD GGML_F16x32_ADD
#define GGML_F16_VEC_MUL GGML_F16x32_MUL
#define GGML_F16_VEC_REDUCE GGML_F16x32_REDUCE
#else // Fallback FP16 <-> FP32
#define GGML_F16_STEP 64
#define GGML_F16_EPR 16
// Fallback when AVX512_FP16 is unavailable: emulate F16 math by converting to/from FP32
#define GGML_F32Cx16 __m512
#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
@@ -525,6 +563,8 @@ do { \
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
#endif // __AVX512FP16__
#elif defined(__AVX__)
#define GGML_SIMD

View File

@@ -56,7 +56,8 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset
const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
__shared__ float tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];
__shared__ float tile[2][CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];
int cur_tile_buf = 0;
#pragma unroll
for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) {
@@ -70,7 +71,7 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
if(x < ne01 && y + j < ne00){
const int row = threadIdx.y+j;
const int col = threadIdx.x * sizeof(float)/sizeof(T);
T *tile2 = reinterpret_cast<T*>(tile[row]);
T *tile2 = reinterpret_cast<T*>(tile[cur_tile_buf][row]);
tile2[col] = src[imat*n + (y+j)*ne01 + x];
}
}
@@ -81,10 +82,12 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
if (ty + j < ne01 && tx < ne00) {
const int col = (threadIdx.y+j)*sizeof(float)/sizeof(T);
const T *tile2 = reinterpret_cast<const T*>(tile[threadIdx.x]);
const T *tile2 = reinterpret_cast<const T*>(tile[cur_tile_buf][threadIdx.x]);
dst[imat*n + (ty+j)*ne00 + tx] = tile2[col];
}
}
cur_tile_buf = (cur_tile_buf + 1) % 2;
}
GGML_UNUSED_VARS(ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11,

View File

@@ -0,0 +1,263 @@
#include "gated_delta_net.cuh"
// Gated delta-net recurrence kernel.
//
// Grid mapping: blockIdx.x = head, blockIdx.y = sequence, and
// (blockIdx.z, threadIdx.y) select one state column; each warp owns exactly
// one column of the S_v x S_v state matrix and keeps its shard of that
// column in registers (s_shard) across all tokens.
//
// dst layout: [attn_scores | new_states]
//   attn_scores: S_v * H * n_tokens * n_seqs floats
//   new_states : S_v * S_v * H * n_seqs floats
//
// The state is stored transposed: state[col * S_v + i] == S[i][col], so the
// row `col` of the stored matrix is contiguous in memory.
//
// KDA selects per-channel gating (g holds S_v values per token) versus
// scalar gating (one g value per token/head).
template <int S_v, bool KDA>
__global__ void gated_delta_net_cuda(const float * q,
                                     const float * k,
                                     const float * v,
                                     const float * g,
                                     const float * beta,
                                     const float * curr_state,
                                     float *       dst,
                                     int64_t       H,
                                     int64_t       n_tokens,
                                     int64_t       n_seqs,
                                     int64_t       sq1,
                                     int64_t       sq2,
                                     int64_t       sq3,
                                     int64_t       sv1,
                                     int64_t       sv2,
                                     int64_t       sv3,
                                     int64_t       sb1,
                                     int64_t       sb2,
                                     int64_t       sb3,
                                     const uint3   neqk1_magic,
                                     const uint3   rq3_magic,
                                     float         scale) {
    const uint32_t h_idx    = blockIdx.x;
    const uint32_t sequence = blockIdx.y;

    // each warp owns one column, using warp-level primitives to reduce across rows
    const int lane = threadIdx.x;
    const int col  = blockIdx.z * blockDim.y + threadIdx.y;

    // map (head, sequence) onto the possibly-broadcast q/k indices
    // (magic-number fastdiv/fastmod avoids 64-bit division in the kernel)
    const uint32_t iq1 = fastmodulo(h_idx, neqk1_magic);
    const uint32_t iq3 = fastdiv(sequence, rq3_magic);

    const int64_t attn_score_elems = S_v * H * n_tokens * n_seqs;
    float * attn_data = dst;
    float * state     = dst + attn_score_elems;

    const int64_t state_offset = (sequence * H + h_idx) * S_v * S_v;
    state      += state_offset;
    curr_state += state_offset;
    attn_data  += (sequence * n_tokens * H + h_idx) * S_v;

    // clamp the logical warp width to S_v so small heads still reduce correctly
    constexpr int warp_size = ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v;
    static_assert(S_v % warp_size == 0, "S_v must be a multiple of warp_size");
    constexpr int rows_per_lane = (S_v + warp_size - 1) / warp_size;

    float s_shard[rows_per_lane];

    // state is stored transposed: M[col][i] = S[i][col], row col is contiguous
#pragma unroll
    for (int r = 0; r < rows_per_lane; r++) {
        const int i = r * warp_size + lane;
        s_shard[r] = curr_state[col * S_v + i];
    }

    for (int t = 0; t < n_tokens; t++) {
        // q and k share strides (asserted on the host side)
        const float * q_t = q + iq3 * sq3 + t * sq2 + iq1 * sq1;
        const float * k_t = k + iq3 * sq3 + t * sq2 + iq1 * sq1;
        const float * v_t = v + sequence * sv3 + t * sv2 + h_idx * sv1;

        // g and beta share the same stride layout; g is S_v-wide in KDA mode
        const int64_t gb_offset = sequence * sb3 + t * sb2 + h_idx * sb1;
        const float * beta_t = beta + gb_offset;
        const float * g_t    = g + gb_offset * (KDA ? S_v : 1);
        const float beta_val = *beta_t;

        if constexpr (!KDA) {
            // scalar gate: one decay factor for the whole state matrix
            const float g_val = expf(*g_t);

            // kv[col] = (S^T @ k)[col] = sum_i S[i][col] * k[i]
            float kv_shard = 0.0f;
#pragma unroll
            for (int r = 0; r < rows_per_lane; r++) {
                const int i = r * warp_size + lane;
                kv_shard += s_shard[r] * k_t[i];
            }
            float kv_col = warp_reduce_sum<warp_size>(kv_shard);

            // delta[col] = (v[col] - g * kv[col]) * beta
            float delta_col = (v_t[col] - g_val * kv_col) * beta_val;

            // fused: S[i][col] = g * S[i][col] + k[i] * delta[col]
            // attn[col] = (S^T @ q)[col] = sum_i S[i][col] * q[i]
            float attn_partial = 0.0f;
#pragma unroll
            for (int r = 0; r < rows_per_lane; r++) {
                const int i = r * warp_size + lane;
                s_shard[r] = g_val * s_shard[r] + k_t[i] * delta_col;
                attn_partial += s_shard[r] * q_t[i];
            }
            float attn_col = warp_reduce_sum<warp_size>(attn_partial);
            if (lane == 0) {
                attn_data[col] = attn_col * scale;
            }
        } else {
            // per-channel gate: each state row i decays by exp(g[i])

            // kv[col] = sum_i g[i] * S[i][col] * k[i]
            float kv_shard = 0.0f;
#pragma unroll
            for (int r = 0; r < rows_per_lane; r++) {
                const int i = r * warp_size + lane;
                kv_shard += expf(g_t[i]) * s_shard[r] * k_t[i];
            }
            float kv_col = warp_reduce_sum<warp_size>(kv_shard);

            // delta[col] = (v[col] - kv[col]) * beta
            float delta_col = (v_t[col] - kv_col) * beta_val;

            // fused: S[i][col] = g[i] * S[i][col] + k[i] * delta[col]
            // attn[col] = (S^T @ q)[col] = sum_i S[i][col] * q[i]
            float attn_partial = 0.0f;
#pragma unroll
            for (int r = 0; r < rows_per_lane; r++) {
                const int i = r * warp_size + lane;
                s_shard[r] = expf(g_t[i]) * s_shard[r] + k_t[i] * delta_col;
                attn_partial += s_shard[r] * q_t[i];
            }
            float attn_col = warp_reduce_sum<warp_size>(attn_partial);
            if (lane == 0) {
                attn_data[col] = attn_col * scale;
            }
        }
        attn_data += S_v * H; // advance to the next token's scores
    }

    // Write state back to global memory (transposed layout)
#pragma unroll
    for (int r = 0; r < rows_per_lane; r++) {
        const int i = r * warp_size + lane;
        state[col * S_v + i] = s_shard[r];
    }
}
// Host-side launcher: selects the kernel instantiation matching the state
// dimension S_v and the gating mode (KDA = per-channel gating) and launches
// it with one block per (head, sequence) pair and one warp per state column.
//
// Fix: removed an unused local (`int cc = ...devices[...].cc;`) that was never
// read and only produced an unused-variable warning.
template <bool KDA>
static void launch_gated_delta_net(
        const float * q_d, const float * k_d, const float * v_d,
        const float * g_d, const float * b_d, const float * s_d,
        float * dst_d,
        int64_t S_v, int64_t H, int64_t n_tokens, int64_t n_seqs,
        int64_t sq1, int64_t sq2, int64_t sq3,
        int64_t sv1, int64_t sv2, int64_t sv3,
        int64_t sb1, int64_t sb2, int64_t sb3,
        int64_t neqk1, int64_t rq3,
        float scale, cudaStream_t stream) {
    //TODO: Add chunked kernel for even faster pre-fill
    const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;

    // 4 warps per block: one warp per state column, blockIdx.z spans the columns
    const int num_warps = 4;
    dim3 grid_dims(H, n_seqs, (S_v + num_warps - 1) / num_warps);
    dim3 block_dims(warp_size <= S_v ? warp_size : S_v, num_warps, 1);

    // precompute magic numbers so the kernel avoids 64-bit div/mod
    const uint3 neqk1_magic = init_fastdiv_values(neqk1);
    const uint3 rq3_magic   = init_fastdiv_values(rq3);

    switch (S_v) {
        case 16: {
            gated_delta_net_cuda<16, KDA><<<grid_dims, block_dims, 0, stream>>>(
                q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
                n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
                sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
            break;
        }
        case 32: {
            gated_delta_net_cuda<32, KDA><<<grid_dims, block_dims, 0, stream>>>(
                q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
                n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
                sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
            break;
        }
        case 64: {
            gated_delta_net_cuda<64, KDA><<<grid_dims, block_dims, 0, stream>>>(
                q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
                n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
                sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
            break;
        }
        case 128: {
            gated_delta_net_cuda<128, KDA><<<grid_dims, block_dims, 0, stream>>>(
                q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
                n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
                sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
            break;
        }
        default:
            // unsupported state dimension — no kernel instantiation exists
            GGML_ABORT("fatal error");
            break;
    }
}
// CUDA entry point for the gated delta-net op.
// src: [0]=q, [1]=k, [2]=v, [3]=g (gate), [4]=beta, [5]=input state.
// dst holds [attn_scores | new_states] (see the kernel for the exact layout).
void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src_q     = dst->src[0];
    ggml_tensor * src_k     = dst->src[1];
    ggml_tensor * src_v     = dst->src[2];
    ggml_tensor * src_g     = dst->src[3];
    ggml_tensor * src_beta  = dst->src[4];
    ggml_tensor * src_state = dst->src[5];

    // declare ne{q,k,v}0..3 / nb{q,k,v,b}0..3 locals for shapes and byte strides
    GGML_TENSOR_LOCALS(int64_t, neq, src_q, ne);
    GGML_TENSOR_LOCALS(size_t , nbq, src_q, nb);
    GGML_TENSOR_LOCALS(int64_t, nek, src_k, ne);
    GGML_TENSOR_LOCALS(size_t , nbk, src_k, nb);
    GGML_TENSOR_LOCALS(int64_t, nev, src_v, ne);
    GGML_TENSOR_LOCALS(size_t, nbv, src_v, nb);
    GGML_TENSOR_LOCALS(size_t, nbb, src_beta, nb);

    const int64_t S_v      = nev0; // state/value dimension per head
    const int64_t H        = nev1; // number of heads
    const int64_t n_tokens = nev2;
    const int64_t n_seqs   = nev3;

    // KDA mode: the gate carries one value per state channel instead of a scalar
    const bool kda = (src_g->ne[0] == S_v);

    GGML_ASSERT(neq1 == nek1);
    const int64_t neqk1 = neq1;
    const int64_t rq3   = nev3 / neq3; // sequence broadcast ratio for q/k

    const float * q_d = (const float *) src_q->data;
    const float * k_d = (const float *) src_k->data;
    const float * v_d = (const float *) src_v->data;
    const float * g_d = (const float *) src_g->data;
    const float * b_d = (const float *) src_beta->data;
    const float * s_d = (const float *) src_state->data;
    float * dst_d     = (float *) dst->data;

    // layout preconditions the kernel relies on: contiguous rows for q/k/v,
    // identical q/k strides (the kernel reuses q strides for k), and fully
    // contiguous gate, beta, and state tensors
    GGML_ASSERT(ggml_is_contiguous_rows(src_q));
    GGML_ASSERT(ggml_is_contiguous_rows(src_k));
    GGML_ASSERT(ggml_is_contiguous_rows(src_v));
    GGML_ASSERT(ggml_are_same_stride(src_q, src_k));
    GGML_ASSERT(src_g->ne[0] == 1 || kda);
    GGML_ASSERT(ggml_is_contiguous(src_g));
    GGML_ASSERT(ggml_is_contiguous(src_beta));
    GGML_ASSERT(ggml_is_contiguous(src_state));

    // strides in floats (beta strides used for both g and beta offset computation)
    const int64_t sq1 = nbq1 / sizeof(float);
    const int64_t sq2 = nbq2 / sizeof(float);
    const int64_t sq3 = nbq3 / sizeof(float);
    const int64_t sv1 = nbv1 / sizeof(float);
    const int64_t sv2 = nbv2 / sizeof(float);
    const int64_t sv3 = nbv3 / sizeof(float);
    const int64_t sb1 = nbb1 / sizeof(float);
    const int64_t sb2 = nbb2 / sizeof(float);
    const int64_t sb3 = nbb3 / sizeof(float);

    // 1/sqrt(S_v) attention scaling applied to the scores inside the kernel
    const float scale = 1.0f / sqrtf((float) S_v);

    cudaStream_t stream = ctx.stream();
    if (kda) {
        launch_gated_delta_net<true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
            S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
            sb1, sb2, sb3, neqk1, rq3, scale, stream);
    } else {
        launch_gated_delta_net<false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
            S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
            sb1, sb2, sb3, neqk1, rq3, scale, stream);
    }
}

View File

@@ -0,0 +1,4 @@
#include "common.cuh"
#include "ggml.h"
void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -53,6 +53,7 @@
#include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/wkv.cuh"
#include "ggml-cuda/gla.cuh"
#include "ggml-cuda/gated_delta_net.cuh"
#include "ggml-cuda/set.cuh"
#include "ggml-cuda/set-rows.cuh"
#include "ggml-cuda/pad_reflect_1d.cuh"
@@ -204,7 +205,14 @@ static ggml_cuda_device_info ggml_cuda_init() {
GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;
GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
for (int id = 0; id < info.device_count; ++id) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
total_vram += prop.totalGlobalMem;
}
GGML_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices (Total VRAM: %zu MiB):\n",
__func__, info.device_count, (size_t)(total_vram / (1024 * 1024)));
total_vram = 0;
std::vector<std::pair<int, std::string>> turing_devices_without_mma;
for (int id = 0; id < info.device_count; ++id) {
@@ -242,6 +250,12 @@ static ggml_cuda_device_info ggml_cuda_init() {
#else
info.devices[id].supports_cooperative_launch = false;
#endif // !(GGML_USE_MUSA)
// cudaMemGetInfo returns info for the current device
size_t free_mem;
CUDA_CHECK(cudaSetDevice(id));
CUDA_CHECK(cudaMemGetInfo(&free_mem, NULL));
#if defined(GGML_USE_HIP)
info.devices[id].smpbo = prop.sharedMemPerBlock;
@@ -256,22 +270,25 @@ static ggml_cuda_device_info ggml_cuda_init() {
info.devices[id].cc += prop.minor * 0x10;
}
}
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, VRAM: %zu MiB (%zu MiB free)\n",
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
device_vmm ? "yes" : "no", prop.warpSize);
device_vmm ? "yes" : "no", prop.warpSize,
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
#elif defined(GGML_USE_MUSA)
// FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
info.devices[id].warp_size = 32;
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
info.devices[id].cc += prop.minor * 0x10;
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB (%zu MiB free)\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
#else
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
info.devices[id].cc = 100*prop.major + 10*prop.minor;
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, VRAM: %zu MiB (%zu MiB free)\n",
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
(size_t)(prop.totalGlobalMem / (1024 * 1024)), free_mem / (1024 * 1024));
std::string device_name(prop.name);
if (device_name == "NVIDIA GeForce MX450") {
turing_devices_without_mma.push_back({ id, device_name });
@@ -1225,6 +1242,34 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
}
}
// Pair of mutually exclusive flags selecting a forced cuBLAS compute type
// (FP32 or FP16); both default to false, meaning "no override".
struct cublas_force_compute_type {
bool fp32 = false;
bool fp16 = false;
};
// Read the GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F / _16F environment variables
// once (thread-safe function-local static, initialized by an immediately
// invoked lambda) and cache the result for all later callers.
// Setting both variables at once is rejected via GGML_ASSERT; if both are
// unset, the returned struct keeps its all-false defaults and no log line
// is emitted.
static const cublas_force_compute_type & ggml_cuda_cublas_get_force_compute_type() {
static const cublas_force_compute_type compute_type = [] {
cublas_force_compute_type result;
// Only presence of the variable matters, not its value.
const bool ggml_cuda_force_cublas_compute_32f_env = getenv("GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F") != nullptr;
const bool ggml_cuda_force_cublas_compute_16f_env = getenv("GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F") != nullptr;
// The two overrides are mutually exclusive.
GGML_ASSERT(ggml_cuda_force_cublas_compute_16f_env == false || ggml_cuda_force_cublas_compute_32f_env == false);
if (ggml_cuda_force_cublas_compute_32f_env) {
GGML_LOG_INFO("Detected GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F\n");
result.fp32 = true;
} else if (ggml_cuda_force_cublas_compute_16f_env) {
GGML_LOG_INFO("Detected GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F\n");
result.fp16 = true;
}
return result;
}();
return compute_type;
}
static void ggml_cuda_op_mul_mat_cublas(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
@@ -1307,7 +1352,13 @@ static void ggml_cuda_op_mul_mat_cublas(
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
const auto & force_compute_type = ggml_cuda_cublas_get_force_compute_type();
if (!force_compute_type.fp16 && (GGML_CUDA_CC_IS_CDNA(cc)
|| GGML_CUDA_CC_IS_RDNA4(cc)
|| cc == GGML_CUDA_CC_VOLTA
|| force_compute_type.fp32))
{
const float alpha = 1.0f;
const float beta = 0.0f;
CUBLAS_CHECK(
@@ -1906,10 +1957,23 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
cudaDataType_t cu_data_type_b = traits::data_type;
const void * alpha = traits::get_alpha();
const void * beta = traits::get_beta();
const float alpha_f32 = 1.0f;
const float beta_f32 = 0.0f;
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
const auto & force_compute_type = ggml_cuda_cublas_get_force_compute_type();
int id = ggml_cuda_get_device();
const int cc = ggml_cuda_info().devices[id].cc;
static constexpr bool is_src0_type_f16 = src0_type == GGML_TYPE_F16;
// bf16 and fp32 are already being computed in fp32 (ensure it using static_assert),
// so checking necessity of forced fp32 only for fp16 src0_type
static_assert(is_src0_type_f16 || traits::compute_type == CUBLAS_COMPUTE_32F);
const bool need_compute_32f = is_src0_type_f16 && !force_compute_type.fp16 && (GGML_CUDA_CC_IS_CDNA(cc)
|| GGML_CUDA_CC_IS_RDNA4(cc)
|| cc == GGML_CUDA_CC_VOLTA
|| force_compute_type.fp32);
if (dst->op_params[0] == GGML_PREC_DEFAULT && !need_compute_32f) {
if constexpr (src0_type == GGML_TYPE_F32) {
dst_t = (char *) dst_ddf; // Direct F32 output
} else {
@@ -1919,18 +1983,10 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
}
} else {
dst_t = (char *) dst_ddf;
cu_compute_type = CUBLAS_COMPUTE_32F;
cu_data_type = CUDA_R_32F;
alpha = &alpha_f32;
beta = &beta_f32;
}
int id = ggml_cuda_get_device();
const int cc = ggml_cuda_info().devices[id].cc;
if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
cu_compute_type = CUBLAS_COMPUTE_32F;
alpha = &alpha_f32;
beta = &beta_f32;
cu_compute_type = batched_mul_mat_traits<GGML_TYPE_F32>::compute_type;
cu_data_type = batched_mul_mat_traits<GGML_TYPE_F32>::data_type;
alpha = batched_mul_mat_traits<GGML_TYPE_F32>::get_alpha();
beta = batched_mul_mat_traits<GGML_TYPE_F32>::get_beta();
}
GGML_ASSERT(ne12 % ne02 == 0);
@@ -2733,6 +2789,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_GATED_LINEAR_ATTN:
ggml_cuda_op_gated_linear_attn(ctx, dst);
break;
case GGML_OP_GATED_DELTA_NET:
ggml_cuda_op_gated_delta_net(ctx, dst);
break;
case GGML_OP_RWKV_WKV7:
ggml_cuda_op_rwkv_wkv7(ctx, dst);
break;
@@ -2803,14 +2862,11 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
//enables async copies from CPU to CUDA, instead of only CUDA-to-CUDA
bool copy_from_host = ggml_backend_buffer_is_host(buf_src) && ggml_backend_dev_type(backend_src->device) == GGML_BACKEND_DEVICE_TYPE_CPU;
if (!(copy_from_host || ggml_backend_is_cuda(backend_src)) || !ggml_backend_is_cuda(backend_dst)) {
if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
return false;
}
if (!(copy_from_host || ggml_backend_buffer_is_cuda(buf_src)) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
return false;
}
@@ -2821,17 +2877,14 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
if ((copy_from_host && cuda_ctx_dst->device != buf_ctx_dst->device) ||
!copy_from_host && (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device)) {
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
#endif
return false;
}
if (copy_from_host) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyHostToDevice, cuda_ctx_dst->stream()));
} else if (backend_src != backend_dst) {
if (backend_src != backend_dst) {
// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
@@ -4974,6 +5027,13 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_GATED_LINEAR_ATTN:
case GGML_OP_RWKV_WKV7:
return true;
case GGML_OP_GATED_DELTA_NET:
//TODO: enable once MUSA compiler is solved https://github.com/ggml-org/llama.cpp/pull/19504#issuecomment-4018634327
#ifdef GGML_USE_MUSA
return false;
#else
return true;
#endif // GGML_USE_MUSA
case GGML_OP_FLASH_ATTN_EXT:
return ggml_cuda_flash_attn_ext_supported(dev_ctx->device, op);
case GGML_OP_CROSS_ENTROPY_LOSS:

View File

@@ -60,11 +60,17 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
enum mmvq_parameter_table_id {
MMVQ_PARAMETERS_GENERIC = 0,
MMVQ_PARAMETERS_GCN,
MMVQ_PARAMETERS_RDNA2
MMVQ_PARAMETERS_RDNA2,
MMVQ_PARAMETERS_RDNA3_0,
MMVQ_PARAMETERS_RDNA4
};
static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
#if defined(RDNA2) || defined(RDNA3) || defined(RDNA4)
#if defined(RDNA4)
return MMVQ_PARAMETERS_RDNA4;
#elif defined(RDNA3_0)
return MMVQ_PARAMETERS_RDNA3_0;
#elif defined(RDNA2) || defined(RDNA3_5)
return MMVQ_PARAMETERS_RDNA2;
#elif defined(GCN) || defined(CDNA)
return MMVQ_PARAMETERS_GCN;
@@ -74,7 +80,13 @@ static constexpr __device__ mmvq_parameter_table_id get_device_table_id() {
}
static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
if (GGML_CUDA_CC_IS_RDNA4(cc)) {
return MMVQ_PARAMETERS_RDNA4;
}
if (GGML_CUDA_CC_IS_RDNA3_0(cc)) {
return MMVQ_PARAMETERS_RDNA3_0;
}
if (GGML_CUDA_CC_IS_RDNA2(cc) || GGML_CUDA_CC_IS_RDNA3_5(cc)) {
return MMVQ_PARAMETERS_RDNA2;
}
if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
@@ -83,7 +95,7 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
return MMVQ_PARAMETERS_GENERIC;
}
static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parameter_table_id table_id) {
static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_dst, mmvq_parameter_table_id table_id) {
if (table_id == MMVQ_PARAMETERS_GENERIC) {
switch (ncols_dst) {
case 1:
@@ -114,6 +126,50 @@ static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_paramet
return 1;
}
}
if (table_id == MMVQ_PARAMETERS_RDNA4) {
// nwarps=8 benefits types with simple vec_dot on RDNA4 (ncols_dst=1).
// Types with complex vec_dot (Q3_K, IQ2_*, IQ3_*) regress due to register
// pressure and lookup table contention at higher thread counts.
if (ncols_dst == 1) {
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
return 8;
default:
return 1;
}
}
return 1;
}
if (table_id == MMVQ_PARAMETERS_RDNA3_0) {
// RDNA3 (W7900): stricter whitelist than RDNA4.
// Q2_K / Q5_K / IQ4_XS regress in full quant sweeps.
if (ncols_dst == 1) {
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ4_NL:
return 8;
default:
return 1;
}
}
return 1;
}
return 1;
}
@@ -138,7 +194,7 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
}
template <ggml_type type, int ncols_dst, bool has_fusion, bool is_multi_token_id = false>
__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
__launch_bounds__(calc_nwarps(type, ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
static __global__ void mul_mat_vec_q(
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
@@ -151,7 +207,7 @@ static __global__ void mul_mat_vec_q(
constexpr int qi = ggml_cuda_type_traits<type>::qi;
constexpr int vdr = get_vdr_mmvq(type);
constexpr mmvq_parameter_table_id table_id = get_device_table_id();
constexpr int nwarps = calc_nwarps(ncols_dst, table_id);
constexpr int nwarps = calc_nwarps(type, ncols_dst, table_id);
constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id);
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
@@ -355,12 +411,13 @@ static __global__ void mul_mat_vec_q(
}
}
template<ggml_type type>
static std::pair<dim3, dim3> calc_launch_params(
const int ncols_dst, const int nrows_x, const int nchannels_dst, const int nsamples_or_ntokens,
const int warp_size, const mmvq_parameter_table_id table_id) {
const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id);
const dim3 block_nums(nblocks, nchannels_dst, nsamples_or_ntokens);
const dim3 block_dims(warp_size, calc_nwarps(ncols_dst, table_id), 1);
const dim3 block_dims(warp_size, calc_nwarps(type, ncols_dst, table_id), 1);
return {block_nums, block_dims};
}
@@ -420,7 +477,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
if (has_ids && ncols_dst > 1) {
// Multi-token MUL_MAT_ID path only - single-token goes through regular path below
constexpr int c_ncols_dst = 1;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, ncols_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, ncols_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst, true>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
@@ -431,7 +488,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
switch (ncols_dst) {
case 1: {
constexpr int c_ncols_dst = 1;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
@@ -439,7 +496,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
} break;
case 2: {
constexpr int c_ncols_dst = 2;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
@@ -447,7 +504,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
} break;
case 3: {
constexpr int c_ncols_dst = 3;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
@@ -455,7 +512,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
} break;
case 4: {
constexpr int c_ncols_dst = 4;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
@@ -463,7 +520,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
} break;
case 5: {
constexpr int c_ncols_dst = 5;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
@@ -471,7 +528,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
} break;
case 6: {
constexpr int c_ncols_dst = 6;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
@@ -479,7 +536,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
} break;
case 7: {
constexpr int c_ncols_dst = 7;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
@@ -487,7 +544,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
} break;
case 8: {
constexpr int c_ncols_dst = 8;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,

View File

@@ -76,7 +76,7 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0,
int row = tid / load_cols;
int col = tid % load_cols;
#pragma unroll
for (int idx = tid; idx < total_elems; idx += split_d_inner) {
for (int idx = 0; idx < total_elems; idx += split_d_inner) {
if (row < (int)split_d_inner) {
smem[row * n_cols + col] = x_block[row * stride_x + col];
}
@@ -84,6 +84,9 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0,
col += split_d_inner;
row += col / load_cols;
col = col % load_cols;
if (idx >= total_elems - tid - split_d_inner) {
break;
}
}
__syncthreads();

View File

@@ -207,6 +207,14 @@
#define RDNA3
#endif // defined(__GFX11__)
#if defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3_5
#endif // defined(__gfx1150__) || defined(__gfx1151__)
#if defined(RDNA3) && !defined(RDNA3_5)
#define RDNA3_0
#endif // defined(RDNA3) && !defined(RDNA3_5)
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2

View File

@@ -402,6 +402,7 @@ static void pack_q4_0_quants(block_q4_0 * x, const uint8_t * qs, unsigned int bi
static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
static const int qk = QK_Q4_0x4x2;
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
const int nloe = k % qk; // leftovers
const int dblk_size = 8 * 2; // 8x __fp16
const int qblk_size = qk / 2; // int4
@@ -435,9 +436,11 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
unpack_q4_0_quants(qs, &x[i * 8 + 6], 6);
unpack_q4_0_quants(qs, &x[i * 8 + 7], 7);
bool partial = (nloe && i == nb-1);
uint8_t * q = y_q + (i * qblk_size);
for (int j = 0; j < qk / 2; j++) {
q[j] = (qs[j + 128] << 4) | qs[j];
q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
}
}
@@ -467,6 +470,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
static const int qk = QK_Q4_0x4x2;
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
const int nloe = k % qk; // leftovers
const int dblk_size = 8 * 2; // 8x __fp16
const int qblk_size = qk / 2; // int4
@@ -485,10 +489,17 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
for (int i = 0; i < nb; i++) {
uint8_t qs[QK_Q4_0x4x2]; // unpacked quants
bool partial = (nloe && i == nb-1);
const uint8_t * q = y_q + (i * qblk_size);
for (int j = 0; j < qk / 2; j++) {
qs[j] = q[j] & 0xf;
qs[j + 128] = q[j] >> 4;
if (partial) {
qs[j*2+0] = q[j] & 0xf;
qs[j*2+1] = q[j] >> 4;
} else {
qs[j+000] = q[j] & 0xf;
qs[j+128] = q[j] >> 4;
}
}
pack_q4_0_quants(&x[i * 8 + 0], qs, 0);
@@ -1078,6 +1089,7 @@ static void pack_mxfp4_quants(block_mxfp4 * x, const uint8_t * qs, unsigned int
static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) {
static const int qk = QK_MXFP4x4x2;
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
const int nloe = k % qk; // leftovers
const int eblk_size = 8 * 1; // 8x E8M0
const int qblk_size = qk / 2; // int4
@@ -1112,9 +1124,11 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
unpack_mxfp4_quants(qs, &x[i * 8 + 6], 6);
unpack_mxfp4_quants(qs, &x[i * 8 + 7], 7);
bool partial = (nloe && i == nb-1);
uint8_t * q = y_q + (i * qblk_size);
for (int j = 0; j < qk / 2; j++) {
q[j] = (qs[j + 128] << 4) | qs[j];
q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000];
}
}
@@ -1144,6 +1158,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k) {
static const int qk = QK_MXFP4x4x2;
const int nb = (k + qk - 1) / qk; // number of blocks (padded)
const int nloe = k % qk; // leftovers
const int eblk_size = 8 * 1; // 8x E8M0
const int qblk_size = qk / 2; // int4
@@ -1162,10 +1177,17 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
for (int i = 0; i < nb; i++) {
uint8_t qs[QK_MXFP4x4x2]; // unpacked quants
bool partial = (nloe && i == nb-1);
const uint8_t * q = y_q + (i * qblk_size);
for (int j = 0; j < qk / 2; j++) {
qs[j] = q[j] & 0xf;
qs[j + 128] = q[j] >> 4;
if (partial) {
qs[j*2+0] = q[j] & 0xf;
qs[j*2+1] = q[j] >> 4;
} else {
qs[j+000] = q[j] & 0xf;
qs[j+128] = q[j] >> 4;
}
}
pack_mxfp4_quants(&x[i * 8 + 0], qs, 0);
@@ -1801,12 +1823,12 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
return false;
}
if (src0->ne[1] > 16 * 1024) {
if (ggml_nrows(src0) > 16 * 1024) {
return false; // typically the lm-head which would be too large for VTCM
}
if ((src1->ne[2] != 1 || src1->ne[3] != 1)) {
return false;
if (ggml_nrows(src1) > 1024 || src1->ne[2] != 1 || src1->ne[3] != 1) {
return false; // no huge batches or broadcasting (for now)
}
// src0 (weights) must be repacked
@@ -1820,6 +1842,9 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n");
return false;
}
if (ggml_nrows(src1) > 1024) {
return false; // no huge batches (for now)
}
break;
default:

View File

@@ -77,7 +77,7 @@ static inline size_t q8x4x2_row_size(uint32_t ne) {
return hex_round_up(ne + nb * 8 * sizeof(__fp16), 128);
}
static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) {
static inline HVX_Vector_x8 hvx_vec_load_q4x4x8_full(const uint8_t * restrict ptr) {
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes)
@@ -88,9 +88,9 @@ static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) {
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
const HVX_Vector i8 = Q6_Vb_vsplat_R(8);
HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4
HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F
HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F : first 128 elements
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4 : second 128 elements
HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F ...
HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4
HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F
HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4
@@ -111,7 +111,41 @@ static inline HVX_Vector_x8 hvx_vec_load_q4x4x8(const uint8_t * restrict ptr) {
return r;
}
static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8(const uint8_t * restrict ptr) {
// Load and unpack up to 1024 packed 4-bit Q4_0 values (n elements) into 8 HVX
// vectors of signed int8 (nibble - 8). Full 256-element blocks are stored
// low-nibbles-first (elements 0..127 in the low nibbles, 128..255 in the high
// nibbles), while the final partial block is packed even:odd per byte and is
// therefore re-interleaved with a vshuff after unpacking.
// NOTE(review): tail lanes of the partial block beyond n are still produced
// (garbage minus 8); presumably the caller masks/ignores them — confirm at
// the call site.
static HVX_Vector_x8 hvx_vec_load_q4x4x8_partial(const uint8_t * restrict ptr, uint32_t n) {
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
const uint32_t qk = QK_Q4_0x4x2; // 256
const uint32_t nb = n / qk; // number of full 256-element blocks
const uint32_t nloe = n % qk; // leftover elements in the partial block
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
const HVX_Vector i8 = Q6_Vb_vsplat_R(8); // Q4_0 bias: stored nibble = value + 8
HVX_Vector_x8 r;
uint32_t i = 0;
#pragma unroll(2)
for (i=0; i < nb; i++) {
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements
r.v[i*2+0] = Q6_Vb_vsub_VbVb(v0, i8);
r.v[i*2+1] = Q6_Vb_vsub_VbVb(v1, i8);
}
if (nloe) {
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements
HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:...
r.v[i*2+0] = Q6_Vb_vsub_VbVb(Q6_V_lo_W(v0_1_p), i8);
r.v[i*2+1] = Q6_Vb_vsub_VbVb(Q6_V_hi_W(v0_1_p), i8);
}
return r;
}
static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8_full(const uint8_t * restrict ptr) {
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes)
@@ -144,7 +178,41 @@ static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8(const uint8_t * restrict ptr)
return r;
}
static inline HVX_Vector_x8 hvx_vec_load_q8x4x8(const uint8_t * restrict ptr) {
// Load and unpack up to 1024 packed 4-bit MXFP4 values (n elements) into 8 HVX
// vectors of int8, mapping each nibble through the kvalues_mxfp4_lut lookup
// table (vlut32) instead of the Q4_0 bias subtraction. Block layout mirrors
// hvx_vec_load_q4x4x8_partial: full 256-element blocks are low/high-nibble
// split, the trailing partial block is even:odd packed and re-zipped with a
// vshuff.
// NOTE(review): qk is taken from QK_Q4_0x4x2 rather than an MXFP4 constant —
// presumably both are 256 so this is benign, but confirm against QK_MXFP4x4x2.
static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8_partial(const uint8_t * restrict ptr, uint32_t n) {
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
const uint32_t qk = QK_Q4_0x4x2; // 256
const uint32_t nb = n / qk; // number of full 256-element blocks
const uint32_t nloe = n % qk; // leftover elements in the partial block
const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
const HVX_Vector lut = *(const HVX_Vector *) kvalues_mxfp4_lut; // nibble -> int8 decode table
HVX_Vector_x8 r;
uint32_t i = 0;
#pragma unroll(2)
for (i=0; i < nb; i++) {
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements
r.v[i*2+0] = Q6_Vb_vlut32_VbVbI(v0, lut, 0);
r.v[i*2+1] = Q6_Vb_vlut32_VbVbI(v1, lut, 0);
}
if (nloe) {
HVX_Vector v = vptr[i]; // 256 elements (128 bytes)
HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements
HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements
HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:...
r.v[i*2+0] = Q6_Vb_vlut32_VbVbI(Q6_V_lo_W(v0_1_p), lut, 0);
r.v[i*2+1] = Q6_Vb_vlut32_VbVbI(Q6_V_hi_W(v0_1_p), lut, 0);
}
return r;
}
static inline HVX_Vector_x8 hvx_vec_load_q8x4x8_full(const uint8_t * restrict ptr) {
const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr;
HVX_Vector v0 = vptr[0]; // first 128 vals
@@ -160,6 +228,10 @@ static inline HVX_Vector_x8 hvx_vec_load_q8x4x8(const uint8_t * restrict ptr) {
return r;
}
// Partial-block loader for q8x4x2 rows. Q8 elements are stored one per byte
// (no nibble packing), so the partial layout is identical to the full layout
// and this simply forwards to the full loader, keeping the _partial/_full API
// symmetric with the q4/mxfp4 variants. nloe is intentionally unused here.
static inline HVX_Vector_x8 hvx_vec_load_q8x4x8_partial(const uint8_t * restrict ptr, uint32_t nloe) {
return hvx_vec_load_q8x4x8_full(ptr);
}
// Reduce multiply 1024 x 1024 int8 elements (32x q4/8 blocks in 8x HVX vectors).
// Accumulate each block into a single int32 value.
// Return a single HVX vector with 32x int32 accumulators.
@@ -167,14 +239,14 @@ static inline HVX_Vector_x8 hvx_vec_load_q8x4x8(const uint8_t * restrict ptr) {
// if() checks are optimized out at compile time -- make sure to pass N as a constexpr.
static inline HVX_Vector hvx_vec_rmpy_x8_n(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) {
HVX_Vector r0 = Q6_V_vsplat_R(0);
HVX_Vector r1 = Q6_V_vsplat_R(0);
HVX_Vector r2 = Q6_V_vsplat_R(0);
HVX_Vector r3 = Q6_V_vsplat_R(0);
HVX_Vector r4 = Q6_V_vsplat_R(0);
HVX_Vector r5 = Q6_V_vsplat_R(0);
HVX_Vector r6 = Q6_V_vsplat_R(0);
HVX_Vector r7 = Q6_V_vsplat_R(0);
HVX_Vector r0 = Q6_V_vzero();
HVX_Vector r1 = Q6_V_vzero();
HVX_Vector r2 = Q6_V_vzero();
HVX_Vector r3 = Q6_V_vzero();
HVX_Vector r4 = Q6_V_vzero();
HVX_Vector r5 = Q6_V_vzero();
HVX_Vector r6 = Q6_V_vzero();
HVX_Vector r7 = Q6_V_vzero();
HVX_VectorPair p3;
HVX_VectorPair p2;
@@ -213,15 +285,42 @@ static inline HVX_Vector hvx_vec_rmpy_x8_n(HVX_Vector_x8 x, HVX_Vector_x8 y, uns
}
static inline HVX_Vector hvx_vec_rmpy_x8_full(HVX_Vector_x8 x, HVX_Vector_x8 y) {
return hvx_vec_rmpy_x8_n(x, y, 1024);
HVX_Vector r0 = Q6_Vw_vrmpy_VbVb(x.v[0], y.v[0]);
HVX_Vector r1 = Q6_Vw_vrmpy_VbVb(x.v[1], y.v[1]);
HVX_Vector r2 = Q6_Vw_vrmpy_VbVb(x.v[2], y.v[2]);
HVX_Vector r3 = Q6_Vw_vrmpy_VbVb(x.v[3], y.v[3]);
HVX_Vector r4 = Q6_Vw_vrmpy_VbVb(x.v[4], y.v[4]);
HVX_Vector r5 = Q6_Vw_vrmpy_VbVb(x.v[5], y.v[5]);
HVX_Vector r6 = Q6_Vw_vrmpy_VbVb(x.v[6], y.v[6]);
HVX_Vector r7 = Q6_Vw_vrmpy_VbVb(x.v[7], y.v[7]);
HVX_VectorPair p0 = Q6_W_vdeal_VVR(r1, r0, -4);
HVX_VectorPair p1 = Q6_W_vdeal_VVR(r3, r2, -4);
HVX_VectorPair p2 = Q6_W_vdeal_VVR(r5, r4, -4);
HVX_VectorPair p3 = Q6_W_vdeal_VVR(r7, r6, -4);
r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0));
r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1));
r2 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p2), Q6_V_hi_W(p2));
r3 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p3), Q6_V_hi_W(p3));
p0 = Q6_W_vdeal_VVR(r1, r0, -4);
p1 = Q6_W_vdeal_VVR(r3, r2, -4);
r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0));
r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1));
p0 = Q6_W_vdeal_VVR(r1, r0, -4);
r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0));
return r0;
}
// Handle most common cases of tensors not multiple of 1024.
static inline HVX_Vector hvx_vec_rmpy_x8_nloe(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) {
if (n <= 256) { return hvx_vec_rmpy_x8_n(x, y, 256); };
if (n <= 512) { return hvx_vec_rmpy_x8_n(x, y, 512); };
if (n <= 768) { return hvx_vec_rmpy_x8_n(x, y, 768); };
return hvx_vec_rmpy_x8_n(x, y, 1024);
static inline HVX_Vector hvx_vec_rmpy_x8_partial(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) {
if (n >= 512)
return hvx_vec_rmpy_x8_full(x, y);
return hvx_vec_rmpy_x8_partial(x, y, 512);
}
static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) {
@@ -246,7 +345,7 @@ static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
// Row sum (sf)
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
HVX_Vector r0_sum = Q6_V_vzero();
// Multiply and accumulate into int32.
// Compute combined scale (fp32).
@@ -257,12 +356,12 @@ static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
uint32_t i = 0;
for (; i < nb; i++) {
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
@@ -272,19 +371,19 @@ static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
}
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
// Process leftovers
if (nloe) {
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe));
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
// Zero out unused scales
// Zero out unused elements
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
@@ -326,8 +425,8 @@ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
// Row sum (sf)
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
HVX_Vector r1_sum = Q6_V_vsplat_R(0);
HVX_Vector r0_sum = Q6_V_vzero();
HVX_Vector r1_sum = Q6_V_vzero();
// Multiply and accumulate into int32.
// Compute combined scale (fp32).
@@ -338,14 +437,14 @@ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
uint32_t i = 0;
for (; i < nb; i++) {
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size);
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size);
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
@@ -359,23 +458,23 @@ static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
}
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
// Process leftovers
if (nloe) {
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size);
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe));
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe));
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
// Zero out unused scales
// Zero out unused elements
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
@@ -423,10 +522,10 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales
// Row sums (sf) - 4 accumulators for 2×2 tile
HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0);
HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0);
HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0);
HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0);
HVX_Vector r0_c0_sum = Q6_V_vzero();
HVX_Vector r0_c1_sum = Q6_V_vzero();
HVX_Vector r1_c0_sum = Q6_V_vzero();
HVX_Vector r1_c1_sum = Q6_V_vzero();
const uint32_t nb = n / qk; // num full blocks
const uint32_t nloe = n % qk; // num leftover elements
@@ -434,12 +533,12 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
uint32_t i = 0;
for (; i < nb; i++) {
// Load src1 columns (reused across both src0 rows)
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
// Load src0 rows (reused across both src1 columns)
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size);
// Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
@@ -448,8 +547,8 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
// Load scales
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
@@ -473,18 +572,18 @@ static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
// Process leftovers
if (nloe) {
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size);
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe);
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe);
HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy0_q, nloe));
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy1_q, nloe));
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy0_q, nloe));
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy1_q, nloe));
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
@@ -545,7 +644,7 @@ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
// Row sum (sf)
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
HVX_Vector r0_sum = Q6_V_vzero();
// Multiply and accumulate into int32.
// Compute combined scale (fp32).
@@ -556,12 +655,12 @@ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
uint32_t i = 0;
for (; i < nb; i++) {
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size);
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
@@ -571,19 +670,19 @@ static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const vo
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
}
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
// Process leftovers
if (nloe) {
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe));
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
// Zero out unused scales
// Zero out unused elements
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
@@ -625,8 +724,8 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0,
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
// Row sum (qf32)
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
HVX_Vector r1_sum = Q6_V_vsplat_R(0);
HVX_Vector r0_sum = Q6_V_vzero();
HVX_Vector r1_sum = Q6_V_vzero();
// Multiply and accumulate into int32.
// Compute combined scale (fp32).
@@ -637,14 +736,14 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0,
uint32_t i = 0;
for (; i < nb; i++) {
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size);
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size);
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
@@ -658,14 +757,14 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0,
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
}
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
// Process leftovers
if (nloe) {
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size);
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe);
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe));
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe));
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe));
HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
@@ -674,7 +773,7 @@ static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0,
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d)));
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d)));
// Zero out unused scales
// Zero out unused elements
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
@@ -722,10 +821,10 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales
// Row sums (sf) - 4 accumulators for 2×2 tile
HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0);
HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0);
HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0);
HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0);
HVX_Vector r0_c0_sum = Q6_V_vzero();
HVX_Vector r0_c1_sum = Q6_V_vzero();
HVX_Vector r1_c0_sum = Q6_V_vzero();
HVX_Vector r1_c1_sum = Q6_V_vzero();
const uint32_t nb = n / qk; // num full blocks
const uint32_t nloe = n % qk; // num leftover elements
@@ -733,12 +832,12 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
uint32_t i = 0;
for (; i < nb; i++) {
// Load src1 columns (reused across both src0 rows)
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
// Load src0 rows (reused across both src1 columns)
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size);
// Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
@@ -747,8 +846,8 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q));
// Load scales
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
@@ -772,18 +871,18 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
// Process leftovers
if (nloe) {
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8(r1_x_q + i * x_qblk_size);
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe);
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe);
HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy0_q, nloe));
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy1_q, nloe));
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy0_q, nloe));
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy1_q, nloe));
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size));
HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size));
HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size));
HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size));
@@ -792,7 +891,7 @@ static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float *
HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d)));
HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d)));
// Zero out unused scales
// Zero out unused elements
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd);
r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd);
@@ -844,7 +943,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const
const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales
// Row sum (sf)
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
HVX_Vector r0_sum = Q6_V_vzero();
// Multiply and accumulate into int32.
// Compute combined scale (fp32).
@@ -855,8 +954,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const
uint32_t i = 0;
for (; i < nb; i++) {
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full( y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size);
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
@@ -887,12 +986,12 @@ static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const
// Process leftovers
if (nloe) {
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial( y_q + i * y_qblk_size, nloe);
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe));
HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
// Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving
@@ -954,8 +1053,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
const uint8_t * restrict y_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales
// Row sum (sf)
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
HVX_Vector r1_sum = Q6_V_vsplat_R(0);
HVX_Vector r0_sum = Q6_V_vzero();
HVX_Vector r1_sum = Q6_V_vzero();
// Multiply and accumulate into int32.
// Compute combined scale (fp32).
@@ -966,9 +1065,9 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
uint32_t i = 0;
for (; i < nb; i++) {
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size);
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full( y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size);
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
@@ -1007,14 +1106,14 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0,
// Process leftovers
if (nloe) {
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size);
HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial( y_q + i * y_qblk_size, nloe);
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));
HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size);
HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size);
HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size);
@@ -1087,10 +1186,10 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float
const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales
// Row sums (sf) - 4 accumulators for 2×2 tile
HVX_Vector r0_c0_sum = Q6_V_vsplat_R(0);
HVX_Vector r0_c1_sum = Q6_V_vsplat_R(0);
HVX_Vector r1_c0_sum = Q6_V_vsplat_R(0);
HVX_Vector r1_c1_sum = Q6_V_vsplat_R(0);
HVX_Vector r0_c0_sum = Q6_V_vzero();
HVX_Vector r0_c1_sum = Q6_V_vzero();
HVX_Vector r1_c0_sum = Q6_V_vzero();
HVX_Vector r1_c1_sum = Q6_V_vzero();
const uint32_t nb = n / qk; // num full blocks
const uint32_t nloe = n % qk; // num leftover elements
@@ -1098,12 +1197,12 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float
uint32_t i = 0;
for (; i < nb; i++) {
// Load src1 columns (reused across both src0 rows)
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size);
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size);
// Load src0 rows (reused across both src1 columns)
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size);
// Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q));
@@ -1157,15 +1256,15 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float
// Process leftovers
if (nloe) {
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8(y0_q + i * y_qblk_size);
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8(y1_q + i * y_qblk_size);
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8(r0_x_q + i * x_qblk_size);
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8(r1_x_q + i * x_qblk_size);
HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial( y0_q + i * y_qblk_size, nloe);
HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial( y1_q + i * y_qblk_size, nloe);
HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe);
HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe);
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy0_q, nloe));
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy1_q, nloe));
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy0_q, nloe));
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy1_q, nloe));
HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe));
HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe));
HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe));
HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe));
HVX_Vector vy0_d = *(const HVX_UVector *) (y0_d + i * y_dblk_size);
HVX_Vector vy1_d = *(const HVX_UVector *) (y1_d + i * y_dblk_size);
@@ -1234,7 +1333,7 @@ static void vec_dot_f16_f16_aa_1x1(const int n, float * restrict s, const void *
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
uint32_t nloe = n % VLEN_FP16; // leftover elements
HVX_VectorPair rsum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
HVX_VectorPair rsum_p = Q6_W_vzero();
uint32_t i = 0;
@@ -1264,8 +1363,8 @@ static void vec_dot_f16_f16_aa_2x1(const int n, float * restrict s0,
uint32_t nvec = n / VLEN_FP16;
uint32_t nloe = n % VLEN_FP16;
HVX_VectorPair rsum0_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
HVX_VectorPair rsum1_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
HVX_VectorPair rsum0_p = Q6_W_vzero();
HVX_VectorPair rsum1_p = Q6_W_vzero();
uint32_t i = 0;
@@ -1303,10 +1402,10 @@ static void vec_dot_f16_f16_aa_2x2(const int n, float * restrict s0, float * res
uint32_t nloe = n % VLEN_FP16;
// Row sums (sf) - 4 accumulators for 2×2 tile
HVX_VectorPair r0_c0_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
HVX_VectorPair r0_c1_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
HVX_VectorPair r1_c0_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
HVX_VectorPair r1_c1_sum_p = Q6_W_vcombine_VV(Q6_V_vsplat_R(0), Q6_V_vsplat_R(0));
HVX_VectorPair r0_c0_sum_p = Q6_W_vzero();
HVX_VectorPair r0_c1_sum_p = Q6_W_vzero();
HVX_VectorPair r1_c0_sum_p = Q6_W_vzero();
HVX_VectorPair r1_c1_sum_p = Q6_W_vzero();
uint32_t i = 0;
@@ -1358,7 +1457,7 @@ static void vec_dot_f16_f16_uu_1x1(const int n, float * restrict s, const void *
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
uint32_t nloe = n % VLEN_FP16; // leftover elements
HVX_Vector rsum = Q6_V_vsplat_R(0);
HVX_Vector rsum = Q6_V_vzero();
uint32_t i = 0;
@@ -1388,9 +1487,9 @@ static void vec_dot_f16_f32_uu_1x1(const int n, float * restrict s, const void *
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
uint32_t nloe = n % VLEN_FP16; // leftover elements
const HVX_Vector zero = Q6_V_vsplat_R(0);
const HVX_Vector zero = Q6_V_vzero();
HVX_Vector rsum = Q6_V_vsplat_R(0);
HVX_Vector rsum = Q6_V_vzero();
uint32_t i = 0;
@@ -1973,7 +2072,7 @@ static inline void quantize_block_f32_q8x1(float * restrict x, uint8_t * restric
assert((unsigned long) y_q % 128 == 0);
HVX_Vector * vx = (HVX_Vector *) x;
HVX_Vector zero = Q6_V_vsplat_R(0);
HVX_Vector zero = Q6_V_vzero();
// Use reduce max fp32 to find max(abs(e)) first
HVX_Vector vmax0_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[0]));
@@ -2034,7 +2133,7 @@ static inline void quantize_block_f32_q8x2(float * restrict x, uint8_t * restric
HVX_Vector * vx = (HVX_Vector *) x;
// Load and convert into QF32
HVX_Vector zero = Q6_V_vsplat_R(0);
HVX_Vector zero = Q6_V_vzero();
HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements
HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements
HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements
@@ -2077,7 +2176,7 @@ static inline void quantize_block_f32_q8x4(float * restrict x, uint8_t * restric
HVX_Vector * vx = (HVX_Vector *) x;
// Load and convert into QF32
HVX_Vector zero = Q6_V_vsplat_R(0);
HVX_Vector zero = Q6_V_vzero();
HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements
HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements
HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements

View File

@@ -11,6 +11,10 @@ endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
if (NOT DEFINED CMAKE_HIP_FLAGS_DEBUG)
set(CMAKE_HIP_FLAGS_DEBUG "-g -O2")
endif()
# CMake on Windows doesn't support the HIP language yet
if (WIN32)
set(CXX_IS_HIPCC TRUE)

View File

@@ -491,6 +491,61 @@ static inline float ggml_e8m0_to_fp32_half(uint8_t x) {
#define GGML_E8M0_TO_FP32(x) ggml_e8m0_to_fp32(x)
#define GGML_E8M0_TO_FP32_HALF(x) ggml_e8m0_to_fp32_half(x)
// UE4M3: unsigned, 4 exp bits (bias=7), 3 mantissa bits
// Returns value * 0.5 to match kvalues_mxfp4 convention (kvalues = 2 * E2M1_float)
static inline float ggml_ue4m3_to_fp32(uint8_t x) {
if (x == 0 || x == 0x7F) {
return 0.0f;
}
int exp = (x >> 3) & 0xF;
int man = x & 0x7;
float raw;
if (exp == 0) {
raw = ldexpf((float) man, -9);
} else {
raw = ldexpf(1.0f + (float) man / 8.0f, exp - 7);
}
return raw * 0.5f;
}
static inline uint8_t ggml_fp32_to_ue4m3(float x) {
if (!(x > 0.0f)) {
return 0;
}
if (x > 448.0f) {
x = 448.0f;
}
uint32_t bits;
memcpy(&bits, &x, 4);
int fp32_exp = ((bits >> 23) & 0xFF) - 127;
int fp32_man = (bits >> 20) & 0x7;
int ue4m3_exp = fp32_exp + 7;
if (ue4m3_exp <= 0) {
// subnormal: value = man * 2^-9, man = round(x * 2^9)
int man = (int) (x * 512.0f + 0.5f);
if (man > 7) {
man = 7;
}
if (man < 1) {
return 0;
}
return (uint8_t) man;
}
if (ue4m3_exp >= 15) {
return 0x7E;
}
int round_bit = (bits >> 19) & 1;
int ue4m3_man = fp32_man + round_bit;
if (ue4m3_man > 7) {
ue4m3_man = 0;
ue4m3_exp++;
if (ue4m3_exp >= 15) {
return 0x7E;
}
}
return (uint8_t) ((ue4m3_exp << 3) | ue4m3_man);
}
/**
* Converts brain16 to float32.
*

View File

@@ -47,7 +47,7 @@ struct ggml_metal {
uint64_t fuse_cnt[GGML_OP_COUNT];
// capture state
bool capture_next_compute;
int capture_compute;
bool capture_started;
id<MTLCaptureScope> capture_scope;
@@ -75,6 +75,10 @@ struct ggml_metal {
// abort ggml_metal_graph_compute if callback returns true
ggml_abort_callback abort_callback;
void * abort_callback_data;
// error state - set when a command buffer fails during synchronize
// once set, graph_compute will return GGML_STATUS_FAILED until the backend is recreated
bool has_error;
};
ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
@@ -154,10 +158,19 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
GGML_LOG_INFO("%s: use concurrency = %s\n", __func__, res->use_concurrency ? "true" : "false");
GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? "true" : "false");
res->capture_next_compute = false;
res->capture_compute = 0;
res->capture_started = false;
res->capture_scope = nil;
{
const char * val = getenv("GGML_METAL_CAPTURE_COMPUTE");
if (val) {
res->capture_compute = atoi(val);
}
}
res->has_error = false;
res->gf = nil;
res->encode_async = nil;
for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) {
@@ -246,7 +259,8 @@ void ggml_metal_synchronize(ggml_metal_t ctx) {
if (status == MTLCommandBufferStatusError) {
GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
}
GGML_ABORT("fatal error");
ctx->has_error = true;
return;
}
}
}
@@ -262,7 +276,15 @@ void ggml_metal_synchronize(ggml_metal_t ctx) {
if (status == MTLCommandBufferStatusError) {
GGML_LOG_ERROR("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]);
}
GGML_ABORT("fatal error");
// release this and all remaining command buffers before returning
for (size_t j = i; j < ctx->cmd_bufs_ext.count; ++j) {
[ctx->cmd_bufs_ext[j] release];
}
[ctx->cmd_bufs_ext removeAllObjects];
ctx->has_error = true;
return;
}
[cmd_buf release];
@@ -414,6 +436,11 @@ bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, con
}
enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) {
if (ctx->has_error) {
GGML_LOG_ERROR("%s: backend is in error state from a previous command buffer failure - recreate the backend to recover\n", __func__);
return GGML_STATUS_FAILED;
}
// number of nodes encoded by the main thread (empirically determined)
const int n_main = MAX(64, 0.1*gf->n_nodes);
@@ -438,9 +465,13 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
ctx->n_nodes_per_cb = (ctx->n_nodes_1 + ctx->n_cb - 1) / ctx->n_cb;
const bool use_capture = ctx->capture_next_compute;
if (ctx->capture_compute >= 0) {
ctx->capture_compute--;
}
const bool use_capture = ctx->capture_compute == 0;
if (use_capture) {
ctx->capture_next_compute = false;
ctx->capture_compute = -1;
// make sure all previous computations have finished before starting the capture
if (ctx->cmd_buf_last) {
@@ -449,6 +480,10 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
}
if (!ctx->capture_started) {
NSString * path = [NSString stringWithFormat:@"/tmp/perf-metal-%d.gputrace", getpid()];
GGML_LOG_WARN("%s: capturing graph in %s\n", __func__, [path UTF8String]);
// create capture scope
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:device];
@@ -456,7 +491,7 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
descriptor.captureObject = ctx->capture_scope;
descriptor.destination = MTLCaptureDestinationGPUTraceDocument;
descriptor.outputURL = [NSURL fileURLWithPath:[NSString stringWithFormat:@"/tmp/perf-metal.gputrace"]];
descriptor.outputURL = [NSURL fileURLWithPath:path];
NSError * error = nil;
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
@@ -519,7 +554,7 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
// enter here only when capturing in order to wait for all computation to finish
// otherwise, we leave the graph to compute asynchronously
if (!use_capture && ctx->capture_started) {
if (use_capture && ctx->capture_started) {
// wait for completion and check status of each command buffer
// needed to detect if the device ran out-of-memory for example (#1881)
{
@@ -571,6 +606,8 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
[ctx->capture_scope endScope];
[[MTLCaptureManager sharedCaptureManager] stopCapture];
ctx->capture_started = false;
}
}
@@ -663,7 +700,7 @@ void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {
idx_end,
ctx->use_fusion,
ctx->use_concurrency,
ctx->capture_next_compute,
ctx->capture_compute,
ctx->debug_graph,
ctx->debug_fusion);
@@ -698,5 +735,5 @@ bool ggml_metal_supports_family(ggml_metal_t ctx, int family) {
}
void ggml_metal_capture_next_compute(ggml_metal_t ctx) {
ctx->capture_next_compute = true;
ctx->capture_compute = 1;
}

View File

@@ -577,6 +577,41 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv(ggml_metal_
return res;
}
// Look up (or lazily compile) the gated-delta-net pipeline specialized for this
// op's head dimension S_v and gate width G via Metal function constants.
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_gated_delta_net(ggml_metal_library_t lib, const ggml_tensor * op) {
    // v is src[2], dimensions: S_v = ne[0], H = ne[1]
    const int ne20 = op->src[2]->ne[0]; // S_v
    const int ne21 = op->src[2]->ne[1]; // H
    const int ne30 = op->src[3]->ne[0]; // G

    // one simdgroup (32 lanes) covers 32 state elements -> S_v/32 simdgroups
    const int nsg = ne20/32;

    GGML_ASSERT(op->src[5]->type == GGML_TYPE_F32);
    GGML_ASSERT(op->ne[0] == ne20 * ne21);
    GGML_ASSERT(ne20 % 32 == 0);

    char base[256];
    char name[256];

    snprintf(base, sizeof(base), "kernel_gated_delta_net_%s_%d", ggml_type_name(op->src[0]->type), nsg);
    snprintf(name, sizeof(name), "%s_ne20=%d_ne30=%d", base, ne20, ne30);

    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
    if (res.pipeline == nullptr) {
        // first use of this specialization - compile it with S_v and G baked in
        ggml_metal_cv_t cv = ggml_metal_cv_init();

        ggml_metal_cv_set_int16(cv, ne20, FC_GATED_DELTA_NET + 0);
        ggml_metal_cv_set_int16(cv, ne30, FC_GATED_DELTA_NET + 1);

        res = ggml_metal_library_compile_pipeline(lib, base, name, cv);

        ggml_metal_cv_free(cv);
    }

    res.nsg = nsg;

    return res;
}
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri(ggml_metal_library_t lib, const ggml_tensor * op) {
char base[256];
char name[256];
@@ -1435,10 +1470,11 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin(ggml_metal_l
const bool is_c4 = (op->src[0]->ne[0] % 4 == 0) && (op->src[1]->ne[0] % 4 == 0);
const bool is_cb = op->src[0]->ne[0] != op->src[1]->ne[0];
const bool is_rb = ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) && (ggml_nrows(op->src[1]) == 1) && ggml_nelements(op) < 65536;
snprintf(base, 256, "kernel_bin_fuse_%s_%s_%s%s", t0_str, t1_str, t_str, is_c4 ? "_4" : "");
snprintf(name, 256, "%s_op=%d_nf=%d_rb=%d", base, op_num, n_fuse, is_rb);
snprintf(name, 256, "%s_op=%d_nf=%d_rb=%d_cb=%d", base, op_num, n_fuse, is_rb, is_cb);
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
if (!res.pipeline) {
@@ -1447,6 +1483,7 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin(ggml_metal_l
ggml_metal_cv_set_int16(cv, op_num, FC_BIN + 0);
ggml_metal_cv_set_int16(cv, n_fuse, FC_BIN + 1);
ggml_metal_cv_set_bool (cv, is_rb, FC_BIN + 2);
ggml_metal_cv_set_bool (cv, is_cb, FC_BIN + 3);
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
@@ -1717,12 +1754,29 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_upscale(ggml_met
char base[256];
char name[256];
snprintf(base, 256, "kernel_upscale_%s", ggml_type_name(op->src[0]->type));
snprintf(name, 256, "%s", base);
const int32_t mode_flags = ggml_get_op_params_i32(op, 0);
const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
const bool antialias = (mode_flags & GGML_SCALE_FLAG_ANTIALIAS);
if (mode == GGML_SCALE_MODE_BILINEAR) {
snprintf(base, 256, "kernel_upscale_bilinear_%s", ggml_type_name(op->src[0]->type));
} else if (mode == GGML_SCALE_MODE_BICUBIC) {
snprintf(base, 256, "kernel_upscale_bicubic_%s", ggml_type_name(op->src[0]->type));
} else {
snprintf(base, 256, "kernel_upscale_nearest_%s", ggml_type_name(op->src[0]->type));
}
snprintf(name, 256, "%s_aa=%d", base, antialias);
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
if (!res.pipeline) {
res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
ggml_metal_cv_t cv = ggml_metal_cv_init();
ggml_metal_cv_set_bool(cv, antialias, FC_UPSCALE + 0);
res = ggml_metal_library_compile_pipeline(lib, base, name, cv);
ggml_metal_cv_free(cv);
}
return res;

View File

@@ -125,6 +125,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched (ggml_metal_library_t lib, const struct ggml_tensor * op, int ssm_conv_bs);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_gated_delta_net (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm (ggml_metal_library_t lib, const struct ggml_tensor * op);

View File

@@ -1108,7 +1108,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
op->type == GGML_TYPE_F32 &&
(op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32);
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS);
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_POOL_1D:
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_POOL_2D:
@@ -1142,6 +1142,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
op->src[0]->ne[0] != 128 &&
op->src[0]->ne[0] != 192 &&
op->src[0]->ne[0] != 256 &&
op->src[0]->ne[0] != 320 &&
op->src[0]->ne[0] != 576) {
return false;
}
@@ -1155,10 +1156,12 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
case GGML_OP_RWKV_WKV6:
case GGML_OP_RWKV_WKV7:
return true;
case GGML_OP_GATED_DELTA_NET:
return has_simdgroup_reduction && op->src[2]->ne[0] % 32 == 0;
case GGML_OP_SOLVE_TRI:
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
return has_simdgroup_reduction;
return has_simdgroup_reduction && op->src[0]->type != GGML_TYPE_NVFP4;
case GGML_OP_SET:
case GGML_OP_CPY:
case GGML_OP_DUP:
@@ -1216,7 +1219,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
};
}
case GGML_OP_GET_ROWS:
return true;
return op->src[0]->type != GGML_TYPE_NVFP4;
case GGML_OP_SET_ROWS:
{
if (op->src[0]->type != GGML_TYPE_F32) {

View File

@@ -35,7 +35,7 @@
#define N_R0_Q4_K 2
#define N_SG_Q4_K 2
#define N_R0_Q5_K 2
#define N_R0_Q5_K 1
#define N_SG_Q5_K 2
#define N_R0_Q6_K 2
@@ -83,6 +83,8 @@
#define FC_UNARY 1200
#define FC_BIN 1300
#define FC_SUM_ROWS 1400
#define FC_UPSCALE 1500
#define FC_GATED_DELTA_NET 1600
// op-specific constants
#define OP_FLASH_ATTN_EXT_NQPSG 8
@@ -792,6 +794,44 @@ typedef struct {
uint64_t nb0;
} ggml_metal_kargs_ssm_scan;
// kernel arguments for GGML_OP_GATED_DELTA_NET
// (populated by ggml_metal_op_gated_delta_net; nbXY are byte strides)
typedef struct {
    // src[0] (q): shape / byte strides
    int32_t  ne00;
    int32_t  ne01;
    int32_t  ne02;
    int32_t  ne03;
    uint64_t nb00;
    uint64_t nb01;
    uint64_t nb02;
    uint64_t nb03;
    // src[1] (k): shape / byte strides
    int32_t  ne10;
    int32_t  ne11;
    int32_t  ne12;
    int32_t  ne13;
    uint64_t nb10;
    uint64_t nb11;
    uint64_t nb12;
    uint64_t nb13;
    // src[2] (v): shape / byte strides
    int32_t  ne20;
    int32_t  ne21;
    int32_t  ne22;
    int32_t  ne23;
    uint64_t nb20;
    uint64_t nb21;
    uint64_t nb22;
    uint64_t nb23;
    // dim-2 strides of q/k/v expressed in float elements (nbX2/sizeof(float)) -
    // used by the kernel to advance the per-token pointers
    int32_t  ns02;
    int32_t  ns12;
    int32_t  ns22;
    // dst: shape / byte strides
    int32_t  ne0;
    int32_t  ne1;
    int32_t  ne2;
    int32_t  ne3;
    uint64_t nb0;
    uint64_t nb1;
    uint64_t nb2;
    uint64_t nb3;
} ggml_metal_kargs_gated_delta_net;
typedef struct {
int32_t ne00;
int32_t ne01;
@@ -890,6 +930,7 @@ typedef struct {
float sf1;
float sf2;
float sf3;
float poffs;
} ggml_metal_kargs_upscale;
typedef struct {

View File

@@ -333,6 +333,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
{
n_fuse = ggml_metal_op_rwkv(ctx, idx);
} break;
case GGML_OP_GATED_DELTA_NET:
{
n_fuse = ggml_metal_op_gated_delta_net(ctx, idx);
} break;
case GGML_OP_SOLVE_TRI:
{
n_fuse = ggml_metal_op_solve_tri(ctx, idx);
@@ -1562,6 +1566,81 @@ int ggml_metal_op_rwkv(ggml_metal_op_t ctx, int idx) {
return 1;
}
// Encode a GGML_OP_GATED_DELTA_NET node.
// Inputs: src[0]=q, src[1]=k, src[2]=v, src[3]=gate, src[4]=beta, src[5]=state.
// dst receives the attention output followed by the updated state (see the
// corresponding Metal kernel). Returns the number of fused ops (always 1).
int ggml_metal_op_gated_delta_net(ggml_metal_op_t ctx, int idx) {
    ggml_tensor * op = ctx->node(idx);

    ggml_metal_library_t lib = ctx->lib;
    ggml_metal_encoder_t enc = ctx->enc;

    // shapes/strides of q (ne0*/nb0*), k (ne1*/nb1*), v (ne2*/nb2*) and dst (ne/nb)
    GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
    GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
    GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne);
    GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb);
    GGML_TENSOR_LOCALS( int32_t, ne2, op->src[2], ne);
    GGML_TENSOR_LOCALS(uint64_t, nb2, op->src[2], nb);
    GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
    GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);

    auto pipeline = ggml_metal_library_get_pipeline_gated_delta_net(lib, op);

    int ida = 0;

    // NOTE: positional initialization - field order must match
    // ggml_metal_kargs_gated_delta_net exactly
    ggml_metal_kargs_gated_delta_net args = {
        /*.ne00 =*/ ne00,
        /*.ne01 =*/ ne01,
        /*.ne02 =*/ ne02,
        /*.ne03 =*/ ne03,
        /*.nb00 =*/ nb00,
        /*.nb01 =*/ nb01,
        /*.nb02 =*/ nb02,
        /*.nb03 =*/ nb03,
        /*.ne10 =*/ ne10,
        /*.ne11 =*/ ne11,
        /*.ne12 =*/ ne12,
        /*.ne13 =*/ ne13,
        /*.nb10 =*/ nb10,
        /*.nb11 =*/ nb11,
        /*.nb12 =*/ nb12,
        /*.nb13 =*/ nb13,
        /*.ne20 =*/ ne20,
        /*.ne21 =*/ ne21,
        /*.ne22 =*/ ne22,
        /*.ne23 =*/ ne23,
        /*.nb20 =*/ nb20,
        /*.nb21 =*/ nb21,
        /*.nb22 =*/ nb22,
        /*.nb23 =*/ nb23,
        // per-token strides of q/k/v converted from bytes to float elements
        /*.ns02 =*/ (int32_t) (nb02/sizeof(float)),
        /*.ns12 =*/ (int32_t) (nb12/sizeof(float)),
        /*.ns22 =*/ (int32_t) (nb22/sizeof(float)),
        /*.ne0 =*/ ne0,
        /*.ne1 =*/ ne1,
        /*.ne2 =*/ ne2,
        /*.ne3 =*/ ne3,
        /*.nb0 =*/ nb0,
        /*.nb1 =*/ nb1,
        /*.nb2 =*/ nb2,
        /*.nb3 =*/ nb3,
    };

    ggml_metal_encoder_set_pipeline(enc, pipeline);
    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), ida++);
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), ida++); // q
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), ida++); // k
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[2]), ida++); // v
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[3]), ida++); // gate
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[4]), ida++); // beta
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[5]), ida++); // state
    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         ida++); // dst

    const int nsg = pipeline.nsg;

    // grid: (S_v/nsg, H, ne23); each threadgroup is 32 x nsg threads - nsg rows
    // of the state, one simdgroup (32 lanes) per row
    ggml_metal_encoder_dispatch_threadgroups(enc, op->src[2]->ne[0]/nsg, op->src[2]->ne[1], op->src[2]->ne[3], 32, nsg, 1);

    return 1;
}
int ggml_metal_op_solve_tri(ggml_metal_op_t ctx, int idx) {
ggml_tensor * op = ctx->node(idx);
@@ -1963,6 +2042,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
(
op->src[0]->type == GGML_TYPE_F32 || // TODO: helper function
op->src[0]->type == GGML_TYPE_F16 ||
op->src[0]->type == GGML_TYPE_BF16 ||
op->src[0]->type == GGML_TYPE_Q4_0 ||
op->src[0]->type == GGML_TYPE_Q4_1 ||
op->src[0]->type == GGML_TYPE_Q5_0 ||
@@ -1977,6 +2057,8 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
op->src[0]->type == GGML_TYPE_Q4_K ||
op->src[0]->type == GGML_TYPE_Q5_K ||
op->src[0]->type == GGML_TYPE_Q6_K ||
op->src[0]->type == GGML_TYPE_Q2_K ||
op->src[0]->type == GGML_TYPE_Q3_K ||
false) && (ne11 >= 4 && ne11 <= 8)
)
)
@@ -3098,9 +3180,7 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {
ggml_metal_encoder_set_buffer (enc, bid_dst, 3);
if (pipeline.cnt) {
const int n = pipeline.c4 ? ggml_nelements(op)/4 : ggml_nelements(op);
ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
ggml_metal_encoder_dispatch_threadgroups(enc, args.ne0, ggml_nrows(op), 1, 1, 1, 1);
} else {
const int nth_max = MIN(256, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
@@ -3729,32 +3809,43 @@ int ggml_metal_op_upscale(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
const float sf0 = (float)ne0/op->src[0]->ne[0];
const float sf1 = (float)ne1/op->src[0]->ne[1];
const float sf2 = (float)ne2/op->src[0]->ne[2];
const float sf3 = (float)ne3/op->src[0]->ne[3];
float sf0 = (float)ne0/op->src[0]->ne[0];
float sf1 = (float)ne1/op->src[0]->ne[1];
float sf2 = (float)ne2/op->src[0]->ne[2];
float sf3 = (float)ne3/op->src[0]->ne[3];
const int32_t mode_flags = ggml_get_op_params_i32(op, 0);
float poffs = 0.5f;
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
poffs = 0.0f;
sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
}
ggml_metal_kargs_upscale args = {
/*.ne00 =*/ ne00,
/*.ne01 =*/ ne01,
/*.ne02 =*/ ne02,
/*.ne03 =*/ ne03,
/*.nb00 =*/ nb00,
/*.nb01 =*/ nb01,
/*.nb02 =*/ nb02,
/*.nb03 =*/ nb03,
/*.ne0 =*/ ne0,
/*.ne1 =*/ ne1,
/*.ne2 =*/ ne2,
/*.ne3 =*/ ne3,
/*.nb0 =*/ nb0,
/*.nb1 =*/ nb1,
/*.nb2 =*/ nb2,
/*.nb3 =*/ nb3,
/*.sf0 =*/ sf0,
/*.sf1 =*/ sf1,
/*.sf2 =*/ sf2,
/*.sf3 =*/ sf3
/*.ne00 =*/ ne00,
/*.ne01 =*/ ne01,
/*.ne02 =*/ ne02,
/*.ne03 =*/ ne03,
/*.nb00 =*/ nb00,
/*.nb01 =*/ nb01,
/*.nb02 =*/ nb02,
/*.nb03 =*/ nb03,
/*.ne0 =*/ ne0,
/*.ne1 =*/ ne1,
/*.ne2 =*/ ne2,
/*.ne3 =*/ ne3,
/*.nb0 =*/ nb0,
/*.nb1 =*/ nb1,
/*.nb2 =*/ nb2,
/*.nb3 =*/ nb3,
/*.sf0 =*/ sf0,
/*.sf1 =*/ sf1,
/*.sf2 =*/ sf2,
/*.sf3 =*/ sf3,
/*.poffs =*/ poffs,
};
auto pipeline = ggml_metal_library_get_pipeline_upscale(lib, op);

View File

@@ -58,6 +58,7 @@ int ggml_metal_op_soft_max (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_ssm_conv (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_ssm_scan (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_rwkv (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_gated_delta_net (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_solve_tri (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_set (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_cpy (ggml_metal_op_t ctx, int idx);

View File

@@ -1111,6 +1111,7 @@ template [[host_name("kernel_unary_f16_f16_4")]] kernel kernel_unary_t kernel_un
constant short FC_bin_op [[function_constant(FC_BIN + 0)]];
constant short FC_bin_f [[function_constant(FC_BIN + 1)]];
constant bool FC_bin_rb [[function_constant(FC_BIN + 2)]];
constant bool FC_bin_cb [[function_constant(FC_BIN + 3)]];
template <typename T0, typename T1, typename T>
kernel void kernel_bin_fuse_impl(
@@ -1124,11 +1125,12 @@ kernel void kernel_bin_fuse_impl(
#define FC_OP FC_bin_op
#define FC_F FC_bin_f
#define FC_RB FC_bin_rb
#define FC_CB FC_bin_cb
if (FC_RB) {
// row broadcast
const uint i0 = tgpig.x;
const uint i1 = i0%args.ne10;
const uint i0 = tgpig.y*args.ne00 + tgpig.x;
const uint i1 = FC_CB ? tgpig.x%args.ne10 : tgpig.x;
device const T0 * src0_row = (device const T0 *) (src0);
device T * dst_row = (device T *) (dst);
@@ -1200,7 +1202,7 @@ kernel void kernel_bin_fuse_impl(
device const T1 * src1_ptr = (device const T1 *) (src1 + args.o1[0] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11);
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
const int i10 = i0%args.ne10;
const int i10 = FC_CB ? i0%args.ne10 : i0;
if (FC_OP == 0) {
dst_ptr[i0] = src0_ptr[i0] + src1_ptr[i10];
@@ -1225,7 +1227,7 @@ kernel void kernel_bin_fuse_impl(
}
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
const int i10 = i0%args.ne10;
const int i10 = FC_CB ? i0%args.ne10 : i0;
T res = src0_ptr[i0];
@@ -1261,6 +1263,7 @@ kernel void kernel_bin_fuse_impl(
#undef FC_OP
#undef FC_F
#undef FC_RB
#undef FC_CB
}
typedef decltype(kernel_bin_fuse_impl<float, float, float>) kernel_bin_fuse_t;
@@ -2434,6 +2437,228 @@ kernel void kernel_rwkv_wkv7_f32(
}
}
// function constants baked in at pipeline-compile time:
// head dimension S_v (= src[2]->ne[0]) and gate width G (= src[3]->ne[0])
constant short FC_gated_delta_net_ne20 [[function_constant(FC_GATED_DELTA_NET + 0)]];
constant short FC_gated_delta_net_ne30 [[function_constant(FC_GATED_DELTA_NET + 1)]];

#if 1
// Gated delta-net recurrence, scanned sequentially over the tokens (args.ne22).
// Grid: (S_v/NSG, H, ne23). Each threadgroup is 32 x NSG threads: ty selects one
// of NSG state rows (i20) and the 32 lanes (tx) of that simdgroup each hold NSG
// elements of the row in registers (ls[]), so a full row of the S_v x S_v state
// stays resident for the whole token loop.
template<short NSG>
kernel void kernel_gated_delta_net_impl(
        constant ggml_metal_kargs_gated_delta_net & args,
        device const char * q,
        device const char * k,
        device const char * v,
        device const char * g,
        device const char * b,
        device const char * s,
        device       char * dst,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3 ntg[[threads_per_threadgroup]]) {
#define S_v FC_gated_delta_net_ne20
#define G   FC_gated_delta_net_ne30

    const uint tx = tpitg.x; // simd lane
    const uint ty = tpitg.y; // state row within the threadgroup

    const uint i23 = tgpig.z; // B
    const uint i21 = tgpig.y; // H
    const uint i20 = tgpig.x*NSG + ty;

    // q/k heads may be broadcast over the v heads
    const uint i01 = i21 % args.ne01;
    const uint i11 = i21 % args.ne11;

    const float scale = 1.0f / sqrt((float)S_v);

    // state is stored transposed: M[i20][is] = S[is][i20], so row i20 is contiguous
    device const float * s_ptr = (device const float *) (s) + (i23*args.ne21 + i21)*S_v*S_v + i20*S_v;

    // load this thread's NSG-element slice of the state row into registers
    float ls[NSG];
    FOR_UNROLL (short j = 0; j < NSG; j++) {
        const short is = tx*NSG + j;
        ls[j] = s_ptr[is];
    }

    device float * dst_attn = (device float *) (dst) + (i23*args.ne22*args.ne21 + i21)*S_v + i20;

    device const float * q_ptr = (device const float *) (q + i23*args.nb03 + i01*args.nb01);
    device const float * k_ptr = (device const float *) (k + i23*args.nb13 + i11*args.nb11);
    device const float * v_ptr = (device const float *) (v + i23*args.nb23 + i21*args.nb21);
    device const float * b_ptr = (device const float *) (b) + (i23*args.ne22*args.ne21 + i21);
    device const float * g_ptr = (device const float *) (g) + (i23*args.ne22*args.ne21 + i21)*G;

    // sequential scan over the tokens
    for (short t = 0; t < args.ne22; t++) {
        // decay the state and accumulate the partial dot product (S*k)[i20]
        float s_k = 0.0f;

        if (G == 1) {
            // single gate per head
            const float g_exp = exp(g_ptr[0]);

            FOR_UNROLL (short j = 0; j < NSG; j++) {
                const short is = tx*NSG + j;

                ls[j] *= g_exp;
                s_k += ls[j]*k_ptr[is];
            }
        } else {
            // KDA
            FOR_UNROLL (short j = 0; j < NSG; j++) {
                const short is = tx*NSG + j;

                ls[j] *= exp(g_ptr[is]);
                s_k += ls[j]*k_ptr[is];
            }
        }

        // reduce the partial dot product across the simdgroup
        s_k = simd_sum(s_k);

        // delta rule: d = (v - S*k) * beta
        const float d = (v_ptr[i20] - s_k)*b_ptr[0];

        // rank-1 state update, then the output y = (S*q)[i20]
        float y = 0.0f;

        FOR_UNROLL (short j = 0; j < NSG; j++) {
            const short is = tx*NSG + j;

            ls[j] += k_ptr[is]*d;
            y += ls[j]*q_ptr[is];
        }

        y = simd_sum(y);

        // lane 0 writes this token's output element
        if (tx == 0) {
            dst_attn[t*args.ne21*S_v] = y*scale;
        }

        // advance the per-token pointers (ns* strides are in float elements)
        q_ptr += args.ns02;
        k_ptr += args.ns12;
        v_ptr += args.ns22;
        b_ptr += args.ne21;
        g_ptr += args.ne21*G;
    }

    // the final state is written after the attention output in dst
    device float * dst_state = (device float *) (dst) + args.ne23*args.ne22*args.ne21*S_v + (i23*args.ne21 + i21)*S_v*S_v + i20*S_v;

    FOR_UNROLL (short j = 0; j < NSG; j++) {
        const short is = tx*NSG + j;
        dst_state[is] = ls[j];
    }

#undef S_v
#undef G
}

typedef decltype(kernel_gated_delta_net_impl<4>) kernel_gated_delta_net_t;

template [[host_name("kernel_gated_delta_net_f32_1")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<1>;
template [[host_name("kernel_gated_delta_net_f32_2")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<2>;
template [[host_name("kernel_gated_delta_net_f32_4")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<4>;
#else
// a simplified version of the above
// no performance improvement, so keep the above version for now
// NOTE(review): dead branch (the '#if 1' above selects the first version).
// Here each thread keeps its NSG state elements in a single vector of type T
// (float/float2/float4) instead of a scalar array, and the state is accessed
// in its natural (non-transposed) layout with stride-S_v loads.
template<typename T, short NSG>
kernel void kernel_gated_delta_net_impl(
        constant ggml_metal_kargs_gated_delta_net & args,
        device const char * q,
        device const char * k,
        device const char * v,
        device const char * g,
        device const char * b,
        device const char * s,
        device       char * dst,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3 ntg[[threads_per_threadgroup]]) {
#define S_v FC_gated_delta_net_ne20
#define G   FC_gated_delta_net_ne30

    const uint tx = tpitg.x; // simd lane
    const uint ty = tpitg.y; // state row within the threadgroup

    const uint i23 = tgpig.z; // B
    const uint i21 = tgpig.y; // H
    const uint i20 = tgpig.x*NSG + ty;

    // q/k heads may be broadcast over the v heads
    const uint i01 = i21 % args.ne01;
    const uint i11 = i21 % args.ne11;

    const float scale = 1.0f / sqrt((float)S_v);

    // strided (column) access into the S_v x S_v state
    device const float * s_ptr = (device const float *) (s) + (i23*args.ne21 + i21)*S_v*S_v + i20;

    float lsf[NSG];
    FOR_UNROLL (short j = 0; j < NSG; j++) {
        const short is = tx*NSG + j;
        lsf[j] = s_ptr[is*S_v];
    }

    // reinterpret the per-thread slice as a single vector value
    thread T * ls = (thread T *) (lsf);

    device float * dst_attn = (device float *) (dst) + (i23*args.ne22*args.ne21 + i21)*S_v + i20;

    device const float * q_ptr = (device const float *) (q + i23*args.nb03 + i01*args.nb01);
    device const float * k_ptr = (device const float *) (k + i23*args.nb13 + i11*args.nb11);
    device const float * v_ptr = (device const float *) (v + i23*args.nb23 + i21*args.nb21);
    device const float * b_ptr = (device const float *) (b) + (i23*args.ne22*args.ne21 + i21);
    device const float * g_ptr = (device const float *) (g) + (i23*args.ne22*args.ne21 + i21)*G;

    // sequential scan over the tokens
    for (short t = 0; t < args.ne22; t++) {
        device const T * qt_ptr = (device const T *) (q_ptr);
        device const T * kt_ptr = (device const T *) (k_ptr);
        device const T * gt_ptr = (device const T *) (g_ptr);

        if (G == 1) {
            *ls *= exp(g_ptr[0]);
        } else {
            // KDA
            *ls *= exp(gt_ptr[tx]);
        }

        const float s_k = simd_sum(dot(*ls, kt_ptr[tx]));

        // delta rule: d = (v - S*k) * beta
        const float d = (v_ptr[i20] - s_k)*b_ptr[0];

        *ls += kt_ptr[tx]*d;

        const float y = simd_sum(dot(*ls, qt_ptr[tx]));

        if (tx == 0) {
            *dst_attn = y*scale;
        }

        q_ptr += args.ns02;
        k_ptr += args.ns12;
        v_ptr += args.ns22;
        b_ptr += args.ne21;
        g_ptr += args.ne21*G;

        dst_attn += args.ne21*S_v;
    }

    // the final state is written after the attention output in dst
    device float * dst_state = (device float *) (dst) + args.ne23*args.ne22*args.ne21*S_v + (i23*args.ne21 + i21)*S_v*S_v + i20;
    device T * dstt_state = (device T *) (dst_state); // NOTE(review): unused variable

    FOR_UNROLL (short j = 0; j < NSG; j++) {
        const short is = tx*NSG + j;
        dst_state[is*S_v] = lsf[j];
    }

#undef S_v
#undef G
}

typedef decltype(kernel_gated_delta_net_impl<float4, 4>) kernel_gated_delta_net_t;

template [[host_name("kernel_gated_delta_net_f32_1")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<float,  1>;
template [[host_name("kernel_gated_delta_net_f32_2")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<float2, 2>;
template [[host_name("kernel_gated_delta_net_f32_4")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<float4, 4>;
#endif
constant short FC_solve_tri_nsg [[function_constant(FC_SOLVE_TRI + 0)]];
constant short FC_solve_tri_n [[function_constant(FC_SOLVE_TRI + 1)]];
constant short FC_solve_tri_k [[function_constant(FC_SOLVE_TRI + 2)]];
@@ -2782,7 +3007,7 @@ kernel void kernel_l2_norm_impl(
sumf = shmem_f32[tiisg];
sumf = simd_sum(sumf);
const float scale = 1.0f/sqrt(max(sumf, args.eps));
const float scale = 1.0f/max(sqrt(sumf), args.eps);
for (int i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) {
y[i00] = x[i00] * scale;
@@ -3481,6 +3706,13 @@ template [[host_name("kernel_mul_mv_ext_f16_f32_r1_3")]] kernel mul_mv_ext_q4
template [[host_name("kernel_mul_mv_ext_f16_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, half4, 4, dequantize_f16_t4>;
template [[host_name("kernel_mul_mv_ext_f16_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, half4, 4, dequantize_f16_t4>;
#if defined(GGML_METAL_HAS_BF16)
template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, bfloat4, 4, dequantize_bf16_t4>;
template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, bfloat4, 4, dequantize_bf16_t4>;
template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, bfloat4, 4, dequantize_bf16_t4>;
template [[host_name("kernel_mul_mv_ext_bf16_f32_r1_5")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<5, bfloat4, 4, dequantize_bf16_t4>;
#endif
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_2")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<2, block_q4_0, 32, dequantize_q4_0_t4>;
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_3")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<3, block_q4_0, 32, dequantize_q4_0_t4>;
template [[host_name("kernel_mul_mv_ext_q4_0_f32_r1_4")]] kernel mul_mv_ext_q4_f32_t kernel_mul_mv_ext_q4_f32_disp<4, block_q4_0, 32, dequantize_q4_0_t4>;
@@ -3531,6 +3763,16 @@ template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_3")]] kernel mul_mv_ext_q4x4
template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q6_K, 256, dequantize_q6_K>;
template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q6_K, 256, dequantize_q6_K>;
template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q2_K, 256, dequantize_q2_K>;
template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q2_K, 256, dequantize_q2_K>;
template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q2_K, 256, dequantize_q2_K>;
template [[host_name("kernel_mul_mv_ext_q2_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q2_K, 256, dequantize_q2_K>;
template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q3_K, 256, dequantize_q3_K>;
template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q3_K, 256, dequantize_q3_K>;
template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q3_K, 256, dequantize_q3_K>;
template [[host_name("kernel_mul_mv_ext_q3_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q3_K, 256, dequantize_q3_K>;
template<typename T0, typename T1, short NR0, typename args_t>
void kernel_mul_mv_t_t_impl(
args_t args,
@@ -4530,7 +4772,9 @@ kernel void kernel_conv_transpose_2d<half>(
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]);
kernel void kernel_upscale_f32(
constant bool FC_upscale_aa [[function_constant(FC_UPSCALE + 0)]];
kernel void kernel_upscale_nearest_f32(
constant ggml_metal_kargs_upscale & args,
device const char * src0,
device char * dst,
@@ -4556,6 +4800,156 @@ kernel void kernel_upscale_f32(
}
}
// triangle (tent) filter weight: 1 - |x| inside [-1, 1], zero outside
static inline float bilinear_tri(float x) {
    const float w = 1.0f - fabs(x);
    return MAX(0.0f, w);
}
// Bilinear upscale/downscale, one threadgroup per output row (i1), threads
// striding over the output columns (i0). args.poffs is the sample offset:
// 0.5 for half-pixel centers, 0.0 when GGML_SCALE_FLAG_ALIGN_CORNERS is set.
// FC_upscale_aa selects the anti-aliased path (triangle filter whose support
// widens when downscaling).
kernel void kernel_upscale_bilinear_f32(
        constant ggml_metal_kargs_upscale & args,
        device const char * src0,
        device       char * dst,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3 ntg[[threads_per_threadgroup]]) {
    const int64_t i3 = tgpig.z;
    const int64_t i2 = tgpig.y;
    const int64_t i1 = tgpig.x;

    // map the outer output dims to source dims (truncating divide by the scale)
    const int64_t i03 = i3 / args.sf3;
    const int64_t i02 = i2 / args.sf2;

    // vertical sample position and the two contributing source rows (clamped)
    const float f01 = ((float)i1 + args.poffs) / args.sf1 - args.poffs;
    const int64_t i01  = MAX(0, MIN(args.ne01 - 1, (int64_t)floor(f01)));
    const int64_t i01p = MAX(0, MIN(args.ne01 - 1, i01 + 1));
    const float fd1 = MAX(0.0f, MIN(1.0f, f01 - (float)i01));

    src0 += i03*args.nb03 + i02*args.nb02;

    device float * dst_ptr = (device float *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1);

    if (FC_upscale_aa) {
        // anti-aliased path: filter support covers all source pixels that map
        // into this output pixel when downscaling (sf < 1)
        const float support0  = MAX(1.0f, 1.0f / args.sf0);
        const float invscale0 = 1.0f / support0;
        const float support1  = MAX(1.0f, 1.0f / args.sf1);
        const float invscale1 = 1.0f / support1;

        for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
            const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs;

            // filter window in source coordinates, clamped to the image bounds
            int64_t x_min = MAX((int64_t)0, (int64_t)floor(f00 - support0 + args.poffs));
            int64_t x_max = MIN(args.ne00,  (int64_t)ceil (f00 + support0 + args.poffs));
            int64_t y_min = MAX((int64_t)0, (int64_t)floor(f01 - support1 + args.poffs));
            int64_t y_max = MIN(args.ne01,  (int64_t)ceil (f01 + support1 + args.poffs));

            float sum  = 0.0f;
            float wsum = 0.0f;

            for (int64_t sy = y_min; sy < y_max; ++sy) {
                // triangle filter weights in y and x
                const float wy = MAX(0.0f, 1.0f - fabs((float)sy - f01) * invscale1);
                for (int64_t sx = x_min; sx < x_max; ++sx) {
                    const float wx = MAX(0.0f, 1.0f - fabs((float)sx - f00) * invscale0);
                    const float w = wx * wy;
                    // NOTE(review): duplicate 'const' qualifier below is redundant
                    const device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00);
                    sum  += (*src_ptr) * w;
                    wsum += w;
                }
            }

            // normalize by the weight sum (guards an empty/clipped window)
            const float v = (wsum > 0.0f) ? (sum / wsum) : 0.0f;
            dst_ptr[i0] = v;
        }
    } else {
        // plain bilinear: blend the four neighboring source pixels
        for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
            const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs;
            const int64_t i00  = MAX(0, MIN(args.ne00 - 1, (int64_t)floor(f00)));
            const int64_t i00p = MAX(0, MIN(args.ne00 - 1, i00 + 1));
            const float fd0 = MAX(0.0f, MIN(1.0f, f00 - (float)i00));

            device const float * src00 = (device const float *)(src0 + i01 *args.nb01 + i00 *args.nb00);
            device const float * src10 = (device const float *)(src0 + i01 *args.nb01 + i00p*args.nb00);
            device const float * src01 = (device const float *)(src0 + i01p*args.nb01 + i00 *args.nb00);
            device const float * src11 = (device const float *)(src0 + i01p*args.nb01 + i00p*args.nb00);

            const float v =
                (*src00) * (1.0f - fd0) * (1.0f - fd1) +
                (*src10) * fd0          * (1.0f - fd1) +
                (*src01) * (1.0f - fd0) * fd1          +
                (*src11) * fd0          * fd1;

            dst_ptr[i0] = v;
        }
    }
}
// Cubic convolution weight for near taps (|x| <= 1), with a = -0.75:
// w(x) = (a + 2)*x^3 - (a + 3)*x^2 + 1, evaluated in Horner form.
static inline float bicubic_weight1(float x) {
    const float a = -0.75f;
    const float c3 = a + 2; // x^3 coefficient
    const float c2 = a + 3; // x^2 coefficient
    return (c3 * x - c2) * x * x + 1;
}
// Cubic convolution weight for far taps (1 < |x| <= 2), with a = -0.75:
// w(x) = a*x^3 - 5a*x^2 + 8a*x - 4a, evaluated in Horner form.
// Zero at both x = 1 and x = 2, so the kernel joins continuously.
static inline float bicubic_weight2(float x) {
    const float a = -0.75f;
    const float t = (a * x - 5 * a) * x;
    return (t + 8 * a) * x - 4 * a;
}
// Bicubic rescale of an F32 tensor along dims 0 and 1 (dims 2/3 index-mapped by
// sf2/sf3). Each output sample blends a 4x4 neighborhood of source texels with
// separable cubic convolution weights (bicubic_weight1 for the two near taps,
// bicubic_weight2 for the two far taps). Edge taps are clamped to the tensor.
// Dispatch: one threadgroup per output row; threads stride across the row.
kernel void kernel_upscale_bicubic_f32(
    constant ggml_metal_kargs_upscale & args,
    device const char * src0,
    device char * dst,
    uint3 tgpig[[threadgroup_position_in_grid]],
    uint3 tpitg[[thread_position_in_threadgroup]],
    uint3 ntg[[threads_per_threadgroup]]) {
    const int64_t i3 = tgpig.z;
    const int64_t i2 = tgpig.y;
    const int64_t i1 = tgpig.x;

    // dims 2/3: direct index mapping by the integer scale factor
    const int64_t i03 = i3 / args.sf3;
    const int64_t i02 = i2 / args.sf2;

    // source row coordinate and its fractional part along dim 1;
    // note: i01 is NOT clamped here — the taps are clamped individually below
    const float f01 = ((float)i1 + args.poffs) / args.sf1 - args.poffs;
    const int64_t i01 = (int64_t)floor(f01);
    const float fd1 = f01 - (float)i01;

    // vertical weights for the four taps at offsets -1, 0, +1, +2
    const float w_y0 = bicubic_weight2(fd1 + 1.0f);
    const float w_y1 = bicubic_weight1(fd1);
    const float w_y2 = bicubic_weight1(1.0f - fd1);
    const float w_y3 = bicubic_weight2(2.0f - fd1);

    const device const char * src_slice = src0 + i03 * args.nb03 + i02 * args.nb02;
    device float * dst_ptr = (device float *)(dst + i3 * args.nb3 + i2 * args.nb2 + i1 * args.nb1);

    for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
        const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs;
        const int64_t i00 = (int64_t)floor(f00);
        const float fd0 = f00 - (float)i00;

        // horizontal weights for the four taps at offsets -1, 0, +1, +2
        const float w_x0 = bicubic_weight2(fd0 + 1.0f);
        const float w_x1 = bicubic_weight1(fd0);
        const float w_x2 = bicubic_weight1(1.0f - fd0);
        const float w_x3 = bicubic_weight2(2.0f - fd0);

        // 4x4 separable convolution; each tap index is clamped to the edges
        float sum = 0.0f;
        for (int dy = -1; dy <= 2; ++dy) {
            const int64_t iy = MAX(0, MIN(args.ne01 - 1, i01 + dy));
            const float wy = (dy == -1) ? w_y0 : (dy == 0) ? w_y1 : (dy == 1) ? w_y2 : w_y3;
            for (int dx = -1; dx <= 2; ++dx) {
                const int64_t ix = MAX(0, MIN(args.ne00 - 1, i00 + dx));
                const float wx = (dx == -1) ? w_x0 : (dx == 0) ? w_x1 : (dx == 1) ? w_x2 : w_x3;
                const device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00);
                sum += (*src_ptr) * wx * wy;
            }
        }

        dst_ptr[i0] = sum;
    }
}
kernel void kernel_pad_f32(
constant ggml_metal_kargs_pad & args,
device const char * src0,
@@ -5782,6 +6176,7 @@ template [[host_name("kernel_flash_attn_ext_f32_dk128_dv128")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_f32_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 192, 192>;
template [[host_name("kernel_flash_attn_ext_f32_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 192, 128>;
template [[host_name("kernel_flash_attn_ext_f32_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 256, 256>;
template [[host_name("kernel_flash_attn_ext_f32_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 320, 256>;
template [[host_name("kernel_flash_attn_ext_f32_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 576, 512>;
template [[host_name("kernel_flash_attn_ext_f16_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 32, 32>;
@@ -5796,6 +6191,7 @@ template [[host_name("kernel_flash_attn_ext_f16_dk128_dv128")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_f16_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 192, 192>;
template [[host_name("kernel_flash_attn_ext_f16_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 192, 128>;
template [[host_name("kernel_flash_attn_ext_f16_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 256, 256>;
template [[host_name("kernel_flash_attn_ext_f16_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 320, 256>;
template [[host_name("kernel_flash_attn_ext_f16_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 576, 512>;
#if defined(GGML_METAL_HAS_BF16)
@@ -5811,6 +6207,7 @@ template [[host_name("kernel_flash_attn_ext_bf16_dk128_dv128")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_bf16_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 192>;
template [[host_name("kernel_flash_attn_ext_bf16_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 192, 128>;
template [[host_name("kernel_flash_attn_ext_bf16_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 256, 256>;
template [[host_name("kernel_flash_attn_ext_bf16_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 320, 256>;
template [[host_name("kernel_flash_attn_ext_bf16_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 576, 512>;
#endif
@@ -5826,6 +6223,7 @@ template [[host_name("kernel_flash_attn_ext_q4_0_dk128_dv128")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q4_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 192, 192>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 192, 128>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 256, 256>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 320, 256>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 576, 512>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 32, 32>;
@@ -5840,6 +6238,7 @@ template [[host_name("kernel_flash_attn_ext_q4_1_dk128_dv128")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q4_1_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 192, 192>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 192, 128>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 256, 256>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 320, 256>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 576, 512>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 32, 32>;
@@ -5854,6 +6253,7 @@ template [[host_name("kernel_flash_attn_ext_q5_0_dk128_dv128")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q5_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 192, 192>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 192, 128>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 256, 256>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 320, 256>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 576, 512>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 32, 32>;
@@ -5868,6 +6268,7 @@ template [[host_name("kernel_flash_attn_ext_q5_1_dk128_dv128")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q5_1_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 192, 192>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 192, 128>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 256, 256>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 320, 256>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 576, 512>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 32, 32>;
@@ -5882,6 +6283,7 @@ template [[host_name("kernel_flash_attn_ext_q8_0_dk128_dv128")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q8_0_dk192_dv192")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 192, 192>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk192_dv128")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 192, 128>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk256_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 256, 256>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk320_dv256")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 320, 256>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk576_dv512")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 576, 512>;
#undef FA_TYPES
@@ -6452,6 +6854,17 @@ template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk256_dv256")]] kernel flas
template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 256, 256, 1>;
template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk256_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 256, 256, 1>;
template [[host_name("kernel_flash_attn_ext_vec_f32_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES_F32, float4, 1, dequantize_f32_t4, float4, 1, dequantize_f32_t4, 320, 256, 2>;
template [[host_name("kernel_flash_attn_ext_vec_f16_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 320, 256, 2>;
#if defined(GGML_METAL_HAS_BF16)
template [[host_name("kernel_flash_attn_ext_vec_bf16_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 320, 256, 2>;
#endif
template [[host_name("kernel_flash_attn_ext_vec_q4_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 320, 256, 2>;
template [[host_name("kernel_flash_attn_ext_vec_q4_1_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 320, 256, 2>;
template [[host_name("kernel_flash_attn_ext_vec_q5_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 320, 256, 2>;
template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 320, 256, 2>;
template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk320_dv256")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 320, 256, 2>;
template [[host_name("kernel_flash_attn_ext_vec_f32_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES_F32, float4, 1, dequantize_f32_t4, float4, 1, dequantize_f32_t4, 576, 512, 2>;
template [[host_name("kernel_flash_attn_ext_vec_f16_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 576, 512, 2>;
#if defined(GGML_METAL_HAS_BF16)
@@ -8912,6 +9325,7 @@ template [[host_name("kernel_mul_mm_id_map0_ne20_6" )]] kernel kernel_mul_mm_id_
template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>;
template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>;
template [[host_name("kernel_mul_mm_id_map0_ne20_16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<16>;
template [[host_name("kernel_mul_mm_id_map0_ne20_22")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<22>;
template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
kernel void kernel_mul_mm_id(

View File

@@ -116,6 +116,7 @@ set(GGML_OPENCL_KERNELS
neg
norm
relu
l2_norm
rms_norm
rope
scale
@@ -131,6 +132,7 @@ set(GGML_OPENCL_KERNELS
ssm_conv
sub
sum_rows
cumsum
transpose
concat
tsembd

View File

@@ -497,6 +497,7 @@ struct ggml_backend_opencl_context {
kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
cl_kernel kernel_norm, kernel_norm_mul_add;
cl_kernel kernel_rms_norm, kernel_rms_norm_mul;
cl_kernel kernel_l2_norm_f32;
cl_kernel kernel_group_norm, kernel_group_norm_mul_add;
cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
cl_kernel kernel_diag_f32;
@@ -546,6 +547,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
cl_kernel kernel_argsort_f32_i32;
cl_kernel kernel_sum_rows_f32, kernel_sum_rows_f32_4;
cl_kernel kernel_cumsum_blk, kernel_cumsum_add;
cl_kernel kernel_repeat_f32;
cl_kernel kernel_pad;
cl_kernel kernel_tanh_f32, kernel_tanh_f32_4, kernel_tanh_f32_nc;
@@ -1585,6 +1587,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// l2_norm
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "l2_norm.cl.h"
};
#else
const std::string kernel_src = read_file("l2_norm.cl");
#endif
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_l2_norm_f32 = clCreateKernel(prog, "kernel_l2_norm_f32", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
// rope
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1909,6 +1928,24 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// cumsum
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "cumsum.cl.h"
};
#else
const std::string kernel_src = read_file("cumsum.cl");
#endif
cl_program prog;
prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_cumsum_blk = clCreateKernel(prog, "kernel_cumsum_blk", &err), err));
CL_CHECK((backend_ctx->kernel_cumsum_add = clCreateKernel(prog, "kernel_cumsum_add", &err), err));
GGML_LOG_CONT(".");
CL_CHECK(clReleaseProgram(prog));
}
// sigmoid
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@@ -3689,6 +3726,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
return true;
case GGML_OP_RMS_NORM:
return op->ne[0] % 4 == 0 && ggml_is_contiguous_rows(op->src[0]);
case GGML_OP_L2_NORM:
return ggml_is_contiguous_rows(op->src[0]);
case GGML_OP_REPEAT:
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
case GGML_OP_PAD:
@@ -3783,6 +3822,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
return cols <= max_workgroup_size && op->src[0]->type == GGML_TYPE_F32;
}
case GGML_OP_SUM_ROWS:
case GGML_OP_CUMSUM:
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
case GGML_OP_MEAN:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_FLASH_ATTN_EXT:
@@ -5755,19 +5796,12 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
const int ne00 = src0->ne[0];
const cl_ulong nb01 = src0->nb[1];
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];
const int ne10 = src1->ne[0];
const cl_ulong nb10 = src1->nb[0];
const int ne11 = src1->ne[1];
const int ne12 = src1->ne[2];
const cl_ulong nb11 = src1->nb[1];
const cl_ulong nb12 = src1->nb[2];
const cl_ulong nb1 = dst->nb[1];
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];
GGML_TENSOR_LOCALS(int, ne0, src0, ne);
GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
GGML_TENSOR_LOCALS(int, ne1, src1, ne);
GGML_TENSOR_LOCALS(cl_ulong, nb1, src1, nb);
GGML_TENSOR_LOCALS(int, ne, dst, ne);
GGML_TENSOR_LOCALS(cl_ulong, nb, dst, nb);
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
@@ -5813,8 +5847,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));
size_t global_work_size[] = {(size_t)ne10*64, (size_t)ne11, (size_t)ne12};
size_t local_work_size[] = {64, 1, 1};
int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
int nth = 1;
while (nth < ne00 && 2*nth <= max_workgroup_size) {
nth *= 2;
}
size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12};
size_t local_work_size[] = {(size_t)nth, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
@@ -7554,6 +7594,64 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
// Dispatches kernel_l2_norm_f32 over src0: one workgroup of nth threads per
// src0 row (rows enumerated across dims 1..3). The kernel performs the actual
// per-row L2 normalization — the exact formula lives in l2_norm.cl, not here.
// eps is taken from dst->op_params (as stored by the ggml op).
static void ggml_cl_l2_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    // buffer offsets include any view offset into the parent tensor
    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

    // declares ne00..ne03 and nb00..nb03 from src0
    GGML_TENSOR_LOCALS(int, ne0, src0, ne);
    GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);

    // subgroup size by GPU family; other vendors are not supported here
    size_t sgs;
    if (backend_ctx->gpu_family == ADRENO) {
        sgs = 64;
    } else if (backend_ctx->gpu_family == INTEL) {
        sgs = 32;
    } else {
        GGML_ASSERT(false && "Unsupported GPU");
    }

    cl_kernel kernel = backend_ctx->kernel_l2_norm_f32;

    // grow the workgroup (power of two, starting at one subgroup) until it
    // covers the row or hits the kernel's workgroup-size limit
    int nth = sgs;
    while (nth < ne00 && nth < (int)backend_ctx->get_kernel_workgroup_size(kernel)) {
        nth *= 2;
    }

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
    // arg 12: local scratch — one float per subgroup (nth/sgs subgroups)
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));

    // one workgroup per row: global x = rows (ne01) * workgroup size (nth)
    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = {(size_t)nth, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
@@ -11871,6 +11969,118 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
// Cumulative (inclusive prefix) sum along dim 0 of a contiguous F32 tensor.
//
// Strategy: kernel_cumsum_blk scans each row in chunks of nth elements,
// writing the scanned values to dst and the per-chunk totals to a temporary
// buffer. If a row does not fit in a single workgroup (ne00 > nth), the chunk
// totals are themselves scanned in-place with a second kernel_cumsum_blk pass,
// and kernel_cumsum_add then adds the carry of the preceding chunks to each
// element. One carry level suffices because ne00 <= nth*nth is asserted.
static void ggml_cl_cumsum(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_UNUSED(src1);

    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
    GGML_ASSERT(ggml_is_contiguous(src0));

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    // declares ne00..ne03 and nb00..nb03 from src0
    GGML_TENSOR_LOCALS(int, ne0, src0, ne);
    GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);

    cl_kernel kernel = backend_ctx->kernel_cumsum_blk;

    // largest power-of-two workgroup size within the device limit; each
    // workgroup scans one nth-sized chunk of a row
    int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
    int nth = 1;
    while (nth < ne00 && 2*nth <= max_workgroup_size) {
        nth *= 2;
    }
    // a single carry-propagation level can handle at most nth chunks per row
    GGML_ASSERT(ne00 <= nth*nth);

    // shape/strides of the per-chunk totals buffer: net0 chunks per row
    const int net0 = CEIL_DIV(ne00, nth);
    const int net1 = ne01;
    const int net2 = ne02;
    const int net3 = ne03;
    const cl_ulong nbt0 = sizeof(float);
    const cl_ulong nbt1 = net0*nbt0;
    const cl_ulong nbt2 = net1*nbt1;
    const cl_ulong nbt3 = net2*nbt2;

    // NOTE(review): function-local static scratch reused (and re-grown) across
    // calls — not safe if this backend is ever driven from multiple threads;
    // confirm dispatch is single-threaded.
    static ggml_cl_buffer tmp_buffer;
    tmp_buffer.allocate(backend_ctx->context, net0*ne01*ne02*ne03*sizeof(float));

    // pass 1: per-chunk inclusive scan; scanned values go to dst, chunk totals
    // to tmp_buffer (the latter only when ne00 > nth)
    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &tmp_buffer.buffer));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01));
    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne02));
    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne03));
    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb00));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &net0));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &net1));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &net2));

    size_t global_work_size[] = { (size_t)(nth*net0*ne01), (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = { (size_t)nth, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);

    if(ne00 > nth) {
        // if a single workgroup cannot handle an entire row, each workgroup
        // computes a partial sum and stores to dst, tmp_buffer contains the sum
        // of the each workgroup; cumsum this buffer and add to the partial sums in dst

        // pass 2: scan the chunk totals in-place (src, tmp and dst all point
        // at tmp_buffer; with net0 <= nth no chunk totals are re-emitted)
        cl_ulong offsett = 0;
        kernel = backend_ctx->kernel_cumsum_blk;
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &tmp_buffer.buffer));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offsett));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &tmp_buffer.buffer));
        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &tmp_buffer.buffer));
        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offsett));
        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &net0));
        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01));
        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne02));
        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne03));
        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nbt0));
        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nbt1));
        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nbt2));
        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nbt3));
        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &net0));
        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &net1));
        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &net2));

        size_t global_work_size_1[] = { (size_t)net1*nth, (size_t)net2, (size_t)net3};
        size_t local_work_size_1[] = { (size_t)nth, 1, 1};

        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_1, local_work_size_1, dst);

        // pass 3: add the carry of preceding chunks to every element in dst.
        // kernel_cumsum_add declares the tmp strides as uint — pass properly
        // sized cl_uint values instead of reading sizeof(int) bytes out of the
        // cl_ulong variables, which only happened to work on little-endian hosts
        kernel = backend_ctx->kernel_cumsum_add;

        const cl_uint nbt0_u = (cl_uint) nbt0;
        const cl_uint nbt1_u = (cl_uint) nbt1;
        const cl_uint nbt2_u = (cl_uint) nbt2;
        const cl_uint nbt3_u = (cl_uint) nbt3;

        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &tmp_buffer.buffer));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extrad->data_device));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &offsetd));
        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00));
        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01));
        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne02));
        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne03));
        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uint), &nbt0_u));
        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uint), &nbt1_u));
        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_uint), &nbt2_u));
        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_uint), &nbt3_u));

        size_t global_work_size_2[] = { (size_t)(nth*net0*ne01), (size_t)ne02, (size_t)ne03};
        size_t local_work_size_2[] = { (size_t)nth, 1, 1};

        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_2, local_work_size_2, dst);
    }
}
static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
@@ -12184,6 +12394,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_rms_norm;
break;
case GGML_OP_L2_NORM:
if (!any_on_device) {
return false;
}
func = ggml_cl_l2_norm;
break;
case GGML_OP_GROUP_NORM:
if (!any_on_device) {
return false;
@@ -12307,6 +12523,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_sum_rows;
break;
case GGML_OP_CUMSUM:
if (!any_on_device) {
return false;
}
func = ggml_cl_cumsum;
break;
case GGML_OP_FLASH_ATTN_EXT:
if (!any_on_device) {
return false;

View File

@@ -0,0 +1,139 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
// max workgroup size is usually 1024, this covers various subgroups sizes
#define MAX_SUBGROUPS 128
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_32
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
// Inclusive prefix sum of one nth-sized chunk of a row (dim 0).
// group_id(0) enumerates (chunk ib, row i01) pairs: ib = gid/ne01, i01 = gid%ne01.
// Two-level scan within the workgroup:
//   1. each subgroup does an inclusive scan of its elements;
//   2. subgroup 0 exclusive-scans the per-subgroup totals in local memory;
//   3. every thread adds its subgroup's prefix back in.
// The scanned chunk is written to dst; when the row spans multiple chunks
// (ne00 > nth), the chunk total is also written to tmp for later carry
// propagation by kernel_cumsum_add.
// NB: dst indexing assumes dst is contiguous; tmp layout is net0 chunks per row.
kernel void kernel_cumsum_blk(
    global char * src0,
    ulong offset0,
    global char * tmp,
    global char * dst,
    ulong offsetd,
    int ne00,
    int ne01,
    int ne02,
    int ne03,
    ulong nb00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    uint net0,
    uint net1,
    uint net2
) {
    src0 = src0 + offset0;
    dst = dst + offsetd;

    const int i3 = get_group_id(2);
    const int i2 = get_group_id(1);
    const int i1 = get_group_id(0);

    const int nth = get_local_size(0);
    const int tid = get_local_id(0);

    const uint sg_size = get_sub_group_size();
    const uint sg_id = get_sub_group_id();
    const uint sg_lid = get_sub_group_local_id();

    // decompose group id 0 into chunk index (ib) and row index (i01)
    const int ib = i1 / ne01;
    const int i00 = ib * nth;
    const int i01 = i1 % ne01;
    const int i02 = i2;
    const int i03 = i3;

    global const float * src0_row = (global const float *)(src0 + i03*nb03 + i02*nb02 + i01*nb01);
    global float * tmp_row = (global float *)tmp + net0 * i01 + net0 * net1 * i02 + net0 * net1 * net2 * i03;
    global float * dst_row = (global float *)dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

    // per-subgroup totals; sized for the max workgroup/subgroup configuration
    __local float partial[MAX_SUBGROUPS];

    // out-of-range lanes contribute 0 so the scan stays correct at the tail
    float v = 0.0f;
    if (i00 + tid < ne00) {
        v = src0_row[i00 + tid];
    }

    // level 1: inclusive scan within each subgroup; last lane holds the total
    float s = sub_group_scan_inclusive_add(v);
    if (sg_lid == sg_size - 1) {
        partial[sg_id] = s;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // level 2: subgroup 0 exclusive-scans the per-subgroup totals.
    // NB: subgroup size should be larger than number of subgroups
    // assuming max workgroup size of 1024, subgroup size should be >= 32
    if (sg_id == 0) {
        float x = 0.0f;
        if (sg_lid < get_num_sub_groups()) {
            x = partial[sg_lid];
        }
        float ex = sub_group_scan_exclusive_add(x);
        if (sg_lid < get_num_sub_groups()) {
            partial[sg_lid] = ex;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // add the prefix of all preceding subgroups
    s += partial[sg_id];

    if (i00 + tid < ne00) {
        dst_row[i00 + tid] = s;
    }

    // multi-chunk rows: last thread records the chunk total for carry propagation
    if (ne00 > nth && tid == nth - 1) {
        tmp_row[ib] = s;
    }
}
// Pass 2 of the two-pass cumulative sum: for every block after the first,
// add the accumulated total of the previous block (stored by
// kernel_cumsum_blk in `tmp`) to each element, turning the per-block scans
// into a full row-wise prefix sum.
kernel void kernel_cumsum_add(
        global char * tmp,
        global char * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne03,
        uint nbt0,
        uint nbt1,
        uint nbt2,
        uint nbt3
) {
    dst = dst + offsetd;

    const int grp0 = get_group_id(0);
    const int blk  = grp0 / ne01;

    // The first block of a row has no predecessors — nothing to add.
    if (blk == 0) {
        return;
    }

    const int row = grp0 % ne01;
    const int ch  = get_group_id(1);
    const int img = get_group_id(2);

    // Element handled by this thread within the row.
    const int col = blk * (int)get_local_size(0) + (int)get_local_id(0);

    global float * blk_sums = (global float *)(tmp + nbt1 * row + nbt2 * ch + nbt3 * img);
    global float * out_row  = (global float *)dst + img*ne02*ne01*ne00 + ch*ne01*ne00 + row*ne00;

    if (col < ne00) {
        // blk_sums holds inclusive block totals, so index blk-1.
        out_row[col] += blk_sums[blk - 1];
    }
}

View File

@@ -0,0 +1,71 @@
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_32
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
// L2-normalize each row of a f32 tensor: y = x / max(||x||_2, eps).
// One workgroup processes one row; `sum` is caller-provided local scratch,
// one float per subgroup.
kernel void kernel_l2_norm_f32(
        global void * src0,
        ulong offset0,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne03,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        float eps,
        local float * sum
) {
    src0 = (global void*)((global char*)src0 + offset0);
    dst = (global float*)((global char*)dst + offsetd);

    int i03 = get_group_id(2);
    int i02 = get_group_id(1);
    int i01 = get_group_id(0);

    global float * x = (global float *) ((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01);
    // dst is assumed contiguous: indexed by logical extents, not strides.
    global float * y = (global float *) (dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);

    // Per-thread partial sum of squares over a strided slice of the row.
    float sumf = 0;

    // parallel sum
    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
        sumf += x[i00] * x[i00];
    }
    sumf = sub_group_reduce_add(sumf);
    if (get_sub_group_local_id() == 0) {
        sum[get_sub_group_id()] = sumf;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree-reduce the per-subgroup partials in local memory.
    // NOTE(review): this assumes local_size is a multiple of the subgroup size
    // and that the number of subgroups is a power of two — confirm for all
    // launch configurations. No barrier inside the loop; presumably safe only
    // when the remaining active threads fit in one subgroup — verify.
    // broadcast
    for (uint i = get_local_size(0) / get_max_sub_group_size() / 2; i > 0; i /= 2) {
        if (get_local_id(0) < i) {
            sum[get_local_id(0)] += sum[get_local_id(0) + i];
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Guard against zero rows: divide by at least eps.
    const float scale = 1.0f/max(sqrt(sum[0]), eps);

    for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
        y[i00] = x[i00] * scale;
    }
}

View File

@@ -0,0 +1,154 @@
---
# Override root .clang-format
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
Cpp11BracedListStyle: true
SpacesInContainerLiterals: false
BreakBeforeBraces: Attach
AccessModifierOffset: -4
IndentCaseBlocks: false
IndentCaseLabels: false
Language: Cpp
AlignAfterOpenBracket: Align
AlignArrayOfStructures: Left
AlignConsecutiveBitFields: AcrossComments
AlignConsecutiveMacros: AcrossComments
# AlignConsecutiveShortCaseStatements: AcrossComments
AlignEscapedNewlines: Left # LeftWithLastLine
AlignOperands: Align
AlignTrailingComments:
Kind: Always
OverEmptyLines: 1
AllowAllArgumentsOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: false
# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
AttributeMacros:
- __host__
- __device__
- __global__
- __forceinline__
- __launch_bounds__
BinPackArguments: true
BinPackParameters: false # OnePerLine
BitFieldColonSpacing: Both
# BreakAdjacentStringLiterals: true
BreakAfterAttributes: Never
BreakBeforeBinaryOperators: None
BreakBeforeInlineASMColon: OnlyMultiline
BreakBeforeTernaryOperators: false
# BreakBinaryOperations: Never
BreakConstructorInitializers: AfterColon
# BreakFunctionDefinitionParameters: false
BreakInheritanceList: AfterComma
BreakStringLiterals: true
# BreakTemplateDeclarations: Yes
ColumnLimit: 120
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
DerivePointerAlignment: false
DisableFormat: false
EmptyLineBeforeAccessModifier: Leave
EmptyLineAfterAccessModifier: Never
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '".*"'
Priority: 1
SortPriority: 0
- Regex: '^<.*\.h>'
Priority: 2
SortPriority: 0
- Regex: '^<.*'
Priority: 3
SortPriority: 0
- Regex: '.*'
Priority: 4
SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentExternBlock: NoIndent
IndentGotoLabels: false
IndentPPDirectives: AfterHash
IndentWidth: 4
IndentWrappedFunctionNames: false
InsertBraces: true # NOTE: may lead to incorrect formatting
InsertNewlineAtEOF: true
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
LambdaBodyIndentation: Signature
LineEnding: LF
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PPIndentWidth: -1
PackConstructorInitializers: CurrentLine
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Middle
QualifierAlignment: Left
#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
RawStringFormats:
- Language: Cpp
Delimiters:
- cc
- CC
- cpp
- Cpp
- CPP
- 'c++'
- 'C++'
CanonicalDelimiter: ''
ReferenceAlignment: Middle
ReflowComments: false # IndentOnly
SeparateDefinitionBlocks: Always
SortIncludes: CaseInsensitive
SortUsingDeclarations: LexicographicNumeric
SpaceAfterCStyleCast: true
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: Never
SpacesInLineCommentPrefix:
Minimum: 1
Maximum: -1
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
Standard: c++17
TabWidth: 4
UseTab: Never
WhitespaceSensitiveMacros: ['STRINGIZE']
...

View File

@@ -0,0 +1,22 @@
# Build script for the ggml OpenVINO backend.
# Requires an installed OpenVINO toolkit (OpenVINO_DIR) and OpenCL.
find_package(OpenVINO REQUIRED)
find_package(OpenCL REQUIRED)

# TBB ships inside the OpenVINO distribution; pull its CMake config from there
# instead of relying on a system-wide TBB.
include("${OpenVINO_DIR}/../3rdparty/tbb/lib/cmake/TBB/TBBConfig.cmake")

file(GLOB_RECURSE GGML_HEADERS_OPENVINO "*.h" "*.hpp")
file(GLOB_RECURSE GGML_SOURCES_OPENVINO "*.cpp")

ggml_add_backend_library(ggml-openvino
    ${GGML_SOURCES_OPENVINO}
    ${GGML_HEADERS_OPENVINO}
)

target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb OpenCL::OpenCL)

if (GGML_OPENVINO)
    # Supported architectures: arm64 and x86-64 need no arch-specific flags;
    # anything else is rejected because the OpenVINO toolkit does not ship for it.
    if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
    elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64")
    else()
        message(FATAL_ERROR "OpenVINO: OpenVINO toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}")
    endif()
endif()

View File

@@ -0,0 +1,975 @@
#include "ggml-decoder.h"

#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-openvino-extra.h"
#include "ggml-openvino.h"
#include "ggml-quants.h"

#include <ggml-impl.h>
#include <ggml.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <execution>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <openvino/core/dimension.hpp>
#include <openvino/core/except.hpp>
#include <openvino/core/node.hpp>
#include <openvino/core/partial_shape.hpp>
#include <openvino/core/type/bfloat16.hpp>
#include <openvino/core/type/element_type.hpp>
#include <openvino/core/type/float16.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/parameter.hpp>
#include <openvino/runtime/tensor.hpp>
#include <optional>
#include <ostream>
#include <set>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>
// Full decoder constructor used for real model graphs (as opposed to the
// naive/test constructor below). Captures the graph and backend flags, then
// derives all per-graph metadata: node IO lists, model inputs/outputs,
// per-node op case/type, and the extra scalar inputs (attention size etc.).
//
// is_static:   NPU-style static-shape pipeline.
// is_stateful: stateful KV-cache model layout.
// is_prefill:  prefill (chunked prompt) vs decode graph.
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
                             ModelParams & model_params,
                             ComputeParams & compute_params,
                             std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
                             bool is_static,
                             bool is_stateful,
                             bool is_prefill,
                             int prefill_chunk_size) :
    m_is_static(is_static),
    m_is_stateful(is_stateful),
    m_is_prefill(is_prefill),
    m_naive(false),
    m_prefill_chunk_size(prefill_chunk_size),
    m_cgraph(cgraph),
    m_model_weights(model_weights),
    m_model_params(model_params),
    m_compute_params(compute_params) {
    // Debug aid: dump tensor data addresses once, then clear the env var so
    // repeated constructions don't spam the log.
    if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") {
#ifdef _WIN32
        _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", "");
#else
        unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS");
#endif
        print_tensor_address_map(cgraph);
    }

    validate_cgraph();
    set_input_output();
    compute_model_inputs();
    compute_model_outputs();

    // op_case/op_type depend on the node list built by set_input_output().
    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
        m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
        m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node);
    }
    add_extra_inputs();
}
// Re-point the decoder at a new compute graph and rebuild all per-graph
// state (node info, model inputs and outputs). Weights and flags are kept.
void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) {
    m_cgraph = cgraph;

    // Drop everything derived from the previous graph.
    m_node_info_list.clear();
    m_model_inputs.clear();
    m_model_outputs.clear();

    // Recompute in dependency order: node info first, then inputs/outputs.
    set_input_output();
    compute_model_inputs();
    compute_model_outputs();
}
// Naive constructor: used for per-op translation (e.g. test-backend-ops),
// without model/compute params or static/stateful handling. Skips
// validate_cgraph() and add_extra_inputs().
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) {
    m_cgraph = cgraph;
    m_model_weights = model_weights;
    m_naive = true;
    set_input_output();
    compute_model_inputs();
    compute_model_outputs();
    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
        m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
        m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node);
    }
}
// Build m_node_info_list: one NodeInfo per graph node, recording the node,
// its output tensor/name (redirected for in-place SET_ROWS), and its named
// source tensors.
void GgmlOvDecoder::set_input_output() {
    for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
        auto node = m_cgraph->nodes[node_n];
        NodeInfo current_node_info;
        auto node_name = std::string(node->name);
        auto node_output_name = node_name;
        auto * node_output = node;
        if (node->op == GGML_OP_SET_ROWS) {
            // SET_ROWS updates the tensor in place. For later ov op that uses the
            // the view_src of SET_ROWS, we need to make sure they get the updated tensor
            // by putting the view_src name in the tensor_map in
            // <openvino>/src/frontends/ggml/src/translate_session.cpp
            node_output_name = std::string(node->view_src->name);
            node_output = node->view_src;
        }
        current_node_info.node = node;
        current_node_info.node_name = node_name;
        current_node_info.node_output = node_output;
        current_node_info.node_output_name = node_output_name;
        current_node_info.node_op_case = 0;  // filled in later by compute_op_case()
        current_node_info.data_addr = node->data;

        // Record all non-null sources; graph-input tensors are renamed to
        // their canonical OV parameter name.
        for (int i = 0; i < GGML_MAX_SRC; i++) {
            auto * src = node->src[i];
            if (src == nullptr) {
                continue;
            }
            auto src_name = std::string(src->name);
            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                src_name = get_graph_input_ov_name(src, node);
            }
            current_node_info.node_inputs[src_name] = src;
            current_node_info.node_inputs_names.push_back(src_name);
        }
        m_node_info_list.push_back(current_node_info);
    }
}
// Classify a node into an "op case" consumed by the OV ggml frontend
// translators. The case ids distinguish structurally different uses of the
// same ggml op (e.g. which dimensions a RESHAPE folds, or whether a PERMUTE
// feeds from a KV-cache view). 0 means "default/unclassified".
// NOTE(review): the meaning of each case id is defined by the translator
// code in the OpenVINO ggml frontend; the comments below are inferred from
// the shape checks only — confirm against the frontend.
int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
    int op_case = 0;
    switch (node->op) {
    case GGML_OP_RESHAPE: {
        auto * src = node->src[0];
        // Case 4: reshape of a reshape that round-trips back to the original
        // leading dims (a no-op pair).
        if (src->op == GGML_OP_RESHAPE && src->src[0]->ne[0] == node->ne[0] && src->src[0]->ne[1] == node->ne[1]) {
            op_case = 4;
        } else if (node->ne[0] * node->ne[1] == src->ne[0]) {
            // Case 1: split dim0 into (dim0, dim1).
            op_case = 1;
        } else if (src->ne[0] * src->ne[1] == node->ne[0]) {
            // Case 2: merge (dim0, dim1) into dim0; case 5 additionally
            // merges (dim2, dim3) into dim1.
            op_case = 2;
            if (src->ne[2] * src->ne[3] == node->ne[1]) {
                op_case = 5;
            }
        } else if (src->ne[0] * src->ne[1] == node->ne[1]) {
            // Case 3: merge (dim0, dim1) into dim1.
            op_case = 3;
        } else if (src->ne[1] * src->ne[2] == node->ne[1]) {
            // Case 6: merge (dim1, dim2) into dim1.
            op_case = 6;
        }
        break;
    }
    case GGML_OP_CONT: {
        // Distinguish CONT by what produced the non-contiguous layout.
        if (node->src[0]->op == GGML_OP_PERMUTE) {
            op_case = 1;
        } else if (node->src[0]->op == GGML_OP_TRANSPOSE) {
            op_case = 2;
        } else if (node->src[0]->op == GGML_OP_VIEW) {
            op_case = 3;
        }
        break;
    }
    case GGML_OP_PERMUTE: {
        if (node->src[0]->op != GGML_OP_VIEW) {
            op_case = 1;
        } else if (node->src[0]->src[0]->op == GGML_OP_NONE) {
            // kv cache tensor
            std::string src_name(node->view_src->name);
            int layer = extract_layer_from_name(src_name);
            // Case 2: regular KV layer, case 3: sliding-window (SWA) layer.
            if (!is_swa_layer(layer)) {
                op_case = 2;
            } else {
                op_case = 3;
            }
        } else {
            // rope'ed query tensor
            op_case = 4;
        }
        break;
    }
    case GGML_OP_MUL_MAT: {
        if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) {
            op_case = 2;
        } else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) {
            op_case = 3;
        }
        break;
    }
    case GGML_OP_GET_ROWS: {
        if (node->src[1]->op == GGML_OP_VIEW) {
            op_case = 2;
        }
        break;
    }
    case GGML_OP_ROPE: {
        if (node->src[0]->op == GGML_OP_VIEW) {
            op_case = 2;
        }
        break;
    }
    case GGML_OP_VIEW: {
        // Case 2: view of a view with the same element count (pure
        // re-striding); anything else with a different element count is not
        // supported here.
        if (node->src[0]->op == GGML_OP_VIEW) {
            auto * src = node->src[0];
            if (ggml_nelements(node) != ggml_nelements(src)) {
                throw std::runtime_error("Unsupported VIEW case");
            }
            op_case = 2;
        }
        {
            // Naive mode only — case 3: a slicing view where exactly one
            // dimension shrinks.
            auto * src = node->src[0];
            if ((ggml_nelements(node) != ggml_nelements(src)) && m_naive) {
                // Compare each dimension of node and src, if only one dimension differs then op_case=3
                int diff_count = 0;
                for (int i = 0; i < GGML_MAX_DIMS; i++) {
                    if (node->ne[i] != src->ne[i]) {
                        diff_count++;
                    }
                }
                if (diff_count == 1) {
                    op_case = 3;
                }
            }
        }
        break;
    }
    default:
        break;
    }
    return op_case;
}
// Extract the layer index encoded as "..._l<N>" (optionally followed by a
// space and more text) from a tensor name, e.g. "cache_k_l12 (view)" -> 12.
//
// Throws std::runtime_error if the "_l" marker is missing, and propagates
// std::invalid_argument / std::out_of_range from std::stoi if the characters
// after "_l" are not a number.
//
// Fix: the previous version only `assert`ed that "_l" was found. With NDEBUG
// the assert compiles away and `npos + 2` silently wraps to offset 1,
// producing a bogus substring — fail loudly instead.
int extract_layer_from_name(const std::string & name) {
    size_t pos1 = name.find("_l");
    if (pos1 == std::string::npos) {
        throw std::runtime_error("extract_layer_from_name: no \"_l\" marker in tensor name: " + name);
    }
    pos1 += 2;
    // The index runs until the next space (or end of string).
    size_t pos2 = name.find(' ', pos1);
    if (pos2 == std::string::npos) {
        pos2 = name.length();
    }
    std::string layer_str = name.substr(pos1, pos2 - pos1);
    int layer = std::stoi(layer_str);
    return layer;
}
// Derive model-level (heads, head size, context sizes, SWA layers, rope
// params) and per-compute (input/output lengths, active sequences, attention
// sizes) parameters by inspecting the first FLASH_ATTN_EXT node of the graph
// and the graph's final output node.
std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgraph * cgraph, bool is_static) {
    ModelParams model_params;
    ComputeParams compute_params;
    for (int i = 0; i < cgraph->n_nodes; i++) {
        auto * node = cgraph->nodes[i];
        std::string name = std::string(node->name);
        if (node->op == GGML_OP_FLASH_ATTN_EXT) {
            // src[0]=Q, src[1]=K (from KV cache), src[3]=mask.
            model_params.n_heads = node->src[0]->ne[2];
            model_params.n_heads_kv = node->src[1]->ne[2];
            model_params.head_size = node->src[0]->ne[0];
            compute_params.input_len = node->src[0]->ne[1];

            // Walk K back through (optional CPY) -> PERMUTE -> VIEW to the
            // underlying cache tensor.
            auto * cache_k_perm = node->src[1];
            if (cache_k_perm->op == GGML_OP_CPY) {
                cache_k_perm = cache_k_perm->src[0];
            }
            assert(cache_k_perm->op == GGML_OP_PERMUTE);
            auto * cache_k_view = cache_k_perm->src[0];
            assert(cache_k_view->op == GGML_OP_VIEW);
            auto * cache_k = cache_k_view->src[0];

            int layer = extract_layer_from_name(cache_k->name);

            auto * mask = node->src[3];
            std::string mask_name(mask->name);

            model_params.kv_buffer_ctx_id = ggml_backend_openvino_buffer_get_ctx_id(cache_k->buffer);
            // "swa" in the mask name marks a sliding-window-attention layer;
            // those have their own (smaller) per-seq context.
            if (mask_name.find("swa") != std::string::npos) {
                model_params.swa_layers.push_back(layer);
                model_params.ctx_per_seq_swa = cache_k->ne[1];
            } else {
                model_params.ctx_per_seq = cache_k->ne[1];
                model_params.n_seq = cache_k->ne[2];
            }

            compute_params.n_seq_active = mask->ne[3];
            // Recover which sequence slot is active from the view's byte
            // offset into the cache.
            auto seq_size = cache_k->ne[0] * cache_k->ne[1] * ggml_type_size(cache_k->type);
            size_t offset;
            memcpy(&offset, cache_k_view->op_params, sizeof(size_t));
            compute_params.seq_active_start = offset / seq_size;
            compute_params.token_len_per_seq = node->ne[2];

            if (mask_name.find("swa") != std::string::npos) {
                compute_params.attention_size_swa = mask->ne[0];
            } else {
                compute_params.attention_size = mask->ne[0];
            }
            // Static (NPU) pipelines always attend over the full context and
            // decode one token per step.
            if (is_static) {
                compute_params.attention_size = model_params.ctx_per_seq;
                compute_params.attention_size_swa = model_params.ctx_per_seq_swa;
                compute_params.token_len_per_seq = 1;
            }
            break;
        }
        if (node->op == GGML_OP_ROPE) {
            // NOTE(review): assumes ggml ROPE carries (at least) 15 int32
            // op_params — confirm against the ggml version in use.
            memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15);
        }
    }

    // The last node of the graph is the logits tensor; its ne[1] is the
    // number of output positions.
    auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1];
    compute_params.output_len = output_tensor->ne[1];
    // for NPU, output_len is always 1 except for llama-perplexity
    if (is_static && compute_params.output_len == 0) {
        compute_params.output_len = 1;
    }
    model_params.ctx = model_params.ctx_per_seq * model_params.n_seq;
    model_params.ctx_swa = model_params.ctx_per_seq_swa * model_params.n_seq;
    return {model_params, compute_params};
}
// Reject graph configurations the backend cannot execute.
// Currently: static (NPU) pipelines support only a single sequence.
void GgmlOvDecoder::validate_cgraph() const {
    const bool multi_seq = m_model_params.n_seq > 1;
    if (multi_seq && m_is_static) {
        throw std::runtime_error("n_seq > 1 is not supported on NPU. Try setting -np 1.");
    }
}
// Compute the ov::PartialShape for a graph input parameter.
//
// op:    the consuming node (used to classify what kind of input this is).
// input: the input tensor; may be nullptr in naive mode, in which case the
//        op's own shape is used.
//
// Dynamic dims (-1) are used for token/position/mask/KV axes in non-static
// mode so compiled models work across batch and context sizes; static (NPU)
// mode pins everything to concrete sizes.
//
// Fix: removed an unused local (`auto name = std::string(input->name);`) —
// it was never read and dereferenced `input` for no purpose.
ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const {
    if (m_naive) {
        return input != nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)};
    }

    ov::PartialShape input_shape;
    if (is_inp_tok(input, op) || is_inp_pos(input, op)) {
        // tokens or positions
        int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
        input_shape = ov::PartialShape{1, 1, 1, len};
    } else if (is_output_idx(input, op)) {
        // output index
        input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};
    } else if (is_inp_mask(input, op)) {
        // mask
        if (m_is_static) {
            input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx};
        } else if (m_is_stateful) {
            input_shape = ov::PartialShape{1, 1, -1, -1};
        } else {
            input_shape = ov::PartialShape{-1, 1, -1, -1};
        }
    } else if (is_kvcache(input, op)) {
        // kvcache
        input_shape = ov::PartialShape{get_shape(input)};
        if (!m_is_static) {
            // do not fix ctx size to make llama-bench work across test params
            input_shape[2] = -1;
        }
        if (is_stateful()) {
            // Convert stateless KV cache layout [1, 1, seq, n_heads_kv * head_size]
            // to stateful layout [1, seq, n_heads_kv, head_size].
            assert(input_shape.size() == 4 && input_shape[0] == 1 && input_shape[1] == 1 &&
                   input_shape[2].is_dynamic() &&
                   input_shape[3] == (m_model_params.n_heads_kv * m_model_params.head_size));
            input_shape = {input_shape[0], ov::Dimension::dynamic(), m_model_params.n_heads_kv,
                           m_model_params.head_size};
        }
    } else if (is_kv_idx(input, op)) {
        // kv update index
        int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
        input_shape = ov::PartialShape{1, 1, 1, len};
    } else {
        // Anything else keeps its static ggml shape.
        input_shape = ov::PartialShape{get_shape(input)};
    }
    return input_shape;
}
void GgmlOvDecoder::add_extra_inputs() {
    // Extra inputs:
    // 1. `attention_size`, used in FLASH_ATTN where the shape of the matmul's are 256 aligned,
    //    see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
    // 2. `n_seq_active` and `seq_active_start`, used in FLASH_ATTN_EXT to indicate the active sequences in the batch

    // Static pipelines bake the value in as a Constant (shapes are fixed);
    // dynamic pipelines expose a 1-element i64 Parameter plus a backing
    // ov::Tensor holding the current value.
    auto create_1d_input = [this](const std::string & name, int64_t value) {
        if (m_is_static) {
            auto constant =
                std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{value});
            constant->set_friendly_name(name);
            m_model_extra_inputs[name] = constant;
        } else {
            auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
            param_node->set_friendly_name(name);
            param_node->output(0).get_tensor().set_names({name});
            m_model_extra_inputs[name] = param_node;

            auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
            *tensor->data<int64_t>() = value;
            m_model_extra_input_values[name] = tensor;
        }
    };

    create_1d_input("attention_size", m_compute_params.attention_size);
    // -1 means "no SWA layers in this model" — skip the input entirely.
    if (m_compute_params.attention_size_swa != -1) {
        create_1d_input("attention_size_swa", m_compute_params.attention_size_swa);
    }
    create_1d_input("n_seq_active", m_compute_params.n_seq_active);
    create_1d_input("seq_active_start", m_compute_params.seq_active_start);
    create_1d_input("seq_active_end", m_compute_params.seq_active_start + m_compute_params.n_seq_active);
    create_1d_input("token_len_per_seq", m_compute_params.token_len_per_seq);
    // create_1d_input("token_len", m_token_len_per_seq * m_n_seq_active);
}
bool GgmlOvDecoder::node_is_used_as_src(const int node_idx) {
ggml_tensor * node = m_cgraph->nodes[node_idx];
for (int i = node_idx; i < m_cgraph->n_nodes; i++) {
ggml_tensor * other_node = m_cgraph->nodes[i];
for (int j = 0; j < GGML_MAX_SRC; j++) {
if (other_node->src[j] == node) {
return true;
}
}
}
return false;
}
// Build m_model_inputs / m_inputs: every tensor the graph consumes that is
// neither a known weight nor produced by another node becomes an
// ov::Parameter. KV-cache tensors (buffers with USAGE_ANY) are additionally
// recorded in m_model_params.kv_names.
//
// Fix: the inner source loop reused `i` as its index, shadowing the outer
// node index `i` — renamed to `s` (bugprone-shadow; behavior unchanged).
void GgmlOvDecoder::compute_model_inputs() {
    m_model_inputs.clear();
    m_inputs.clear();
    for (int i = 0; i < m_cgraph->n_nodes; i++) {
        ggml_tensor * node = m_cgraph->nodes[i];
        // A NONE node that later nodes read from is itself a graph input
        // (unless it is a known weight).
        if (node->op == GGML_OP_NONE && node_is_used_as_src(i)) {
            std::string node_name(node->name);
            if (m_model_weights.find(node_name) == m_model_weights.end()) {
                m_inputs[node_name] = node;
                auto param_node =
                    std::make_shared<ov::op::v0::Parameter>(get_ov_type(node), get_graph_input_shape(node, nullptr));
                param_node->set_friendly_name(node_name);
                param_node->output(0).get_tensor().set_names({node_name});
                m_model_inputs[node_name] = param_node;
            }
            continue;
        }
        for (int s = 0; s < GGML_MAX_SRC; s++) {
            auto * src = node->src[s];
            if (src == nullptr) {
                continue;
            }
            std::string src_name = std::string(src->name);
            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                src_name = get_graph_input_ov_name(src, node);
            }
            // Weights are materialized as Constants elsewhere, not Parameters.
            if (m_model_weights.find(src_name) != m_model_weights.end()) {
                continue;
            }
            // Tensors produced by another node are intermediate results, not
            // graph inputs.
            bool is_intermediate_node = false;
            for (const auto & node_info : m_node_info_list) {
                if (node_info.node == src) {
                    is_intermediate_node = true;
                    break;
                }
            }
            if (is_intermediate_node) {
                continue;
            }
            if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
                continue;
            }
            m_inputs[src_name] = src;

            ggml_backend_buffer * buffer = src->buffer;
            // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
            if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
                if (auto it = std::find(m_model_params.kv_names.begin(), m_model_params.kv_names.end(), src_name);
                    it == m_model_params.kv_names.end()) {
                    m_model_params.kv_names.push_back(src_name);
                }
            }

            ov::PartialShape param_shape = get_graph_input_shape(node, src);
            auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), param_shape);
            param_node->set_friendly_name(src_name);
            param_node->output(0).get_tensor().set_names({src_name});
            m_model_inputs[src_name] = param_node;
        }
    }
}
// Determine the graph's result tensors: nodes whose value is needed outside
// the graph. A node is an output if it has external uses (its ggml use count
// exceeds its in-graph consumer count) or if it is a terminal node (use
// count 0), with SET_ROWS terminals redirected to their in-place target.
void GgmlOvDecoder::compute_model_outputs() {
    m_model_outputs.clear();
    m_model_output_names.clear();
    for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
        auto * cur_node = m_cgraph->nodes[node_n];
        // if the node op is NONE means this node is not used at all, we can skip it directly without adding to model outputs.
        if (cur_node->op == GGML_OP_NONE) {
            continue;
        }
        auto cur_node_use_count = m_cgraph->use_counts[ggml_hash_find(&m_cgraph->visited_hash_set, cur_node)];
        if (cur_node_use_count == 0) {
            // The output of SET_ROWS is the view_src tensor, which is updated in place. We should use the view_src name as the output name to make sure it can be correctly matched with the later ops that use the view_src.
            if (cur_node != nullptr && cur_node->op == GGML_OP_SET_ROWS) {
                cur_node = cur_node->view_src;
            }
        } else {
            // Count in-graph consumers; if they account for all recorded
            // uses, the value is purely intermediate and not an output.
            int input_use_count = 0;
            for (int i = 0; i < m_cgraph->n_nodes; i++) {
                ggml_tensor * node = m_cgraph->nodes[i];
                for (int j = 0; j < GGML_MAX_SRC; j++) {
                    if (node->src[j] != NULL && node->src[j] == cur_node) {
                        input_use_count++;
                    }
                }
            }
            if (input_use_count == cur_node_use_count) {
                cur_node = nullptr;
            }
        }
        if (cur_node != nullptr) {
            std::string node_output_name(cur_node->name);
            m_model_outputs[node_output_name] = cur_node;
            m_model_output_names.push_back(node_output_name);
        }
    }
}
// Return the first graph node that consumes `tensor` as a source, or
// nullptr if no node uses it (or tensor itself is null).
const ggml_tensor * GgmlOvDecoder::get_tensor_used_op(const ggml_tensor * tensor) const {
    if (!tensor) {
        return nullptr;
    }
    for (int n = 0; n < m_cgraph->n_nodes; ++n) {
        const ggml_tensor * candidate = m_cgraph->nodes[n];
        for (int s = 0; s < GGML_MAX_SRC; ++s) {
            if (candidate->src[s] == tensor) {
                return candidate;
            }
        }
    }
    return nullptr;
}
// Find a source tensor by name anywhere in the graph; returns nullptr when
// no node references a tensor with that name.
//
// Fix: the inner loop used to `break` at the first null src slot, which
// skips any sources stored after a hole in the src array. The sibling scans
// in this file (set_input_output, get_tensor_used_op) iterate all
// GGML_MAX_SRC slots, so do the same here for consistency.
const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name) const {
    for (int i = 0; i < m_cgraph->n_nodes; i++) {
        const auto * node = m_cgraph->nodes[i];
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            const auto * src = node->src[j];
            if (src == nullptr) {
                continue;
            }
            if (std::string(src->name) == name) {
                return src;
            }
        }
    }
    return nullptr;
}
// Map each KV-cache tensor name to itself: for stateful models the OV
// parameter and result carrying a cache tensor share the tensor's name.
std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const {
    std::map<std::string, std::string> mapping;
    for (const auto & kv_name : m_model_params.kv_names) {
        mapping.emplace(kv_name, kv_name);
    }
    return mapping;
}
// Scan the graph and build an OV constant/subgraph node for every weight
// tensor (non-view sources living in a WEIGHTS buffer, or any quantized
// source), keyed by tensor name. Serialized with a mutex because weight
// extraction mutates tensor->extra (see create_weight_node).
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
    static std::mutex weights_mutex;
    std::lock_guard<std::mutex> lock(weights_mutex);

    std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
    auto * nodes = cgraph->nodes;
    auto n_nodes = cgraph->n_nodes;
    for (int node_i = 0; node_i < n_nodes; node_i++) {
        auto * node = nodes[node_i];
        for (int i = 0; i < GGML_MAX_SRC; i++) {
            auto * src = node->src[i];
            if (src == nullptr) {
                continue;
            }
            std::string src_name(src->name);
            // All per-layer rope frequency tensors share one canonical node.
            if (is_rope_freqs_weight(src, node)) {
                src_name = "rope_freqs.weight";
            }
            // Views alias another tensor's storage — only materialize the
            // underlying (non-view) tensor as a weight.
            if (!src->view_src) {
                ggml_backend_buffer * buffer = src->buffer;
                if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) {
                    if (model_weights.find(src_name) == model_weights.end()) {
                        auto weight_node = create_weight_node(src, naive);
                        weight_node->set_friendly_name(src_name);
                        model_weights[src_name] = weight_node;
                    }
                }
            }
        }
    }
    return model_weights;
}
// Create (or reuse) the OV node representing one weight tensor.
//
// Fast path: if the OpenVINO backend buffer already attached a pre-built
// node via tensor->extra (set in ggml_backend_openvino_buffer_set_tensor),
// return it. Otherwise extract/convert the data here, and — when the tensor
// lives in an OV buffer — cache the result back onto tensor->extra.
//
// Throws for unsupported weight types.
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, bool naive) {
    const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer);
    // Check if we have a pre-built constant from the OpenVINO backend buffer
    // This is set during ggml_backend_openvino_buffer_set_tensor
    if (tensor->extra) {
        OPENVINO_ASSERT(is_ov_buffer, "Unsupported weight tensor: " + std::string(tensor->name) +
                                          " Possibly this is a cpu backend repacked quantized weights");
        // Cast to our extra base type and check the type
        auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
        if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT) {
            // F16/F32/BF16 weight with shared-memory constant
            auto * weight_extra = static_cast<ggml_openvino_weight_extra *>(tensor->extra);
            if (weight_extra->weight_node) {
                // GGML_LOG_DEBUG("%s: using pre-built weight node for %s\n", __func__, tensor->name);
                return weight_extra->weight_node;
            }
        } else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) {
            // Quantized weight with pre-extracted data
            auto * quant_extra = static_cast<ggml_openvino_quantized_weight_extra *>(tensor->extra);
            if (quant_extra->weight_node) {
                // GGML_LOG_DEBUG("%s: using pre-extracted quantized weight node for %s\n", __func__, tensor->name);
                return quant_extra->weight_node;
            }
        }
    }

    // There are three cases where we need to create a new weight node:
    // 1. weights are in openvino_host_buffer. Weight loading to host buffer will not trigger backend_buffer_set_tensor
    // 2. weights are in cpu/cpu_mapped buffer. On token_embd.weight goes to case 1 or 2, depending on whether mmap or direct_io is used
    // 3. test-backend-ops. buffers in test-backend-ops does not set USAGE_WEIGHT so backend_buffer_set_tensor will not create weight node
    // GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
    static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
                                                     GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
                                                     GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
    if (weight_types.find(tensor->type) == weight_types.end()) {
        throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
                                 ggml_type_name(tensor->type));
    }

    OvWeight ov_weight;
    if (ggml_is_quantized(tensor->type)) {
        auto use_bias = naive;
        if (is_ov_buffer) {
            // For quantized weights, copy raw data to a temp buffer first because
            // process_weight_tensor reads from data and writes extracted results
            // (weights/scales/zp) to output_base_ptr — they would overlap if both
            // point to tensor->data.
            size_t raw_size = ggml_nbytes(tensor);
            std::vector<uint8_t> tmp(raw_size);
            memcpy(tmp.data(), tensor->data, raw_size);
            ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data, use_bias);
        } else {
            ov_weight = process_weight_tensor(tensor, tensor->data, nullptr, use_bias);
        }
    } else {
        // For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
        // process_weight_tensor will create an ov::Tensor wrapping tensor->data directly.
        ov_weight = process_weight_tensor(tensor, tensor->data, tensor->data);
    }
    ov_weight.weight_node->set_friendly_name(tensor->name);

    // Only OV-buffer tensors get the result cached on tensor->extra; the
    // extra is freed by the buffer machinery (see ggml-openvino-extra).
    if (!is_ov_buffer) {
        return ov_weight.weight_node;
    }
    ggml_openvino_extra_base * extra;
    if (ov_weight.is_quantized()) {
        extra = new ggml_openvino_quantized_weight_extra(std::move(ov_weight.weights), std::move(ov_weight.scales),
                                                         std::move(ov_weight.zp), ov_weight.weight_node);
    } else {
        extra = new ggml_openvino_weight_extra(std::move(ov_weight.weights), ov_weight.weight_node);
    }
    ggml_openvino_buffer_register_extra(tensor, extra);
    return ov_weight.weight_node;
}
// Write a human-readable dump of the compute graph (nodes, shapes, strides,
// buffer types, and each node's sources) to `filename`. Debug-only helper.
//
// Fix: the per-node source loop reused `i` as its index, shadowing the outer
// node index `i` — renamed to `s` (bugprone-shadow; output unchanged, the
// printed index is still the source slot).
void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) {
    std::ofstream file(filename);
    if (!file.is_open()) {
        std::cerr << "Failed to open file" << std::endl;
        return;
    }

    file << "=== GRAPH ===\n";

    // clang-format off
    file << "n_nodes = " << cgraph->n_nodes << "\n";
    file << "    " << std::setw(3) << "nodes"
         << std::setw(15) << "shape"
         << std::setw(20) << "op"
         << std::setw(20) << "name"
         << std::setw(3) << " "
         << std::setw(62) << "stride"
         << std::setw(20) << "buffer_type"
         << "\n";
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];

        // Get buffer type name (views report their underlying buffer)
        const char * buf_name = "none";
        ggml_backend_buffer_t buf = node->view_src ? node->view_src->buffer : node->buffer;
        if (buf) {
            buf_name = ggml_backend_buffer_name(buf);
        }

        file << " - " << std::setw(3) << i << ": [ "
             << std::setw(5) << node->ne[0] << ", "
             << std::setw(5) << node->ne[1] << ", "
             << std::setw(5) << node->ne[2] << ", "
             << std::setw(5) << node->ne[3] << "] "
             << std::left << std::setw(20) << ggml_op_name(node->op) << std::right << " "
             << std::left << std::setw(45) << node->name << std::right
             << std::setw(2) << "[ "
             << std::setw(0) << node->nb[0] << ", "
             << std::setw(5) << node->nb[1] << ", "
             << std::setw(5) << node->nb[2] << ", "
             << std::setw(5) << node->nb[3] << "] "
             << std::right << std::setw(15) << buf_name << std::right
             << "\n";

        for (int s = 0; s < GGML_MAX_SRC; s++) {
            if (auto* src = node->src[s]) {
                // Get buffer type name for source
                const char * src_buf_name = "none";
                ggml_backend_buffer_t src_buf = src->view_src ? src->view_src->buffer : src->buffer;
                if (src_buf) {
                    src_buf_name = ggml_backend_buffer_name(src_buf);
                }

                file << std::setw(10) << " [ "
                     << std::setw(5) << src->ne[0] << ", "
                     << std::setw(5) << src->ne[1] << ", "
                     << std::setw(5) << src->ne[2] << ", "
                     << std::setw(5) << src->ne[3] << "] "
                     << std::setw(12)
                     << s << ": " << std::left << std::setw(12) << ggml_op_name(src->op) << std::right;

                file << std::left << std::setw(30) << src->name << std::right
                     << std::setw(16) << "[ "
                     << std::setw(0) << src->nb[0] << ", "
                     << std::setw(5) << src->nb[1] << ", "
                     << std::setw(5) << src->nb[2] << ", "
                     << std::setw(5) << src->nb[3] << "] "
                     << std::right << std::setw(15) << src_buf_name << std::right
                     << "\n";
            }
        }
    }

    file << "n_leafs = " << cgraph->n_leafs << "\n";
    for (int i = 0; i < cgraph->n_leafs; i++) {
        ggml_tensor * node = cgraph->leafs[i];

        // Get buffer type name for leaf
        const char * leaf_buf_name = "none";
        ggml_backend_buffer_t leaf_buf = node->view_src ? node->view_src->buffer : node->buffer;
        if (leaf_buf) {
            leaf_buf_name = ggml_backend_buffer_name(leaf_buf);
        }

        file << " - " << std::setw(3) << i << ": [ "
             << std::setw(5) << node->ne[0] << ", "
             << std::setw(5) << node->ne[1] << "] "
             << std::setw(8) << ggml_op_name(node->op) << " "
             << std::setw(16) << ggml_get_name(node)
             << std::setw(20) << leaf_buf_name << "\n";
    }
    // clang-format on

    file << "========================================\n";

    file.close();
}
// Debug helper: group node names by their data address and print every
// address together with all tensors that alias it (useful for spotting
// unintended in-place reuse of buffers).
void print_tensor_address_map(const ggml_cgraph * cgraph) {
    std::map<void *, std::vector<std::string>> address_map;
    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
        auto * node = cgraph->nodes[node_n];
        if (node->data) {
            // operator[] default-constructs the vector on first access,
            // so the previous find/insert dance was redundant.
            address_map[node->data].push_back(node->name);
        }
    }
    for (const auto & [addr, names] : address_map) {
        std::cout << "Address: " << addr << std::endl;
        for (const auto & name : names) {
            std::cout << name << " ; ";
        }
        std::cout << std::endl << std::endl;
    }
}
// Convert ggml dims to an OpenVINO shape; dims are appended in reverse
// order (ne[3] .. ne[0]).
ov::Shape GgmlOvDecoder::get_shape(const ggml_tensor * tensor) {
    ov::Shape dims;
    dims.reserve(GGML_MAX_DIMS);
    for (int d = GGML_MAX_DIMS - 1; d >= 0; --d) {
        dims.push_back(static_cast<size_t>(tensor->ne[d]));
    }
    return dims;
}
// Collect the byte strides (nb) of a ggml tensor, reversed to match the
// dim order produced by get_shape (nb[3] .. nb[0]).
std::vector<size_t> GgmlOvDecoder::get_stride(const ggml_tensor * tensor) {
    std::vector<size_t> strides;
    strides.reserve(GGML_MAX_DIMS);
    for (int d = GGML_MAX_DIMS - 1; d >= 0; --d) {
        strides.push_back(static_cast<size_t>(tensor->nb[d]));
    }
    return strides;
}
// Map a ggml tensor type onto the equivalent OpenVINO element type.
// Types without a direct counterpart (e.g. quantized blocks) are reported
// as dynamic.
ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor * tensor) {
    switch (tensor->type) {
        case GGML_TYPE_F64:  return ov::element::f64;
        case GGML_TYPE_F32:  return ov::element::f32;
        case GGML_TYPE_F16:  return ov::element::f16;
        case GGML_TYPE_BF16: return ov::element::bf16;
        case GGML_TYPE_I8:   return ov::element::i8;
        case GGML_TYPE_I16:  return ov::element::i16;
        case GGML_TYPE_I32:  return ov::element::i32;
        case GGML_TYPE_I64:  return ov::element::i64;
        default:             return ov::element::dynamic;
    }
}
// Shape of the named input of node `node_idx`; throws if `name` is not an
// input of that node (std::map::at).
ov::PartialShape GgmlOvDecoder::get_input_shape(int node_idx, const std::string & name) const {
    const auto * input_tensor = m_node_info_list[node_idx].node_inputs.at(name);
    return ov::PartialShape(get_shape(input_tensor));
}
// Byte strides of the named input of node `node_idx`.
std::vector<size_t> GgmlOvDecoder::get_input_stride(int node_idx, const std::string & name) const {
    const auto * input_tensor = m_node_info_list[node_idx].node_inputs.at(name);
    return get_stride(input_tensor);
}
// OpenVINO element type of the named input of node `node_idx`.
ov::element::Type GgmlOvDecoder::get_input_type(int node_idx, const std::string & name) const {
    const auto * input_tensor = m_node_info_list[node_idx].node_inputs.at(name);
    return get_ov_type(input_tensor);
}
// Number of graph-level model inputs.
size_t GgmlOvDecoder::get_input_size() const {
    const size_t input_count = m_model_inputs.size();
    return input_count;
}
// Number of inputs of a single node.
size_t GgmlOvDecoder::get_input_size(int node_idx) const {
    const auto & info = m_node_info_list[node_idx];
    return info.node_inputs_names.size();
}
// Names of the inputs of node `node_idx` (returned by value, as a copy).
std::vector<std::string> GgmlOvDecoder::get_input_names(int node_idx) const {
    const auto & info = m_node_info_list[node_idx];
    return info.node_inputs_names;
}
// Shape of the output tensor of node `node_idx`.
ov::PartialShape GgmlOvDecoder::get_output_shape(int node_idx) const {
    const auto * output_tensor = m_node_info_list[node_idx].node_output;
    return ov::PartialShape(get_shape(output_tensor));
}
// Element type of node `node_idx`.
// NOTE(review): this reads `.node` while get_output_shape reads
// `.node_output` — confirm whether the two can ever differ (e.g. views).
ov::element::Type GgmlOvDecoder::get_output_type(const int node_idx) const {
    const auto & info = m_node_info_list[node_idx];
    return get_ov_type(info.node);
}
// Every node has exactly one output, so a single-element vector is returned.
std::vector<std::string> GgmlOvDecoder::get_output_names(int node_idx) const {
    std::vector<std::string> names;
    names.push_back(m_node_info_list[node_idx].node_output_name);
    return names;
}
// The graph-level decoder has no single op name; hand back a stable
// placeholder. It must be `static` because a reference is returned.
const std::string & GgmlOvDecoder::get_op_name() const {
    static const std::string fallback_name = "UNKNOWN_OP_NAME";
    return fallback_name;
}
// Name of node `node_idx`, as cached in the node-info list.
const std::string & GgmlOvDecoder::get_op_name(int node_idx) const {
    const auto & info = m_node_info_list[node_idx];
    return info.node_name;
}
// Raw op_params of the named input tensor of node `node_idx`.
int32_t * GgmlOvDecoder::get_input_op_params(int node_idx, const std::string & name) const {
    auto * input_tensor = m_node_info_list[node_idx].node_inputs.at(name);
    return input_tensor->op_params;
}
// Raw op_params of node `node_idx` itself.
int32_t * GgmlOvDecoder::get_output_op_params(int node_idx) const {
    const auto & info = m_node_info_list[node_idx];
    return info.node->op_params;
}
// Invoke `node_visitor` for every compute node of the cgraph.
// GGML_OP_NONE entries (weights / graph inputs) are skipped.
void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const {
    // Copy the decoder once, outside the loop: the previous code built a
    // fresh std::make_shared<GgmlOvDecoder>(*this) for every node, deep-copying
    // all node-info and weight maps per iteration.
    auto decoder = std::make_shared<GgmlOvDecoder>(*this);
    for (int node_idx = 0; node_idx < m_cgraph->n_nodes; node_idx++) {
        if (m_cgraph->nodes[node_idx]->op == GGML_OP_NONE) {
            continue;
        }
        node_visitor(decoder, node_idx);
    }
}
// Return a stable textual identifier for the node's operation.
// Unary and GLU ops are dispatched through their sub-op tables; any op
// missing from the tables yields "UNKNOWN_GGML_OP".
std::string GgmlOvDecoder::compute_op_type(const ggml_tensor * node) {
    static const std::map<ggml_op, std::string> ops = {
        {GGML_OP_NONE,           "GGML_OP_NONE"          },
        {GGML_OP_ACC,            "GGML_OP_ACC"           },
        {GGML_OP_ADD,            "GGML_OP_ADD"           },
        {GGML_OP_ADD1,           "GGML_OP_ADD1"          },
        {GGML_OP_CONT,           "GGML_OP_CONT"          },
        {GGML_OP_DIV,            "GGML_OP_DIV"           },
        {GGML_OP_DUP,            "GGML_OP_DUP"           },
        {GGML_OP_GET_ROWS,       "GGML_OP_GET_ROWS"      },
        {GGML_OP_MUL,            "GGML_OP_MUL"           },
        {GGML_OP_MUL_MAT,        "GGML_OP_MUL_MAT"       },
        {GGML_OP_PERMUTE,        "GGML_OP_PERMUTE"       },
        {GGML_OP_RESHAPE,        "GGML_OP_RESHAPE"       },
        {GGML_OP_RMS_NORM,       "GGML_OP_RMS_NORM"      },
        {GGML_OP_ROPE,           "GGML_OP_ROPE"          },
        {GGML_OP_SCALE,          "GGML_OP_SCALE"         },
        {GGML_OP_SOFT_MAX,       "GGML_OP_SOFT_MAX"      },
        {GGML_OP_SUB,            "GGML_OP_SUB"           },
        {GGML_OP_TRANSPOSE,      "GGML_OP_TRANSPOSE"     },
        {GGML_OP_VIEW,           "GGML_OP_VIEW"          },
        {GGML_OP_SET_ROWS,       "GGML_OP_SET_ROWS"      },
        {GGML_OP_CPY,            "GGML_OP_CPY"           },
        {GGML_OP_FLASH_ATTN_EXT, "GGML_OP_FLASH_ATTN_EXT"},
    };
    static const std::map<ggml_unary_op, std::string> unary_ops = {
        {GGML_UNARY_OP_ABS,         "GGML_UNARY_OP_ABS"        },
        {GGML_UNARY_OP_SGN,         "GGML_UNARY_OP_SGN"        },
        {GGML_UNARY_OP_NEG,         "GGML_UNARY_OP_NEG"        },
        {GGML_UNARY_OP_STEP,        "GGML_UNARY_OP_STEP"       },
        {GGML_UNARY_OP_TANH,        "GGML_UNARY_OP_TANH"       },
        {GGML_UNARY_OP_ELU,         "GGML_UNARY_OP_ELU"        },
        {GGML_UNARY_OP_RELU,        "GGML_UNARY_OP_RELU"       },
        {GGML_UNARY_OP_SIGMOID,     "GGML_UNARY_OP_SIGMOID"    },
        {GGML_UNARY_OP_GELU,        "GGML_UNARY_OP_GELU"       },
        {GGML_UNARY_OP_GELU_QUICK,  "GGML_UNARY_OP_GELU_QUICK" },
        {GGML_UNARY_OP_SILU,        "GGML_UNARY_OP_SILU"       },
        {GGML_UNARY_OP_HARDSWISH,   "GGML_UNARY_OP_HARDSWISH"  },
        {GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"},
        {GGML_UNARY_OP_EXP,         "GGML_UNARY_OP_EXP"        },
        {GGML_UNARY_OP_COUNT,       "GGML_UNARY_OP_COUNT"      }
    };
    static const std::map<ggml_glu_op, std::string> glu_ops = {
        {GGML_GLU_OP_SWIGLU, "GGML_GLU_OP_SWIGLU"},
        {GGML_GLU_OP_GEGLU,  "GGML_GLU_OP_GEGLU" },
        {GGML_GLU_OP_REGLU,  "GGML_GLU_OP_REGLU" }
    };
    // Previously the tables were queried with .at(), which throws
    // std::out_of_range for unmapped ops and made the "UNKNOWN_GGML_OP"
    // return after the switch unreachable. Use find() so the fallback is
    // actually taken, as the dead code clearly intended.
    auto find_or_unknown = [](const auto & table, auto key) {
        const auto it = table.find(key);
        return it == table.end() ? std::string("UNKNOWN_GGML_OP") : it->second;
    };
    switch (node->op) {
        case GGML_OP_UNARY:
            return find_or_unknown(unary_ops, ggml_get_unary_op(node));
        case GGML_OP_GLU:
            return find_or_unknown(glu_ops, ggml_get_glu_op(node));
        default:
            return find_or_unknown(ops, node->op);
    }
}
// Cached op-type string of node `node_idx` (filled by compute_op_type).
const std::string & GgmlOvDecoder::get_op_type(int node_idx) const {
    const auto & info = m_node_info_list[node_idx];
    return info.node_op_type;
}
// The graph-level decoder has no single op type; return a stable
// placeholder (static, because the result is handed out by reference).
const std::string & GgmlOvDecoder::get_op_type() const {
    static const std::string fallback_type = "UNKNOWN_GGML_OP";
    return fallback_type;
}

View File

@@ -0,0 +1,294 @@
#pragma once
#include "ggml-quants.h"
#include "ggml.h"
#include "openvino/decoder.h"
#include <cstdint>
#include <cstring>
#include <map>
#include <memory>
#include <openvino/core/partial_shape.hpp>
#include <optional>
#include <vector>
// Model-level parameters captured from the ggml cgraph; used to decide
// whether a previously compiled model can be reused.
struct ModelParams {
    int ctx = -1;
    int ctx_swa = -1;
    int ctx_per_seq = -1;
    int ctx_per_seq_swa = -1;
    int n_seq = 1;
    int n_heads = -1;
    int n_heads_kv = -1;
    int head_size = -1;
    // Zero-initialized so same_rope_params() never compares indeterminate
    // bytes (the array previously had no initializer).
    int32_t rope_params[15] = {};
    std::vector<int> swa_layers;
    std::vector<std::string> kv_names;
    size_t kv_buffer_ctx_id = 0;
    // True when both RoPE parameter blocks are bytewise identical.
    // sizeof(rope_params) keeps the length in sync with the declaration
    // (was a hard-coded sizeof(int32_t) * 15).
    bool same_rope_params(const ModelParams & other) const {
        return memcmp(rope_params, other.rope_params, sizeof(rope_params)) == 0;
    }
    // A dynamically-shaped model can be reused whenever RoPE params match.
    bool can_reuse_dynamically(const ModelParams & other) const { return same_rope_params(other); }
    // A statically-shaped model additionally requires the same context size.
    bool can_reuse_statically(const ModelParams & other) const { return same_rope_params(other) && ctx == other.ctx; }
    // True when the KV buffer comes from a different ggml context.
    bool kv_buffer_changed(const ModelParams & other) const { return kv_buffer_ctx_id != other.kv_buffer_ctx_id; }
};
// Per-inference parameters that may change between decode calls
// (unlike ModelParams, which is fixed for a compiled model).
struct ComputeParams {
    int n_seq_active = 1;        // number of sequences active in this call
    int seq_active_start = 0;    // index of the first active sequence
    int attention_size = -1;     // attention window size; -1 = unset
    int attention_size_swa = -1; // attention size for SWA layers; -1 = unset
    int input_len = -1;          // total input length; -1 = unset
    int token_len_per_seq = -1;  // tokens per sequence; -1 = unset
    int past_kv_len = -1;        // length of already-cached KV; -1 = unset
    int output_len = 1;          // number of output tokens
};
// Decoder that presents a ggml cgraph to the OpenVINO ggml frontend.
// It caches per-node metadata (NodeInfo), tracks model inputs/outputs and
// weights, and exposes accessors used during graph translation.
class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
  public:
    // Metadata cached for each cgraph node at decode time.
    struct NodeInfo {
        ggml_tensor * node;
        std::string node_name;
        std::string node_op_type;
        std::map<std::string, ggml_tensor *> node_inputs;
        std::vector<std::string> node_inputs_names;
        ggml_tensor * node_output;
        std::string node_output_name;
        int node_op_case = 0;
        // NOTE(review): not initialized in-class, unlike node_op_case —
        // confirm it is always set before use.
        void * data_addr;
    };
    // Graph decoder
    GgmlOvDecoder(ggml_cgraph * cgraph,
                  ModelParams & model_params,
                  ComputeParams & compute_params,
                  std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
                  bool is_static,
                  bool is_stateful = false,
                  bool is_prefill = false,
                  int prefill_chunk_size = 256);
    // Naive graph decoder
    GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights);
    // Attributes are not used by this decoder; always returns an empty Any.
    virtual ov::Any get_attribute(const std::string & name) const override {
        return nullptr;
        GGML_UNUSED(name);
    }
    // Per-node input accessors (keyed by input tensor name).
    virtual ov::PartialShape get_input_shape(int node_idx, const std::string & name) const override;
    virtual std::vector<size_t> get_input_stride(int node_idx, const std::string & name) const override;
    virtual ov::element::Type get_input_type(int node_idx, const std::string & name) const override;
    virtual size_t get_input_size() const override;
    virtual size_t get_input_size(int node_idx) const override;
    // Not implemented; all parameters are intentionally ignored.
    virtual void get_input_node(size_t input_port_idx,
                                std::string & producer_name,
                                std::string & producer_output_port_name,
                                size_t & producer_output_port_index) const override {
        GGML_UNUSED(input_port_idx);
        GGML_UNUSED(producer_name);
        GGML_UNUSED(producer_output_port_name);
        GGML_UNUSED(producer_output_port_index);
    }
    virtual std::vector<std::string> get_input_names(int node_idx) const override;
    // Per-node output accessors.
    virtual ov::PartialShape get_output_shape(int node_idx) const override;
    virtual ov::element::Type get_output_type(int node_idx) const override;
    virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override;
    virtual int32_t * get_output_op_params(int node_idx) const override;
    virtual std::vector<std::string> get_output_names(int node_idx) const override;
    // Graph-level overloads return placeholder strings; the int-indexed
    // overloads return the cached per-node values.
    virtual const std::string & get_op_type() const override;
    virtual const std::string & get_op_type(int node_idx) const override;
    virtual const std::string & get_op_name() const override;
    virtual const std::string & get_op_name(int node_idx) const override;
    // Calls node_visitor for every compute node (GGML_OP_NONE is skipped).
    virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const override;
    ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); }
    virtual int get_op_case(int node_idx) const override { return m_node_info_list[node_idx].node_op_case; }
    virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_inputs() const override {
        return m_model_inputs;
    }
    virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_extra_inputs() const override {
        return m_model_extra_inputs;
    }
    virtual const std::map<std::string, std::shared_ptr<ov::Tensor>> & get_model_extra_input_values() const {
        return m_model_extra_input_values;
    }
    virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_weights() const override {
        return m_model_weights;
    }
    virtual std::vector<std::string> get_model_output_names() const override {
        return m_model_output_names;
    }
    const std::map<std::string, ggml_tensor *> & get_model_outputs() const { return m_model_outputs; }
    // Context/sequence geometry, forwarded from ModelParams.
    virtual int get_ctx_size() const { return m_model_params.ctx; }
    virtual int get_ctx_swa_size() const { return m_model_params.ctx_swa; }
    virtual int get_ctx_per_seq() const { return m_model_params.ctx_per_seq; }
    virtual int get_ctx_per_seq_swa() const { return m_model_params.ctx_per_seq_swa; }
    virtual int get_n_seq() const { return m_model_params.n_seq; }
    // Returns whether `layer` is a sliding-window-attention layer.
    // NOTE(review): a boolean result is returned through an int (base-class
    // signature).
    virtual int is_swa_layer(int layer) const override {
        return std::find(m_model_params.swa_layers.begin(), m_model_params.swa_layers.end(), layer) !=
               m_model_params.swa_layers.end();
    }
    int get_past_kv_len() const { return m_compute_params.past_kv_len; }
    int get_input_len() const { return m_compute_params.input_len; }
    // const_cast: the override's return type is non-const, but the stored
    // array is const within this object.
    virtual int32_t * get_rope_params() const override { return const_cast<int32_t *>(m_model_params.rope_params); }
    virtual std::map<std::string, std::string> get_kv_param_res_names() const override;
    virtual bool is_static() const override { return m_is_static; }
    virtual bool is_stateful() const override { return m_is_stateful; }
    ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const;
    // Debug dump of the whole cgraph to `filename`.
    static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor * tensor, bool naive = false);
    static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(ggml_cgraph * cgraph,
                                                                                bool naive = false);
    const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const;
    const ggml_tensor * get_tensor_from_name(const std::string & name) const;
    void clear_model_weights() { m_model_weights.clear(); }
    static std::pair<ModelParams, ComputeParams> compute_llm_params(ggml_cgraph * cgraph, bool is_static);
    ModelParams get_model_params() const { return m_model_params; }
    ComputeParams get_compute_params() const { return m_compute_params; }
    void set_model_params(const ModelParams & model_params) { m_model_params = model_params; }
    void set_compute_params(const ComputeParams & compute_params) { m_compute_params = compute_params; }
    // Decoder mode flags (public; mutated by owning code).
    bool m_is_static = false;
    bool m_is_stateful = false;
    bool m_is_prefill = false;
    bool m_naive = false;
    int m_prefill_chunk_size = 0;
    // ggml-tensor -> OpenVINO conversions (dims/strides are reversed).
    static ov::Shape get_shape(const ggml_tensor * tensor);
    static std::vector<size_t> get_stride(const ggml_tensor * tensor);
    static ov::element::Type get_ov_type(const ggml_tensor * tensor);
    static std::string compute_op_type(const ggml_tensor * node);
    void add_extra_inputs();
    void update_io(ggml_cgraph * cgraph);
    // Predicates classifying a tensor by the op that consumes it; used by
    // get_graph_input_ov_name() below.
    inline static bool is_inp_tok(const ggml_tensor * tensor, const ggml_tensor * op) {
        return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op == GGML_OP_NONE;
    }
    inline static bool is_inp_pos(const ggml_tensor * tensor, const ggml_tensor * op) {
        return op->op == GGML_OP_ROPE && tensor == op->src[1];
    }
    inline static bool is_inp_emb(const ggml_tensor * tensor, const ggml_tensor * op) {
        return tensor->op == GGML_OP_GET_ROWS && op->op == GGML_OP_RMS_NORM;
    }
    inline static bool is_inp_mask(const ggml_tensor * tensor, const ggml_tensor * op) {
        return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]);
    }
    inline static bool is_rope_freqs_weight(const ggml_tensor * tensor, const ggml_tensor * op) {
        return op->op == GGML_OP_ROPE && tensor == op->src[2];
    }
    inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) {
        return op->op == GGML_OP_SET_ROWS && op->src[2] == tensor;
    }
    inline static bool is_kv_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
        return op->op == GGML_OP_SET_ROWS && op->src[1] == tensor;
    }
    inline static bool is_output_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
        return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op != GGML_OP_NONE;
    }
    // Canonical OpenVINO input name for a graph input tensor; tensors that
    // match none of the predicates keep their ggml name.
    static std::string get_graph_input_ov_name(const ggml_tensor * tensor, const ggml_tensor * op) {
        if (is_inp_tok(tensor, op)) {
            return "inp_tokens";
        }
        if (is_inp_pos(tensor, op)) {
            return "inp_pos";
        }
        if (is_inp_emb(tensor, op)) {
            return "embd";
        }
        if (is_output_idx(tensor, op)) {
            return "inp_out_ids";
        }
        if (is_inp_mask(tensor, op)) {
            return std::string(tensor->name).find("swa") == std::string::npos ? "self_kq_mask" : "self_kq_mask_swa";
        }
        return tensor->name;
    }
  private:
    // One-time analysis passes run during construction.
    void set_input_output();
    int compute_op_case(const ggml_tensor * node) const;
    bool node_is_used_as_src(const int node_idx);
    void compute_model_inputs();
    void compute_model_outputs();
    void validate_cgraph() const;
    ggml_cgraph * m_cgraph = nullptr;
    std::map<std::string, ggml_tensor *> m_inputs;
    std::map<std::string, std::shared_ptr<ov::Node>> m_model_inputs;
    std::map<std::string, std::shared_ptr<ov::Node>> m_model_extra_inputs;
    std::map<std::string, std::shared_ptr<ov::Tensor>> m_model_extra_input_values;
    std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
    std::map<std::string, ggml_tensor *> m_model_outputs;
    std::vector<std::string> m_model_output_names;
    std::vector<NodeInfo> m_node_info_list;
    ModelParams m_model_params;
    ComputeParams m_compute_params;
};
void print_tensor_address_map(const ggml_cgraph * cgraph);
int extract_layer_from_name(const std::string & name);

View File

@@ -0,0 +1,373 @@
#include "ggml-openvino-extra.h"
#include "ggml-impl.h"
#include "ggml.h"
#include <cstring>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
#include <optional>
// Process-wide ov::Core instance (Meyers singleton; constructed on first use).
ov::Core & ov_singleton_core() {
    static ov::Core instance;
    return instance;
}
// =====================================================
// Device Configuration Implementations
// =====================================================
// One-time initialization: select the target device from the
// GGML_OPENVINO_DEVICE env var (falling back to CPU if unavailable), set up
// NPU compile options / cache dir, and for GPU create an OpenCL queue plus
// an OpenVINO remote context that shares it.
void ggml_openvino_device_config::init() {
    if (initialized) {
        return;
    }
    device_name = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU";
    auto available_devices = ov_singleton_core().get_available_devices();
    if (std::find(available_devices.begin(), available_devices.end(), device_name) == available_devices.end()) {
        GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device_name.c_str());
        device_name = "CPU";
    }
    is_npu = (device_name == "NPU");
    auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
    if (device_name == "NPU") {
        // NPUW configuration for dynamic-quantized execution on the NPU.
        compile_config = {
            {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" },
            {"NPU_USE_NPUW", "YES" },
            {"NPUW_DEVICES", "NPU" },
            {"NPUW_FOLD", "YES" },
            {"NPUW_WEIGHTS_BANK", "shared"},
            {"NPUW_FUNCALL_FOR_ALL", "YES" },
            {"NPUW_FUNCALL_ASYNC", "YES" },
            {"NPUW_DQ", "YES" },
            {"NPUW_DQ_FULL", "NO" },
        };
        if (cache_dir) {
            compile_config["NPUW_CACHE_DIR"] = cache_dir;
        }
    } else if (cache_dir) {
        ov_singleton_core().set_property(ov::cache_dir(cache_dir));
    }
    // Initialize remote context with queue sharing for GPU
    if (device_name == "GPU") {
        // Create OpenCL context and queue
        // NOTE(review): every error return below leaves `initialized` false,
        // so init() will redo all of the above on the next call — confirm
        // that retry-on-failure is the intended behavior.
        cl_int err;
        cl_platform_id platform;
        err = clGetPlatformIDs(1, &platform, nullptr);
        if (err != CL_SUCCESS) {
            GGML_LOG_ERROR("Failed to get OpenCL platform: %d\n", err);
            return;
        }
        cl_device_id cl_device;
        err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &cl_device, nullptr);
        if (err != CL_SUCCESS) {
            GGML_LOG_ERROR("Failed to get OpenCL device: %d\n", err);
            return;
        }
        cl_context cl_ctx = clCreateContext(nullptr, 1, &cl_device, nullptr, nullptr, &err);
        if (err != CL_SUCCESS) {
            GGML_LOG_ERROR("Failed to create OpenCL context: %d\n", err);
            return;
        }
        cl_queue = clCreateCommandQueueWithProperties(cl_ctx, cl_device, nullptr, &err);
        if (err != CL_SUCCESS) {
            GGML_LOG_ERROR("Failed to create OpenCL command queue: %d\n", err);
            clReleaseContext(cl_ctx);
            return;
        }
        // Create OpenVINO remote context with queue sharing
        remote_context = ov::intel_gpu::ocl::ClContext(ov_singleton_core(), cl_queue);
        // Release the context (queue keeps a reference)
        clReleaseContext(cl_ctx);
    } else if (device_name == "NPU") {
        // remote tensor is not used for NPU yet
        // remote_context = ov_singleton_core().get_default_context(device_name);
    }
    initialized = true;
}
// Release the OpenCL queue acquired in init(); the remote context and
// compile config clean themselves up.
ggml_openvino_device_config::~ggml_openvino_device_config() {
    if (cl_queue == nullptr) {
        return;
    }
    clReleaseCommandQueue(cl_queue);
    cl_queue = nullptr;
}
// Get the global device config singleton (constructed on first use).
ggml_openvino_device_config & ggml_openvino_get_device_config() {
    static ggml_openvino_device_config singleton;
    return singleton;
}
// Initialize device config (call during backend init).
void ggml_openvino_init_device_config() {
    auto & config = ggml_openvino_get_device_config();
    config.init();
}
// Name of the selected OpenVINO device (e.g. "CPU", "GPU", "NPU").
const std::string & ggml_openvino_get_device_name() {
    const auto & config = ggml_openvino_get_device_config();
    return config.device_name;
}
// True when the selected device is the NPU.
bool ggml_openvino_is_npu() {
    const auto & config = ggml_openvino_get_device_config();
    return config.is_npu;
}
// Remote context for the current device; empty optional when none was
// created (e.g. CPU).
std::optional<ov::RemoteContext> ggml_openvino_get_remote_context() {
    const auto & config = ggml_openvino_get_device_config();
    return config.remote_context;
}
// Compile config for the current device (non-empty only for NPU).
const ov::AnyMap & ggml_openvino_get_compile_config() {
    const auto & config = ggml_openvino_get_device_config();
    return config.compile_config;
}
// OpenCL command queue created in init(); nullptr for CPU/NPU.
cl_command_queue ggml_openvino_get_cl_queue() {
    const auto & config = ggml_openvino_get_device_config();
    return config.cl_queue;
}
// Resolve the Intel USM clEnqueueMemFillINTEL extension entry point once.
// Returns nullptr if the platform or the extension is unavailable.
clEnqueueMemFillINTEL_fn ggml_openvino_get_clEnqueueMemFillINTEL() {
    // C++11 magic statics make the one-time lookup thread-safe; the previous
    // fn/loaded pair could race if two threads resolved it concurrently.
    static const clEnqueueMemFillINTEL_fn fn = []() -> clEnqueueMemFillINTEL_fn {
        cl_platform_id platform;
        if (clGetPlatformIDs(1, &platform, nullptr) != CL_SUCCESS) {
            return nullptr;
        }
        return (clEnqueueMemFillINTEL_fn) clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemFillINTEL");
    }();
    return fn;
}
// Resolve the Intel USM clEnqueueMemcpyINTEL extension entry point once.
// Returns nullptr if the platform or the extension is unavailable.
clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
    // C++11 magic statics make the one-time lookup thread-safe; the previous
    // fn/loaded pair could race if two threads resolved it concurrently.
    static const clEnqueueMemcpyINTEL_fn fn = []() -> clEnqueueMemcpyINTEL_fn {
        cl_platform_id platform;
        if (clGetPlatformIDs(1, &platform, nullptr) != CL_SUCCESS) {
            return nullptr;
        }
        return (clEnqueueMemcpyINTEL_fn) clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemcpyINTEL");
    }();
    return fn;
}
// Decide which requantization target format (if any) applies to `tensor`.
// Returns nullopt when the tensor should be used as-is.
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant) {
    if (no_requant) {
        return std::nullopt;
    }
    // NOTE(review): strncmp with the literal's length performs a prefix
    // match, not an exact name comparison — confirm that is intended.
    if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
        if (ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) {
            return ExtraQuantType::F16;
        }
        return ExtraQuantType::Q8_0_C;
    }
    if (strncmp(tensor->name, "output.weight", 13) == 0) {
        return ExtraQuantType::Q8_0_C;
    }
    if (ggml_openvino_is_npu()) {
        return ExtraQuantType::Q4_0_128;
    }
    if (tensor->type == GGML_TYPE_Q6_K || tensor->type == GGML_TYPE_Q5_K) {
        return ExtraQuantType::Q8_0_C;
    }
    return std::nullopt;
}
// =====================================================
// Extracted Layout Calculation
// =====================================================
// Compute the buffer layout ([weights | scales | zp], 64-byte aligned
// sections) needed to store the extracted form of a quantized 2D weight
// tensor, optionally after requantization. Returns a zeroed layout
// (total_size == 0) for non-quantized, non-2D, or unsupported types.
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor, bool use_bias) {
    ggml_openvino_extracted_layout layout = {};
    layout.is_symmetric = false;
    if (!ggml_is_quantized(tensor->type)) {
        return layout;
    }
    // Only handle 2D weight tensors
    if (tensor->ne[2] != 1 || tensor->ne[3] != 1) {
        return layout;
    }
    int64_t n_elements = ggml_nelements(tensor);
    const size_t alignment = 64;  // Good for SIMD
    // Check if requantization is needed (NPU-specific)
    // NOTE(review): `use_bias` is forwarded as the `no_requant` parameter of
    // ggml_openvino_get_requant_type — confirm this naming/semantics is intended.
    auto requant_type = ggml_openvino_get_requant_type(tensor, use_bias);
    if (requant_type.has_value()) {
        layout.is_requant = true;
        layout.requant_type = requant_type;
        // Special case: requant to F16 - just store F16 weights, no scales/zp
        if (requant_type.value() == ExtraQuantType::F16) {
            layout.weights_size = n_elements * sizeof(uint16_t);  // F16 = 2 bytes
            layout.total_size = layout.weights_size;
            layout.weights_offset = 0;
            // No scales/zp for F16
            return layout;
        }
        // Requant to different quantized format (e.g. Q4_0_128)
        switch (requant_type.value()) {
            case ExtraQuantType::Q4_0_128:
                layout.is_u4 = true;
                layout.weights_per_block = 128;
                layout.is_symmetric = true;
                break;
            case ExtraQuantType::Q4_0_C:
                // Channel-wise: one block spans a whole row.
                layout.is_u4 = true;
                layout.weights_per_block = tensor->ne[0];
                layout.is_symmetric = true;
                break;
            case ExtraQuantType::Q8_0_32:
                layout.is_u4 = false;
                layout.weights_per_block = 32;
                layout.is_symmetric = true;
                break;
            case ExtraQuantType::Q8_0_C:
                layout.is_u4 = false;
                layout.weights_per_block = tensor->ne[0];
                layout.is_symmetric = true;
                break;
            case ExtraQuantType::Q8_1_C:
                // Asymmetric: per-block zero points are kept.
                layout.is_u4 = false;
                layout.weights_per_block = tensor->ne[0];
                break;
            default:
                layout.weights_per_block = -1;
                GGML_ABORT("Code of re-quantizing to channel-wise is not updated");
                break;
        }
        // is_requant was set to true above, so this branch is always taken.
        if (layout.is_requant) {
            // Calculate sizes for requantized format
            layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
            int64_t n_blocks = n_elements / layout.weights_per_block;
            layout.scales_size = n_blocks * sizeof(uint16_t);
            // For symmetric quantization, we only need one zp value (not one per block)
            // Zero points are stored in U4 or U8 format matching the weight type
            size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
            layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
            layout.weights_offset = 0;
            layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
            layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
            layout.total_size = layout.zp_offset + layout.zp_size;
            // Never shrink below the original tensor's byte size.
            layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
            return layout;
        }
    }
    // Normal extraction (no requant) - determine format based on tensor type
    layout.is_u4 = false;
    layout.weights_per_block = 32;
    layout.is_symmetric = false;
    switch (tensor->type) {
        case GGML_TYPE_Q4_0:
            layout.is_u4 = true;
            layout.is_symmetric = true;
            break;
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q4_K:
            layout.is_u4 = true;
            break;
        case GGML_TYPE_Q8_0:
            layout.is_symmetric = true;
            break;
        case GGML_TYPE_Q6_K:
            layout.weights_per_block = 16;
            layout.is_symmetric = true;
            break;
        case GGML_TYPE_Q5_K:
            break;
        default:
            // Unsupported quantization type
            return layout;
    }
    // Calculate sizes
    // Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes
    layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
    // Scales: F16 per block
    int64_t n_blocks = n_elements / layout.weights_per_block;
    layout.scales_size = n_blocks * sizeof(uint16_t);  // F16 = 2 bytes
    // Zero points: U4 or U8 matching weight type
    // For symmetric quantization, we only need one zp value (not one per block)
    size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
    layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
    // Layout in buffer: [weights | scales | zp] with alignment
    layout.weights_offset = 0;
    layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
    layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
    layout.total_size = layout.zp_offset + layout.zp_size;
    layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
    return layout;
}
// Wrap a ggml tensor's data in an ov::Tensor (plain or GPU USM remote) and
// return a heap-allocated extra holding it; nullptr for unsupported element
// types. Caller owns the returned object.
ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote) {
    // Dims are appended in reverse order (ne[3] .. ne[0]).
    ov::Shape shape;
    for (int dim = GGML_MAX_DIMS - 1; dim >= 0; --dim) {
        shape.push_back(static_cast<size_t>(tensor->ne[dim]));
    }
    ov::element::Type element_type;
    switch (tensor->type) {
        case GGML_TYPE_F32:  element_type = ov::element::f32; break;
        case GGML_TYPE_F16:  element_type = ov::element::f16; break;
        case GGML_TYPE_BF16: element_type = ov::element::bf16; break;
        case GGML_TYPE_I32:  element_type = ov::element::i32; break;
        case GGML_TYPE_I64:  element_type = ov::element::i64; break;
        default:
            // Unsupported tensor type for ov::Tensor — no extra is created.
            return nullptr;
    }
    const auto & device_name = ggml_openvino_get_device_name();
    auto remote_context = ggml_openvino_get_remote_context();
    std::shared_ptr<ov::Tensor> ov_tensor;
    if (is_remote) {
        // Remote (USM) tensors are only supported on the GPU device.
        GGML_ASSERT(device_name == "GPU");
        auto gpu_context = remote_context->as<ov::intel_gpu::ocl::ClContext>();
        auto usm_tensor = gpu_context.create_tensor(element_type, shape, tensor->data);
        ov_tensor = std::make_shared<ov::intel_gpu::ocl::USMTensor>(std::move(usm_tensor));
    } else {
        // Zero-copy host tensor over the existing ggml data pointer.
        ov_tensor = std::make_shared<ov::Tensor>(element_type, shape, tensor->data);
    }
    return new ggml_openvino_tensor_extra(ov_tensor);
}

View File

@@ -0,0 +1,182 @@
#pragma once
#include "ggml.h"
#include "openvino/runtime/core.hpp"
#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>
#include <cstdlib>
#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/runtime/remote_context.hpp>
#include <openvino/runtime/tensor.hpp>
#include <optional>
#include <string>
// ExtraQuantType enum - defines requantization target formats
// F16: plain half precision (no scales/zero-points); *_C: channel-wise
// (one block per row, i.e. ne[0] weights per block); Q4_0_128 / Q8_0_32:
// fixed block sizes of 128 / 32 weights.
enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 };
ov::Core & ov_singleton_core();
// Get the remote context for the current device (returns empty optional for CPU)
std::optional<ov::RemoteContext> ggml_openvino_get_remote_context();
// Get the compile config for the current device
const ov::AnyMap & ggml_openvino_get_compile_config();
// Get the OpenCL command queue for GPU operations (returns nullptr for CPU/NPU)
cl_command_queue ggml_openvino_get_cl_queue();
// Intel USM extension function type
typedef cl_int(CL_API_CALL * clEnqueueMemFillINTEL_fn)(cl_command_queue queue,
void * dst_ptr,
const void * pattern,
size_t pattern_size,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
typedef cl_int(CL_API_CALL * clEnqueueMemcpyINTEL_fn)(cl_command_queue queue,
cl_bool blocking,
void * dst_ptr,
const void * src_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
// Get the clEnqueueMemFillINTEL function pointer (returns nullptr if not available)
clEnqueueMemFillINTEL_fn ggml_openvino_get_clEnqueueMemFillINTEL();
// Get the clEnqueueMemcpyINTEL function pointer (returns nullptr if not available)
clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL();
// =====================================================
// Global Device Configuration (singleton)
// =====================================================
// Initialized once during backend init from GGML_OPENVINO_DEVICE env var
struct ggml_openvino_device_config {
    std::string device_name = "CPU";             // selected device ("CPU", "GPU", "NPU", ...)
    bool is_npu = false;                         // convenience flag: device_name == "NPU"
    bool initialized = false;                    // set by init() on successful completion
    std::optional<ov::RemoteContext> remote_context;  // GPU remote context; empty otherwise
    ov::AnyMap compile_config;                   // device compile options (NPU only)
    cl_command_queue cl_queue = nullptr;         // OpenCL queue shared with OpenVINO (GPU only)
    // Idempotent one-time setup; safe to call repeatedly.
    void init();
    // Releases cl_queue if it was created.
    ~ggml_openvino_device_config();
};
// Get the global device config singleton
ggml_openvino_device_config & ggml_openvino_get_device_config();
// Initialize device config (call during backend init)
void ggml_openvino_init_device_config();
// Get the device name
const std::string & ggml_openvino_get_device_name();
// Check if running on NPU
bool ggml_openvino_is_npu();
// Get requantization type for a tensor type (returns nullopt if no requant needed)
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant = false);
// =====================================================
// OpenVINO Tensor Extra Types
// =====================================================
// These types are stored in tensor->extra by the OpenVINO backend buffer.
// They allow:
// 1. Pre-built ov::Constant nodes for weights (avoiding memcpy during graph construction)
// 2. ov::Tensor wrappers for KV cache / compute tensors (for direct use with infer_request)
// Base class for OpenVINO tensor extra data
// Stored in tensor->extra; the `type` tag identifies the concrete subclass
// so callers can downcast safely.
struct ggml_openvino_extra_base {
    enum class Type { WEIGHT, QUANTIZED_WEIGHT, TENSOR };
    Type type;
    // Virtual destructor: extras are deleted through base-class pointers.
    virtual ~ggml_openvino_extra_base() = default;
  protected:
    // Protected: only concrete subclasses may construct the base.
    explicit ggml_openvino_extra_base(Type t) : type(t) {}
};
// Extra data for F16/F32/BF16 weight tensors - stores the pre-built weight node
struct ggml_openvino_weight_extra : public ggml_openvino_extra_base {
    ov::Tensor weights;                     // The underlying weight data tensor
    std::shared_ptr<ov::Node> weight_node;  // Pre-built OpenVINO weight node
    // Takes ownership of both the data tensor and the node.
    ggml_openvino_weight_extra(ov::Tensor w, std::shared_ptr<ov::Node> n) :
        ggml_openvino_extra_base(Type::WEIGHT),
        weights(std::move(w)),
        weight_node(std::move(n)) {}
};
// Extra data for quantized weight tensors - stores extracted weights/scales/zp and weight node
struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {
    ov::Tensor weights;                     // U4 or U8 extracted weights
    ov::Tensor scales;                      // F16 scales
    ov::Tensor zp;                          // U4 or U8 zero points (same type as weights)
    std::shared_ptr<ov::Node> weight_node;  // Pre-built OpenVINO weight subgraph
    // Takes ownership of all component tensors and the subgraph node.
    ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> n) :
        ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),
        weights(std::move(w)),
        scales(std::move(s)),
        zp(std::move(z)),
        weight_node(std::move(n)) {}
};
// Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request
struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base {
    std::shared_ptr<ov::Tensor> tensor;  // For direct use with infer_request
    // Wraps a shared ov::Tensor (moved in), tagging the base with Type::TENSOR.
    explicit ggml_openvino_tensor_extra(std::shared_ptr<ov::Tensor> t)
        : ggml_openvino_extra_base(Type::TENSOR), tensor(std::move(t)) {}
};
// =====================================================
// Extracted Size Calculation for Quantized Tensors
// =====================================================
// For quantized tensors, we need extra space to store extracted weights, scales, and zero points.
// Returns the total size needed in the buffer for extracted data.
// Describes where the extracted weights/scales/zero-points of a quantized
// tensor live inside the backend buffer, and whether requantization applies.
// Filled in by ggml_openvino_get_extracted_layout(); total_size == 0 marks an
// unsupported type.
struct ggml_openvino_extracted_layout {
    size_t total_size = 0;      // Total bytes needed
    size_t weights_offset = 0;  // Offset to weights in buffer
    size_t weights_size = 0;    // Size of weights in bytes
    size_t scales_offset = 0;   // Offset to scales in buffer
    size_t scales_size = 0;     // Size of scales in bytes
    size_t zp_offset = 0;       // Offset to zero points in buffer
    size_t zp_size = 0;         // Size of zero points in bytes (U4 or U8)
    // Zero-initialized so a default-constructed layout is never read as
    // garbage (these were previously left uninitialized).
    bool is_u4 = false;             // true for U4 weights, false for U8
    int64_t weights_per_block = 0;  // weights per scale/zp block
    bool is_symmetric = false;      // true for symmetric quantization
    // Requantization info
    bool is_requant = false;  // true if this tensor needs requantization
    std::optional<ExtraQuantType> requant_type;  // target requant type if is_requant
};
// Calculate the buffer layout for extracted quantized data
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor, bool use_bias = false);
ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote);
// Register an extra with the tensor's OpenVINO buffer context for proper lifetime management.
// This sets tensor->extra and tracks the extra in the buffer context for cleanup.
void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra);
// =====================================================
// OpenVINO Backend Context and Interface
// =====================================================
struct ggml_backend_openvino_context {
    int device = 0;                 // device index this backend context is bound to
    std::string name = "OpenVINO";  // backend name
    std::string description = "OpenVINO Backend Context";
    // Opaque, type-erased runtime state shared by the backend implementation.
    std::shared_ptr<void> runtime_context = nullptr;
    ggml_backend_openvino_context() = default;
};

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,884 @@
#include "ggml-quants.h"
#include "ggml-common.h"
#include "ggml-impl.h"
#include "ggml.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <memory>
#include <openvino/core/except.hpp>
#include <openvino/core/node.hpp>
#include <openvino/core/node_output.hpp>
#include <openvino/core/parallel.hpp>
#include <openvino/core/shape.hpp>
#include <openvino/core/type/element_type.hpp>
#include <openvino/core/type/element_type_traits.hpp>
#include <openvino/core/type/float16.hpp>
#include <openvino/op/add.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/op/util/attr_types.hpp>
#include <openvino/runtime/tensor.hpp>
#include <string>
#include <vector>
// Unpacks one 32-weight Q4 block: 16 source bytes carry 32 4-bit values.
// The low nibbles of all 16 input bytes are repacked into dst[0..7] and the
// high nibbles into dst[8..15], two values per output byte.
void unpack_32_4(const uint8_t * data, uint8_t * dst) {
    for (int pair = 0; pair < 8; ++pair) {
        const uint8_t even = data[2 * pair];
        const uint8_t odd  = data[2 * pair + 1];
        // Low nibbles of the input pair -> one packed byte in the first half.
        dst[pair] = (uint8_t) ((even & 0x0F) | ((odd & 0x0F) << 4));
        // High nibbles of the input pair -> one packed byte in the second half.
        dst[8 + pair] = (uint8_t) ((even >> 4) | ((odd >> 4) << 4));
    }
}
// Extracts (weight, scales, zp) from Q4_0 tensors.
// Data layout is: |16 bit scale|32 x 4bit weights|.
// Extracts (weight, scales, zp) from Q4_0 tensors.
// Data layout is: |16 bit scale|32 x 4bit weights|.
// For Q4_0 the zero point is the constant 8 (values are stored biased by +8).
void extract_q4_0_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
                       ov::Tensor & zp_arr) {
    const uint64_t bytes_per_block = 18;  // 2 bytes scale, 32x0.5 byte weights
    auto * data = static_cast<uint8_t *>(tensor->data);
    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
    auto * zp = static_cast<uint8_t *>(zp_arr.data());
    bool is_scalar_zp = (zp_arr.get_size() == 1);  // Symmetric quantization
    // For Q4_0, zero point is always 8
    if (is_scalar_zp) {
        zp[0] = 8 | (8 << 4);  // Pack two 4-bit values
    }
    ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
        scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
        // For asymmetric quantization, fill per-block zero points. Two 4-bit
        // zero points share one byte; let only the even iteration write the
        // whole byte so two parallel iterations never read-modify-write the
        // same byte (the previous "even assigns low, odd ORs high" scheme
        // raced across the pair boundary). The value is 8 in both nibbles,
        // so this also covers a trailing padding nibble when the block count
        // is odd.
        if (!is_scalar_zp && i % 2 == 0) {
            zp[i / 2] = 8 | (8 << 4);
        }
        unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
    });
}
// Extracts (weight, scales, zp) from Q4_1 tensors.
// Data layout is: |16 bit scale|16 bit min|32 x 4bit weights|.
// Extracts (weight, scales, zp) from Q4_1 tensors.
// Data layout is: |16 bit scale|16 bit min|32 x 4bit weights|.
// use_bias selects whether zp_arr receives f16 biases (= min) or packed u4
// zero points (= round(-min/scale)).
void extract_q4_1_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
                       ov::Tensor & zp_arr,
                       bool use_bias) {
    const uint64_t bytes_per_block = 20;  // 2 bytes scale, 2 bytes min, 32x0.5 byte weights
    auto * data = static_cast<uint8_t *>(tensor->data);
    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
    const size_t n_blocks = scales_arr.get_size();
    if (use_bias) {
        // Store bias (min) directly as f16 instead of computing u4 zero points
        auto * bias = zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
        ov::parallel_for(n_blocks, [&](size_t i) {
            float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
            float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
            scales[i] = ov::float16(scale);
            bias[i] = ov::float16(min);  // bias = min, dequant: w*s + bias
            unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
        });
    } else {
        auto * zp = static_cast<uint8_t *>(zp_arr.data());
        // Extract one block (scale + weights), returning its 4-bit zero point.
        auto extract_block = [&](size_t i) -> uint8_t {
            float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
            float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
            scales[i] = ov::float16(scale);
            unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
            // zp = -min / scale (bias = min, so zp = -bias/scale)
            return (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0;
        };
        // Parallelize over zp bytes (= pairs of blocks) instead of single
        // blocks: the previous per-block scheme assigned the low nibble on
        // even i and OR-ed the high nibble on odd i, which raced when the two
        // halves of one byte were processed on different threads. Each byte
        // is now written exactly once by one task.
        ov::parallel_for((n_blocks + 1) / 2, [&](size_t p) {
            const size_t i0 = 2 * p;
            uint8_t packed = extract_block(i0) & 0x0F;  // Lower nibble
            if (i0 + 1 < n_blocks) {
                packed |= (uint8_t) (extract_block(i0 + 1) << 4);  // Upper nibble
            }
            zp[p] = packed;
        });
    }
}
// Extracts (weight, scales, zp) from Q8_0 tensors.
// Data layout is: |16 bit scale|32 x 8bit weights|.
void extract_q8_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr) {
const uint64_t weights_per_block = 32;
const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * zp = static_cast<uint8_t *>(zp_arr.data());
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
// For Q8_0, zero point is always 128
if (is_scalar_zp) {
zp[0] = 128;
}
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
// For asymmetric quantization, store per-block zero points
if (!is_scalar_zp) {
zp[i] = 128;
}
for (size_t j = 0; j < weights_per_block; ++j) {
uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
// Original data is in int8_t, so we add a bias of -128 and invert the first bit.
x ^= 1 << 7;
weights[i * weights_per_block + j] = x;
}
});
}
// Unpacks a 256-weight Q4 super-block: 128 source bytes carry 256 4-bit
// values, organized as four 32-byte sub-chunks. Within each sub-chunk the
// low nibbles of its 32 bytes fill the first 16 output bytes and the high
// nibbles the following 16, two values per output byte.
void unpack_256_4(const uint8_t * data, uint8_t * dst) {
    for (size_t sub = 0; sub < 4; ++sub) {
        const uint8_t * src = data + sub * 32;
        uint8_t * lo = dst + sub * 32;   // packed low nibbles
        uint8_t * hi = lo + 16;          // packed high nibbles
        for (int p = 0; p < 16; ++p) {
            const uint8_t a = src[2 * p];
            const uint8_t b = src[2 * p + 1];
            lo[p] = (uint8_t) ((a & 0x0F) | ((b & 0x0F) << 4));
            hi[p] = (uint8_t) ((a >> 4) | ((b >> 4) << 4));
        }
    }
}
// Extracts (weight, scales, zp/bias) from Q4_K tensors.
// Super-block layout: |f16 d|f16 dmin|12B packed 6-bit scales+mins|128B qs|,
// i.e. 256 4-bit weights in 8 sub-blocks of 32, each with its own scale/min.
void extract_q4_k_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
                       ov::Tensor & zp_arr,
                       bool use_bias) {
    const uint64_t bytes_per_block = 2 + 2 + 12 + 128;
    // NOTE(review): nb[3] is taken as the total byte size of the weight -
    // this assumes a contiguous tensor; confirm for non-contiguous inputs.
    const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
    auto * data = static_cast<uint8_t *>(tensor->data);
    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
    // For bias path, zp_arr holds f16 bias values; for zp path, it holds packed u4 zero points
    auto * zp_u4 = use_bias ? nullptr : static_cast<uint8_t *>(zp_arr.data());
    auto * bias_f16 = use_bias ? zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>() : nullptr;
    ov::parallel_for(n_super_block, [&](size_t i) {
        uint8_t * block_data = data + i * bytes_per_block;
        // Extract scale factors and offsets (super-block wide f16 multipliers)
        float scale_scales = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data)));
        float scale_mins = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 1)));
        // Extract qs1 and qs2: 12 bytes holding 8 six-bit scales and 8 six-bit mins
        uint8_t * qs1 = block_data + 4;
        // Calculate scales: entries 0-3 sit in the low 6 bits of qs1[0..3];
        // entries 4-7 combine a low nibble from qs1[8..11] with the top 2
        // bits of qs1[0..3].
        float scale_vals[8];
        scale_vals[0] = scale_scales * static_cast<float>((*(qs1) & 0b111111));
        scale_vals[1] = scale_scales * static_cast<float>((*(qs1 + 1) & 0b111111));
        scale_vals[2] = scale_scales * static_cast<float>((*(qs1 + 2) & 0b111111));
        scale_vals[3] = scale_scales * static_cast<float>((*(qs1 + 3) & 0b111111));
        scale_vals[4] = scale_scales * static_cast<float>((*(qs1 + 8) & 0b00001111) | ((*(qs1) >> 6) << 4));
        scale_vals[5] = scale_scales * static_cast<float>((*(qs1 + 9) & 0b00001111) | ((*(qs1 + 1) >> 6) << 4));
        scale_vals[6] = scale_scales * static_cast<float>((*(qs1 + 10) & 0b00001111) | ((*(qs1 + 2) >> 6) << 4));
        scale_vals[7] = scale_scales * static_cast<float>((*(qs1 + 11) & 0b00001111) | ((*(qs1 + 3) >> 6) << 4));
        // Calculate min values (bias = -min); same 6-bit packing, using
        // qs1[4..7] for the low bits and the high nibbles of qs1[8..11].
        float min_vals[8];
        min_vals[0] = scale_mins * static_cast<float>((*(qs1 + 4) & 0b111111));
        min_vals[1] = scale_mins * static_cast<float>((*(qs1 + 5) & 0b111111));
        min_vals[2] = scale_mins * static_cast<float>((*(qs1 + 6) & 0b111111));
        min_vals[3] = scale_mins * static_cast<float>((*(qs1 + 7) & 0b111111));
        min_vals[4] = scale_mins * static_cast<float>((*(qs1 + 8) >> 4) | ((*(qs1 + 4) >> 6) << 4));
        min_vals[5] = scale_mins * static_cast<float>((*(qs1 + 9) >> 4) | ((*(qs1 + 5) >> 6) << 4));
        min_vals[6] = scale_mins * static_cast<float>((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4));
        min_vals[7] = scale_mins * static_cast<float>((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4));
        // Store scales and compute zero points or bias
        for (int j = 0; j < 8; j++) {
            scales[i * 8 + j] = ov::float16(scale_vals[j]);
            if (use_bias) {
                // Store bias = -min directly as f16, dequant: w*s + bias
                bias_f16[i * 8 + j] = ov::float16(-min_vals[j]);
            } else {
                // zp = min / scale (since bias = -min and zp = -bias/scale)
                uint8_t zp_val = (scale_vals[j] != 0.0f) ? (uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0;
                // Pack two 4-bit zero points per byte. Both nibbles of any
                // given byte are produced by the same iteration i (idx spans
                // i*8..i*8+7), so this read-modify-write is race-free.
                size_t idx = i * 8 + j;
                if (idx % 2 == 0) {
                    zp_u4[idx / 2] = zp_val & 0x0F;
                } else {
                    zp_u4[idx / 2] |= (zp_val << 4);
                }
            }
        }
        // Unpack the 256 4-bit quants of this super-block.
        unpack_256_4(block_data + 16, weights + i * 128);
    });
}
// Extracts (weight, scales, zp) from Q6_K tensors.
// Super-block layout: |128B ql (low 4 bits)|64B qh (high 2 bits)|16 x int8
// sub-block scales|f16 d|, i.e. 256 6-bit weights in 16 sub-blocks of 16.
void extract_q6_k_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
                       ov::Tensor & zp_arr) {
    const uint64_t bytes_per_block = 128 + 64 + 16 + 2;
    // NOTE(review): nb[3] is taken as the total byte size of the weight -
    // this assumes a contiguous tensor; confirm for non-contiguous inputs.
    const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
    auto * data = static_cast<uint8_t *>(tensor->data);
    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
    auto * zp = static_cast<uint8_t *>(zp_arr.data());
    bool is_scalar_zp = (zp_arr.get_size() == 1);  // Symmetric quantization
    // For Q6_K, zero point is always 32 (values are stored biased by +32)
    if (is_scalar_zp) {
        zp[0] = 32;
    }
    ov::parallel_for(n_super_block, [&](size_t i) {
        uint8_t * block_data = data + i * bytes_per_block;
        // The super-block f16 scale lives at byte offset 208 = 128 + 64 + 16,
        // hence uint16_t pointer + 104.
        float scale_factor =
            static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 104)));  // (128+64+16)/2
        // Effective per-sub-block scale = d * int8 sub-block scale.
        for (size_t j = 0; j < 16; j++) {
            scales[j + i * 16] =
                ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
            // For asymmetric quantization, store per-block zero points
            if (!is_scalar_zp) {
                zp[j + i * 16] = 32;
            }
        }
        // Recombine each 6-bit weight from its 4 low bits (ql nibbles) and
        // 2 high bits (qh bit-pairs); output order matches ggml's Q6_K
        // dequantization.
        uint8_t * ql = block_data;
        uint8_t * qh = block_data + 128;
        for (int64_t j = 0; j < 32; ++j) {
            weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
            weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
            weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4);
            weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4);
            weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4);
            weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4);
            weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4);
            weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
        }
    });
}
// Decodes the j-th 6-bit (scale, min) pair from the packed 12-byte K-quant
// scale block q. Entries 0-3 occupy the low 6 bits of q[0..3] (scales) and
// q[4..7] (mins); entries 4-7 take their low 4 bits from the nibbles of
// q[8..11] and their top 2 bits from q[0..7].
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
    if (j < 4) {
        *d = (uint8_t) (q[j] & 63);
        *m = (uint8_t) (q[j + 4] & 63);
        return;
    }
    const uint8_t packed = q[j + 4];
    *d = (uint8_t) ((packed & 0xF) | ((q[j - 4] >> 6) << 4));
    *m = (uint8_t) ((packed >> 4) | ((q[j] >> 6) << 4));
}
// Extracts (weight, scales, zp/bias) from Q5_K tensors.
// Super-block layout: |f16 d|f16 dmin|12B packed 6-bit scales+mins|32B qh
// (5th bits)|128B ql (low 4 bits)|, i.e. 256 5-bit weights in 8 sub-blocks.
void extract_q5_k_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
                       ov::Tensor & zp_arr,
                       bool use_bias) {
    const uint64_t bytes_per_block = 4 + 12 + 32 + 128;
    // NOTE(review): nb[3] is taken as the total byte size of the weight -
    // this assumes a contiguous tensor; confirm for non-contiguous inputs.
    const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
    auto * data = static_cast<uint8_t *>(tensor->data);
    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
    // For bias path, zp_arr holds f16 bias values; for zp path, it holds u8 zero points
    auto * zp_u8 = use_bias ? nullptr : static_cast<uint8_t *>(zp_arr.data());
    auto * bias_f16 = use_bias ? zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>() : nullptr;
    ov::parallel_for(n_super_block, [&](size_t i) {
        uint8_t * block_data = data + i * bytes_per_block;
        const float d = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data)));
        const float min_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 1)));
        const uint8_t * scales_data = block_data + 4;   // 12 bytes of scales
        const uint8_t * qh = block_data + 4 + 12;       // 32 bytes of high bits
        const uint8_t * ql = block_data + 4 + 12 + 32;  // 128 bytes of low bits
        int is = 0;
        // u1/u2 are the qh bit masks selecting the 5th bit for the current
        // pair of 32-weight sub-blocks; both shift left by 2 each iteration.
        uint8_t u1 = 1;
        uint8_t u2 = 2;
        // Process 2 sub-blocks (64 weights) in one iteration
        for (int j = 0; j < 256; j += 64) {  // 256 = QK_K, so 4 iterations of 64
            uint8_t sc;
            uint8_t m;
            // Get scale and min for first 32 elements
            get_scale_min_k4(is + 0, scales_data, &sc, &m);
            const float d1 = d * sc;
            const float m1 = min_factor * m;
            // Get scale and min for second 32 elements
            get_scale_min_k4(is + 1, scales_data, &sc, &m);
            const float d2 = d * sc;
            const float m2 = min_factor * m;
            scales[i * 8 + is] = ov::float16(d1);
            scales[i * 8 + is + 1] = ov::float16(d2);
            if (use_bias) {
                // Store bias = -min directly as f16, dequant: w*s + bias
                bias_f16[i * 8 + is] = ov::float16(-m1);
                bias_f16[i * 8 + is + 1] = ov::float16(-m2);
            } else {
                // zp = min / scale (since bias = -min and zp = -bias/scale)
                zp_u8[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0;
                zp_u8[i * 8 + is + 1] = (d2 != 0.0f) ? (uint8_t) std::round(m2 / d2) : 0;
            }
            // Extract weights for first 32 elements (matching deq formula exactly)
            for (int l = 0; l < 32; ++l) {
                weights[i * 256 + j + l] = (ql[l] & 0xF) + ((qh[l] & u1) ? 16 : 0);
            }
            // Extract weights for second 32 elements
            for (int l = 0; l < 32; ++l) {
                weights[i * 256 + j + l + 32] = (ql[l] >> 4) + ((qh[l] & u2) ? 16 : 0);
            }
            ql += 32;
            is += 2;
            u1 <<= 2;
            u2 <<= 2;
        }
    });
}
// TODO Reorder for make_intX_weights
// Builds the dequantization subgraph for u8 weights:
//   zero-point path: f32( (w - zp) * s )
//   bias path:       f32( w * s + bias )   (zp tensor then holds f16 biases)
// The weights are viewed as [rows, cols/group_size, group_size] so scales/zp
// broadcast per group, then reshaped back to the original 2D shape.
ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
                                       ov::Tensor & scales,
                                       ov::Tensor & zp,
                                       size_t group_size,
                                       bool use_bias) {
    ov::Shape orig_shape = weight.get_shape();
    // Expand dimensions for scales and zp/bias
    auto scale_shape = scales.get_shape();
    auto zp_shape = zp.get_shape();
    bool is_scalar_zp = zp_shape.empty();  // Symmetric quantization
    ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
    if (packed_shape[1] == 1) {
        // Requantized channel-wise case: a single group per row, no grouping
        // axis needed.
        packed_shape.erase(packed_shape.begin() + 1);
    } else {
        // Append a trailing broadcast axis of size 1 to scales (and zp).
        scale_shape.push_back(1);
        scales.set_shape(scale_shape);
        // For symmetric quantization, zp remains scalar (don't resize)
        if (!is_scalar_zp) {
            zp_shape.push_back(1);
            zp.set_shape(zp_shape);
        }
    }
    // Create graph nodes. The Constant wraps the external buffer without
    // copying; storing the ov::Tensor in rt_info presumably keeps that
    // buffer alive for the node's lifetime - TODO confirm.
    auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, packed_shape,
                                                               static_cast<uint8_t *>(weight.data()), nullptr);
    weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
    auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
    auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
    ov::Output<ov::Node> result;
    if (use_bias && !is_scalar_zp) {
        // Bias path: w * s + b (zp tensor holds f16 bias values)
        auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
        auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
        result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
    } else {
        // Zero point path: (w - zp) * s
        auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
        float zp_value;
        // If all zero points are identical, collapse the tensor to a scalar
        // constant so the subtract broadcasts cheaply.
        if (ov::op::util::get_single_value(zero_point, zp_value)) {
            zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
        }
        auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
        auto w_zp =
            std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
        result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
    }
    if (packed_shape.size() != 2) {
        // If not requantized channel-wise case, reshape back to original shape
        auto final_shape =
            std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape);
        result = std::make_shared<ov::op::v1::Reshape>(result, final_shape, false);
    }
    // Consumers expect f32 weights.
    return std::make_shared<ov::op::v0::Convert>(result, ov::element::f32);
}
// Builds the dequantization subgraph for u4 weights (same structure as
// make_int8_weights, but the weight constant is ov::element::u4):
//   zero-point path: f32( (w - zp) * s )
//   bias path:       f32( w * s + bias )   (zp tensor then holds f16 biases)
ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
                                       ov::Tensor & scales,
                                       ov::Tensor & zp,
                                       size_t group_size,
                                       bool use_bias) {
    ov::Shape orig_weight_shape = weight.get_shape();
    // Expand dimensions for scales and zp/bias
    ov::Shape scale_shape = scales.get_shape();
    auto zp_shape = zp.get_shape();
    bool is_scalar_zp = zp_shape.empty();  // Symmetric quantization
    // Create INT4 weight tensor viewed as [rows, groups, group_size] so the
    // per-group scales/zp broadcast over the trailing axis.
    ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
    if (packed_shape[1] == 1) {
        // Requantized channel-wise case: a single group per row.
        packed_shape.erase(packed_shape.begin() + 1);
    } else {
        // Append a trailing broadcast axis of size 1 to scales (and zp).
        scale_shape.push_back(1);
        scales.set_shape(scale_shape);
        // For symmetric quantization, zp remains scalar (don't resize)
        if (!is_scalar_zp) {
            zp_shape.push_back(1);
            zp.set_shape(zp_shape);
        }
    }
    // The Constant wraps the external buffer without copying; storing the
    // ov::Tensor in rt_info presumably keeps that buffer alive for the
    // node's lifetime - TODO confirm.
    auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
                                                               static_cast<uint8_t *>(weight.data()), nullptr);
    weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
    auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
    auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
    ov::Output<ov::Node> result;
    if (use_bias && !is_scalar_zp) {
        // Bias path: w * s + b (zp tensor holds f16 bias values)
        auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
        auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
        result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
    } else {
        // Zero point path: (w - zp) * s
        auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
        float zp_value;
        // If all zero points are identical, collapse the tensor to a scalar
        // constant so the subtract broadcasts cheaply.
        if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
            zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
        }
        auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
        auto w_zp =
            std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
        result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
    }
    if (packed_shape.size() != 2) {
        // If not requantized channel-wise case, reshape back to original shape
        auto final_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_weight_shape.size()},
                                                                  orig_weight_shape);
        result = std::make_shared<ov::op::v1::Reshape>(result, final_shape, false);
    }
    // Consumers expect f32 weights.
    return std::make_shared<ov::op::v0::Convert>(result, ov::element::f32);
}
// Extract quantized weights from tensor and create weight subgraph
std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
const void * data,
ov::Tensor & weights,
ov::Tensor & scales,
ov::Tensor & zp,
bool use_bias) {
// Create a temporary tensor for extraction functions that read from tensor->data
ggml_tensor temp_tensor = *tensor;
temp_tensor.data = const_cast<void *>(data);
// Determine block size based on tensor type
int64_t weights_per_block;
bool is_u4;
switch (tensor->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_K:
is_u4 = true;
weights_per_block = 32;
break;
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q5_K:
is_u4 = false;
weights_per_block = 32;
break;
case GGML_TYPE_Q6_K:
is_u4 = false;
weights_per_block = 16;
break;
default:
throw std::runtime_error("Unsupported quantized type for extraction: " +
std::string(ggml_type_name(tensor->type)));
}
// Extract quantized data
switch (tensor->type) {
case GGML_TYPE_Q4_0:
extract_q4_0_data(&temp_tensor, weights, scales, zp);
break;
case GGML_TYPE_Q4_1:
extract_q4_1_data(&temp_tensor, weights, scales, zp, use_bias);
break;
case GGML_TYPE_Q4_K:
extract_q4_k_data(&temp_tensor, weights, scales, zp, use_bias);
break;
case GGML_TYPE_Q8_0:
extract_q8_0_data(&temp_tensor, weights, scales, zp);
break;
case GGML_TYPE_Q6_K:
extract_q6_k_data(&temp_tensor, weights, scales, zp);
break;
case GGML_TYPE_Q5_K:
extract_q5_k_data(&temp_tensor, weights, scales, zp, use_bias);
break;
default:
throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type)));
}
// Create the OpenVINO weight subgraph
ov::Output<ov::Node> weight_node;
if (is_u4) {
weight_node = make_int4_weights(weights, scales, zp, weights_per_block, use_bias);
} else {
weight_node = make_int8_weights(weights, scales, zp, weights_per_block, use_bias);
}
auto result = weight_node.get_node_shared_ptr();
result->set_friendly_name(tensor->name);
return result;
}
// Requantize weights to target format, writing to provided buffers
std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
const void * data,
ExtraQuantType requant_type,
int64_t block_size,
ov::Tensor & weights,
ov::Tensor & scales,
ov::Tensor & zp) {
int64_t n_elements = ggml_nelements(tensor);
// First dequantize to F32
std::vector<float> weights_f32(n_elements);
ggml_get_type_traits(tensor->type)->to_float(data, weights_f32.data(), n_elements);
// Handle F16 case - just convert and create constant
if (requant_type == ExtraQuantType::F16) {
ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), n_elements);
auto result = std::make_shared<ov::op::v0::Constant>(weights);
result->set_friendly_name(tensor->name);
return result;
}
// Requantize to target quantized format
bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128);
if (is_u4) {
quantize_q4_0(weights_f32.data(), weights, scales, zp, n_elements, block_size);
} else if (requant_type == ExtraQuantType::Q8_1_C) {
quantize_q8_1(weights_f32.data(), weights, scales, zp, n_elements, block_size);
} else {
quantize_q8_0(weights_f32.data(), weights, scales, zp, n_elements, block_size);
}
// Create the OpenVINO weight subgraph
ov::Output<ov::Node> weight_node;
if (is_u4) {
weight_node = make_int4_weights(weights, scales, zp, block_size);
} else {
weight_node = make_int8_weights(weights, scales, zp, block_size);
}
auto result = weight_node.get_node_shared_ptr();
result->set_friendly_name(tensor->name);
return result;
}
// Builds an OvWeight (ov::Tensor views + pre-built OpenVINO node, plus
// scales/zp for quantized types) for a ggml weight tensor.
// - tensor: source ggml tensor (F32/F16/BF16 or a supported quantized type)
// - data: raw source bytes (may differ from tensor->data)
// - output_base_ptr: optional backend buffer; when given, extracted data is
//   written there and the returned tensors alias that memory
// - use_bias: test-only path storing f16 biases instead of zero points
// Throws for unsupported tensor types.
OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr, bool use_bias) {
    GGML_ASSERT(tensor != nullptr);
    GGML_ASSERT(data != nullptr);
    OvWeight result;
    // Get 2D shape for weights [rows, cols]
    ov::Shape node_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
    // Handle F16/F32/BF16 weights
    if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
        ov::element::Type element_type;
        switch (tensor->type) {
            case GGML_TYPE_F32:
                element_type = ov::element::f32;
                break;
            case GGML_TYPE_F16:
                element_type = ov::element::f16;
                break;
            case GGML_TYPE_BF16:
                element_type = ov::element::bf16;
                break;
            default:
                OPENVINO_THROW("Unexpected tensor type in F16/F32/BF16 path");
        }
        if (output_base_ptr && output_base_ptr != data) {
            // Using external buffer - copy data and create shared-memory constant
            size_t tensor_bytes = ggml_nbytes(tensor);
            memcpy(output_base_ptr, data, tensor_bytes);
            result.weights = ov::Tensor(element_type, node_shape, output_base_ptr);
        } else {
            // Wrap the source memory directly (no copy).
            result.weights = ov::Tensor(element_type, node_shape, data);
        }
        result.weight_node = std::make_shared<ov::op::v0::Constant>(result.weights);
        return result;
    }
    // Handle quantized weights
    if (!ggml_is_quantized(tensor->type)) {
        OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type));
    }
    // Compute where weights/scales/zp will live inside the backend buffer.
    result.layout = ggml_openvino_get_extracted_layout(tensor, use_bias);
    const auto & layout = result.layout;
    if (layout.total_size == 0) {
        OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
    }
    if (use_bias) {
        OPENVINO_ASSERT(!layout.is_requant,
                        "use_bias is only used for test-backend-ops, which should not have requantization");
        // bias node will be created on the fly and not use backend buffer
        output_base_ptr = nullptr;
    }
    // F16 requant path - no separate scales/zp needed in result
    if (layout.is_requant && layout.requant_type.has_value() && layout.requant_type.value() == ExtraQuantType::F16) {
        if (output_base_ptr) {
            // Alias the pre-computed region of the backend buffer.
            result.weights = ov::Tensor(ov::element::f16, node_shape,
                                        static_cast<uint8_t *>(output_base_ptr) + layout.weights_offset);
        } else {
            result.weights = ov::Tensor(ov::element::f16, node_shape);
        }
        ov::Tensor dummy_scales, dummy_zp;  // Not used for F16
        result.weight_node =
            requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, result.weights, dummy_scales, dummy_zp);
        return result;
    }
    // Quantized path (normal extraction or quantized requant)
    // Create weight/scale/zp tensors - shared between both paths
    ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
    ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
    // Symmetric quantization uses a single scalar zero point (empty shape).
    ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
    if (output_base_ptr) {
        // Alias the pre-computed regions of the backend buffer.
        uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
        result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
        result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
        result.zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
    } else {
        // Self-allocating tensors (no backend buffer available).
        result.weights = ov::Tensor(weight_type, node_shape);
        result.scales = ov::Tensor(ov::element::f16, scale_shape);
        if (use_bias && !layout.is_symmetric) {
            // bias only has effect for asymmetric quant
            result.zp = ov::Tensor(ov::element::f16, zp_shape);
        } else {
            result.zp = ov::Tensor(weight_type, zp_shape);
        }
    }
    if (layout.is_requant && layout.requant_type.has_value()) {
        result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block,
                                                   result.weights, result.scales, result.zp);
    } else {
        result.weight_node =
            extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp, use_bias);
    }
    return result;
}
// Quantize F32 values into Q4_0-style u4 data for OpenVINO: one f16 scale
// per block of qk values, weights packed two per byte, zero point fixed at 8.
// x: source floats; k: total element count (must be a multiple of qk);
// qk: block (group) size.
void quantize_q4_0(const float * x,
                   ov::Tensor & weights_arr,
                   ov::Tensor & scales_arr,
                   ov::Tensor & zp_arr,
                   int64_t k,
                   int64_t qk) {
    assert(k % qk == 0);
    const int nb = k / qk;
    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
    auto * zp = static_cast<uint8_t *>(zp_arr.data());
    bool is_scalar_zp = (zp_arr.get_size() == 1);  // Symmetric quantization
    // For Q4_0, zero point is always 8
    if (is_scalar_zp) {
        zp[0] = 8 | (8 << 4);  // Pack two 4-bit values
    }
    for (int i = 0; i < nb; i++) {
        float amax = 0.0f;  // absolute max
        float max = 0.0f;   // signed value at the absolute max (keeps the sign)
        for (int j = 0; j < qk; j++) {
            const float v = x[i * qk + j];
            if (amax < fabsf(v)) {
                amax = fabsf(v);
                max = v;
            }
        }
        // Scale maps the extreme value to -8; its sign is folded into d.
        const float d = max / -8;
        if (d == 0) {
            // All-zero block: use scale 1 and encode every weight as the
            // zero point (8) so dequantization yields exact zeros.
            scales[i] = ov::float16(1.0f);
            // zp is already set to 8 for symmetric, or set per-block for asymmetric
            if (!is_scalar_zp) {
                if (i % 2 == 0) {
                    zp[i / 2] = 8;
                } else {
                    zp[i / 2] |= (8 << 4);
                }
            }
            memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2);
            continue;
        }
        const float id = 1.0f / d;
        scales[i] = ov::float16(d);
        // For asymmetric quantization, store per-block zero points
        if (!is_scalar_zp) {
            if (i % 2 == 0) {
                zp[i / 2] = 8;
            } else {
                zp[i / 2] |= (8 << 4);
            }
        }
        // Quantize and pack two 4-bit values per byte.
        for (int j = 0; j < qk / 2; ++j) {
            const float x0 = x[i * qk + 2 * j] * id;
            const float x1 = x[i * qk + 2 * j + 1] * id;
            // +8.5f biases to unsigned and rounds to nearest; clamp to 15.
            const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
            const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
            weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
        }
    }
}
void quantize_q8_0(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr,
int64_t k,
int64_t qk) {
assert(k % qk == 0);
const int nb = k / qk;
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * zp = static_cast<uint8_t *>(zp_arr.data());
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization
// For Q8_0, zero point is always 128
if (is_scalar_zp) {
zp[0] = 128;
}
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
if (amax < fabsf(v)) {
amax = fabsf(v);
}
}
const float d = amax / 127.0f;
const float id = d ? 1.0f / d : 0.0f;
scales[i] = ov::float16(d);
// For asymmetric quantization, store per-block zero points
if (!is_scalar_zp) {
zp[i] = 128;
}
for (int j = 0; j < qk; ++j) {
const float x0 = x[i * qk + j] * id;
const int8_t xi0 = roundf(x0);
weights[i * qk + j] = (uint8_t) (xi0 + 128);
}
}
}
void quantize_q8_1(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr,
int64_t k,
int64_t qk) {
assert(k % qk == 0);
const int nb = k / qk;
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * zp = static_cast<uint8_t *>(zp_arr.data());
for (int i = 0; i < nb; i++) {
float min = std::numeric_limits<float>::max();
float max = std::numeric_limits<float>::lowest();
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
if (v < min) {
min = v;
}
if (v > max) {
max = v;
}
}
const float d = (max - min) / ((1 << 8) - 1);
const float id = d ? 1.0f / d : 0.0f;
scales[i] = ov::float16(d);
// zp = -min / scale (Q8_1 is asymmetric)
zp[i] = (d != 0.0f) ? (uint8_t) std::round(-min / d) : 0;
for (int j = 0; j < qk; ++j) {
const float x0 = (x[i * qk + j] - min) * id;
const uint8_t xi0 = roundf(x0);
weights[i * qk + j] = xi0;
}
}
}

Some files were not shown because too many files have changed in this diff Show More