Files
llama.cpp/src/llama-ext.h
Bartowski 7992aa7c8e tests : add unit test coverage for llama_tensor_get_type (#20112)
* Add unit test coverage for llama_tensor_get_type

* Fix merge conflicts, add more schemas

* clang formatter changes

* Trailing whitespace

* Update name

* Start rebase

* Updating files with upstream changes prior to rebase

* Changes needed from rebase

* Update attn_qkv schema, change throw behaviour

* Fix merge conflicts

* White space

* Update with latest changes to state counters

* Revert accidental personal CLAUDE.md changes

* Change quotation mark

* Reuse metadata.name since we have it

* Move test-only stuff out of llama-quant.cpp

* Hide the regex functionality back in llama-quant.cpp, use a unique pointer to a new struct 'compiled_tensor_type_patterns' which contains the patterns

* cont : inital deslop guidelines

* Cleanup based on review comments

* Continue cleanup

* Small cleanup

* Manually set proper ordering of tensors, mostly applies to gemma

* Formatting

* Update tests/test-quant-type-selection.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Fix merge conflicts

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-02 22:53:58 +02:00

57 lines
1.8 KiB
C++

#pragma once
#include "llama.h"
#include <cstdint>
// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
struct llama_context * ctx,
uint32_t n_tokens,
uint32_t n_seqs,
uint32_t n_outputs);
// Get the default ggml_type for a given ftype.
LLAMA_API ggml_type llama_ftype_get_default_type(llama_ftype ftype);
// Quantization state.
struct quantize_state_impl;
LLAMA_API quantize_state_impl * llama_quant_init(
const llama_model * model,
const llama_model_quantize_params * params);
LLAMA_API void llama_quant_free(quantize_state_impl * qs);
// Descriptor for constructing a mock model for quantization testing.
struct llama_quant_model_desc {
const char * architecture;
uint32_t n_embd;
uint32_t n_ff;
uint32_t n_layer;
uint32_t n_head;
uint32_t n_head_kv;
uint32_t n_expert;
uint32_t n_embd_head_k;
uint32_t n_embd_head_v;
};
// Create a mock model from a metadata descriptor (for testing).
// The returned model must be freed with llama_model_free().
LLAMA_API llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc);
// Returns true if this tensor should be quantized (based on name, dims, params).
LLAMA_API bool llama_quant_tensor_allows_quantization(
const quantize_state_impl * qs,
const ggml_tensor * tensor);
// Compute quantization type assignments for a list of tensors.
// All tensors should be quantizable (use llama_quant_tensor_allows_quantization to filter).
// result_types: caller-allocated array of n_tensors elements, filled with assigned types.
LLAMA_API void llama_quant_compute_types(
quantize_state_impl * qs,
llama_ftype ftype,
ggml_tensor ** tensors,
ggml_type * result_types,
size_t n_tensors);