mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-19 07:24:07 +00:00
Compare commits
14 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
090b137e56 | ||
|
|
968929528c | ||
|
|
3d26a09dc7 | ||
|
|
bd2a93d475 | ||
|
|
e75ee11024 | ||
|
|
da9b8d3300 | ||
|
|
e443fbcfa5 | ||
|
|
73d284a250 | ||
|
|
df17a4c94f | ||
|
|
1871f0ba56 | ||
|
|
f47edb8c19 | ||
|
|
da143b9940 | ||
|
|
f1768d8f03 | ||
|
|
2da64a2f8a |
@@ -7212,6 +7212,7 @@ class DeepseekModel(TextModel):
|
||||
"DeepseekV3ForCausalLM",
|
||||
"KimiVLForConditionalGeneration",
|
||||
"YoutuForCausalLM",
|
||||
"YoutuVLForConditionalGeneration"
|
||||
)
|
||||
class DeepseekV2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
|
||||
@@ -9955,6 +9956,27 @@ class LFM2Model(TextModel):
|
||||
return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
|
||||
|
||||
|
||||
@ModelBase.register("Lfm2Model")
|
||||
class LFM2ColBertModel(LFM2Model):
|
||||
model_arch = gguf.MODEL_ARCH.LFM2
|
||||
dense_tensor_name = "dense_2"
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if not name.startswith(self.dense_tensor_name):
|
||||
name = "model." + name
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
# dense tensor is stored in a separate safetensors file
|
||||
from safetensors.torch import load_file
|
||||
tensors_file = self.dir_model / "1_Dense" / "model.safetensors"
|
||||
assert tensors_file.is_file()
|
||||
tensor = load_file(tensors_file)["linear.weight"]
|
||||
self.gguf_writer.add_embedding_length_out(tensor.shape[0])
|
||||
yield f"{self.dense_tensor_name}.weight", tensor.clone()
|
||||
|
||||
|
||||
@ModelBase.register("Lfm2MoeForCausalLM")
|
||||
class LFM2MoeModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.LFM2MOE
|
||||
@@ -10674,8 +10696,8 @@ class JanusProVisionModel(MmprojModel):
|
||||
return []
|
||||
|
||||
|
||||
@ModelBase.register("YOUTUVLForConditionalGeneration", "YOUTUVLForCausalLM")
|
||||
class YOUTUVLVisionModel(MmprojModel):
|
||||
@ModelBase.register("YoutuVLForConditionalGeneration")
|
||||
class YoutuVLVisionModel(MmprojModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
|
||||
@@ -22,7 +22,7 @@ Legend:
|
||||
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
|
||||
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
|
||||
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -33,7 +33,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
|
||||
}
|
||||
}
|
||||
|
||||
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
|
||||
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd_out, int embd_norm) {
|
||||
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
@@ -65,8 +65,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
||||
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
|
||||
}
|
||||
|
||||
float * out = output + embd_pos * n_embd;
|
||||
common_embd_normalize(embd, out, n_embd, embd_norm);
|
||||
float * out = output + embd_pos * n_embd_out;
|
||||
common_embd_normalize(embd, out, n_embd_out, embd_norm);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -252,8 +252,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// allocate output
|
||||
const int n_embd = llama_model_n_embd(model);
|
||||
std::vector<float> embeddings(n_embd_count * n_embd, 0);
|
||||
const int n_embd_out = llama_model_n_embd_out(model);
|
||||
std::vector<float> embeddings(n_embd_count * n_embd_out, 0);
|
||||
float * emb = embeddings.data();
|
||||
|
||||
// break into batches
|
||||
@@ -267,8 +267,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// encode if at capacity
|
||||
if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) {
|
||||
float * out = emb + e * n_embd;
|
||||
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
|
||||
float * out = emb + e * n_embd_out;
|
||||
batch_decode(ctx, batch, out, s, n_embd_out, params.embd_normalize);
|
||||
e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
|
||||
s = 0;
|
||||
common_batch_clear(batch);
|
||||
@@ -280,8 +280,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// final batch
|
||||
float * out = emb + e * n_embd;
|
||||
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
|
||||
float * out = emb + e * n_embd_out;
|
||||
batch_decode(ctx, batch, out, s, n_embd_out, params.embd_normalize);
|
||||
|
||||
if (params.embd_out.empty()) {
|
||||
LOG("\n");
|
||||
@@ -289,19 +289,19 @@ int main(int argc, char ** argv) {
|
||||
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||
for (int j = 0; j < n_embd_count; j++) {
|
||||
LOG("embedding %d: ", j);
|
||||
for (int i = 0; i < std::min(3, n_embd); i++) {
|
||||
for (int i = 0; i < std::min(3, n_embd_out); i++) {
|
||||
if (params.embd_normalize == 0) {
|
||||
LOG("%6.0f ", emb[j * n_embd + i]);
|
||||
LOG("%6.0f ", emb[j * n_embd_out + i]);
|
||||
} else {
|
||||
LOG("%9.6f ", emb[j * n_embd + i]);
|
||||
LOG("%9.6f ", emb[j * n_embd_out + i]);
|
||||
}
|
||||
}
|
||||
LOG(" ... ");
|
||||
for (int i = n_embd - 3; i < n_embd; i++) {
|
||||
for (int i = n_embd_out - 3; i < n_embd_out; i++) {
|
||||
if (params.embd_normalize == 0) {
|
||||
LOG("%6.0f ", emb[j * n_embd + i]);
|
||||
LOG("%6.0f ", emb[j * n_embd_out + i]);
|
||||
} else {
|
||||
LOG("%9.6f ", emb[j * n_embd + i]);
|
||||
LOG("%9.6f ", emb[j * n_embd_out + i]);
|
||||
}
|
||||
}
|
||||
LOG("\n");
|
||||
@@ -320,9 +320,9 @@ int main(int argc, char ** argv) {
|
||||
for (uint32_t i = 0; i < n_cls_out; i++) {
|
||||
// NOTE: if you change this log - update the tests in ci/run.sh
|
||||
if (n_cls_out == 1) {
|
||||
LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
|
||||
LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd_out]);
|
||||
} else {
|
||||
LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
|
||||
LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd_out + i], cls_out_labels[i].c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -330,11 +330,11 @@ int main(int argc, char ** argv) {
|
||||
// print the first part of the embeddings or for a single prompt, the full embedding
|
||||
for (int j = 0; j < n_prompts; j++) {
|
||||
LOG("embedding %d: ", j);
|
||||
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
|
||||
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd_out) : n_embd_out); i++) {
|
||||
if (params.embd_normalize == 0) {
|
||||
LOG("%6.0f ", emb[j * n_embd + i]);
|
||||
LOG("%6.0f ", emb[j * n_embd_out + i]);
|
||||
} else {
|
||||
LOG("%9.6f ", emb[j * n_embd + i]);
|
||||
LOG("%9.6f ", emb[j * n_embd_out + i]);
|
||||
}
|
||||
}
|
||||
LOG("\n");
|
||||
@@ -350,7 +350,7 @@ int main(int argc, char ** argv) {
|
||||
LOG("\n");
|
||||
for (int i = 0; i < n_prompts; i++) {
|
||||
for (int j = 0; j < n_prompts; j++) {
|
||||
float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
||||
float sim = common_embd_similarity_cos(emb + i * n_embd_out, emb + j * n_embd_out, n_embd_out);
|
||||
LOG("%6.2f ", sim);
|
||||
}
|
||||
LOG("%1.10s", prompts[i].c_str());
|
||||
@@ -368,9 +368,9 @@ int main(int argc, char ** argv) {
|
||||
if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
|
||||
LOG("[");
|
||||
for (int i = 0;;) { // at least one iteration (n_embd > 0)
|
||||
LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
|
||||
LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd_out + i]);
|
||||
i++;
|
||||
if (i < n_embd) LOG(","); else break;
|
||||
if (i < n_embd_out) LOG(","); else break;
|
||||
}
|
||||
LOG(notArray ? "]\n }" : "]");
|
||||
j++;
|
||||
@@ -383,7 +383,7 @@ int main(int argc, char ** argv) {
|
||||
for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
|
||||
LOG(" [");
|
||||
for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
|
||||
float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
||||
float sim = common_embd_similarity_cos(emb + i * n_embd_out, emb + j * n_embd_out, n_embd_out);
|
||||
LOG("%6.2f", sim);
|
||||
j++;
|
||||
if (j < n_embd_count) LOG(", "); else break;
|
||||
@@ -397,7 +397,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (notArray) LOG("\n}\n");
|
||||
} else if (params.embd_out == "raw") {
|
||||
print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
|
||||
print_raw_embeddings(emb, n_embd_count, n_embd_out, model, pooling_type, params.embd_normalize);
|
||||
}
|
||||
|
||||
LOG("\n");
|
||||
|
||||
@@ -161,9 +161,9 @@ int main(int argc, char ** argv) {
|
||||
std::vector<float> embd_out;
|
||||
|
||||
if (embedding_mode) {
|
||||
const int n_embd = llama_model_n_embd(model);
|
||||
const int n_embd_out = llama_model_n_embd_out(model);
|
||||
const int n_embd_count = pooling_enabled ? 1 : batch.n_tokens;
|
||||
const int n_embeddings = n_embd * n_embd_count;
|
||||
const int n_embeddings = n_embd_out * n_embd_count;
|
||||
float * embeddings;
|
||||
type = "-embeddings";
|
||||
|
||||
@@ -177,7 +177,7 @@ int main(int argc, char ** argv) {
|
||||
embeddings = llama_get_embeddings(ctx);
|
||||
}
|
||||
|
||||
printf("Embedding dimension: %d\n", n_embd);
|
||||
printf("Embedding dimension: %d\n", n_embd_out);
|
||||
printf("\n");
|
||||
|
||||
// Print embeddings in the specified format
|
||||
@@ -185,16 +185,16 @@ int main(int argc, char ** argv) {
|
||||
printf("embedding %d: ", j);
|
||||
|
||||
// Print first 3 values
|
||||
for (int i = 0; i < 3 && i < n_embd; i++) {
|
||||
printf("%9.6f ", embeddings[j * n_embd + i]);
|
||||
for (int i = 0; i < 3 && i < n_embd_out; i++) {
|
||||
printf("%9.6f ", embeddings[j * n_embd_out + i]);
|
||||
}
|
||||
|
||||
printf(" ... ");
|
||||
|
||||
// Print last 3 values
|
||||
for (int i = n_embd - 3; i < n_embd; i++) {
|
||||
for (int i = n_embd_out - 3; i < n_embd_out; i++) {
|
||||
if (i >= 0) {
|
||||
printf("%9.6f ", embeddings[j * n_embd + i]);
|
||||
printf("%9.6f ", embeddings[j * n_embd_out + i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -217,8 +217,8 @@ int main(int argc, char ** argv) {
|
||||
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
|
||||
|
||||
// allocate output
|
||||
const int n_embd = llama_model_n_embd(model);
|
||||
std::vector<float> embeddings(n_chunks * n_embd, 0);
|
||||
const int n_embd_out = llama_model_n_embd_out(model);
|
||||
std::vector<float> embeddings(n_chunks * n_embd_out, 0);
|
||||
float * emb = embeddings.data();
|
||||
|
||||
// break into batches
|
||||
@@ -232,8 +232,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// encode if at capacity
|
||||
if (batch.n_tokens + n_toks > n_batch || s >= llama_n_seq_max(ctx)) {
|
||||
float * out = emb + p * n_embd;
|
||||
batch_process(ctx, batch, out, s, n_embd);
|
||||
float * out = emb + p * n_embd_out;
|
||||
batch_process(ctx, batch, out, s, n_embd_out);
|
||||
common_batch_clear(batch);
|
||||
p += s;
|
||||
s = 0;
|
||||
@@ -245,12 +245,12 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// final batch
|
||||
float * out = emb + p * n_embd;
|
||||
batch_process(ctx, batch, out, s, n_embd);
|
||||
float * out = emb + p * n_embd_out;
|
||||
batch_process(ctx, batch, out, s, n_embd_out);
|
||||
|
||||
// save embeddings to chunks
|
||||
for (int i = 0; i < n_chunks; i++) {
|
||||
chunks[i].embedding = std::vector<float>(emb + i * n_embd, emb + (i + 1) * n_embd);
|
||||
chunks[i].embedding = std::vector<float>(emb + i * n_embd_out, emb + (i + 1) * n_embd_out);
|
||||
// clear tokens as they are no longer needed
|
||||
chunks[i].tokens.clear();
|
||||
}
|
||||
@@ -266,8 +266,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
batch_add_seq(query_batch, query_tokens, 0);
|
||||
|
||||
std::vector<float> query_emb(n_embd, 0);
|
||||
batch_process(ctx, query_batch, query_emb.data(), 1, n_embd);
|
||||
std::vector<float> query_emb(n_embd_out, 0);
|
||||
batch_process(ctx, query_batch, query_emb.data(), 1, n_embd_out);
|
||||
|
||||
common_batch_clear(query_batch);
|
||||
|
||||
@@ -275,7 +275,7 @@ int main(int argc, char ** argv) {
|
||||
{
|
||||
std::vector<std::pair<int, float>> similarities;
|
||||
for (int i = 0; i < n_chunks; i++) {
|
||||
float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
|
||||
float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd_out);
|
||||
similarities.push_back(std::make_pair(i, sim));
|
||||
}
|
||||
|
||||
|
||||
@@ -122,7 +122,7 @@ std::optional<std::string> get_env(const std::string & name) {
|
||||
* @brief Verify whether the environment variable is a valid value.
|
||||
*/
|
||||
bool parse_bool(const std::string & value) {
|
||||
std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
|
||||
static const std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
|
||||
return valid_values.find(value) != valid_values.end();
|
||||
}
|
||||
|
||||
|
||||
@@ -1036,7 +1036,7 @@ struct ggml_tensor_extra_gpu {
|
||||
#define USE_CUDA_GRAPH
|
||||
#endif
|
||||
|
||||
struct ggml_graph_node_properties {
|
||||
struct ggml_cuda_graph_node_properties {
|
||||
void * node_address;
|
||||
ggml_op node_op;
|
||||
int64_t ne[GGML_MAX_DIMS];
|
||||
@@ -1061,10 +1061,25 @@ struct ggml_cuda_graph {
|
||||
std::vector<cudaGraphNode_t> nodes;
|
||||
bool disable_due_to_gpu_arch = false;
|
||||
bool disable_due_to_too_many_updates = false;
|
||||
bool disable_due_to_failed_graph_capture = false;
|
||||
int number_consecutive_updates = 0;
|
||||
bool cuda_graphs_enabled = false;
|
||||
std::vector<ggml_graph_node_properties> ggml_graph_properties;
|
||||
std::vector<ggml_cuda_graph_node_properties> props;
|
||||
|
||||
void record_update(bool use_graph, bool update_required) {
|
||||
if (use_graph && update_required) {
|
||||
number_consecutive_updates++;
|
||||
} else {
|
||||
number_consecutive_updates = 0;
|
||||
}
|
||||
if (number_consecutive_updates >= 4) {
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
|
||||
disable_due_to_too_many_updates = true;
|
||||
}
|
||||
}
|
||||
|
||||
bool is_enabled() const {
|
||||
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
|
||||
return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_updates);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
@@ -11,10 +11,12 @@
|
||||
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
|
||||
|
||||
// log(2) = 0.6931, by adding this to the KQ maximum used for the softmax the numerical range representable
|
||||
// by the VKQ accumulators is effectively being shifted up by a factor of 8.
|
||||
// by the VKQ accumulators is effectively being shifted up by a factor of 2.
|
||||
// This reduces issues with numerical overflow but also causes larger values to be flushed to zero.
|
||||
// However, as the output from FlashAttention will usually be used as an input for a matrix multiplication this should be negligible.
|
||||
#define FATTN_KQ_MAX_OFFSET 0.6931f
|
||||
// Still, the value range should be shifted as much as necessary but as little as possible.
|
||||
// The macro on the following line shifts it by a factor of 2**3=8, as was needed to fix https://github.com/ggml-org/llama.cpp/issues/18606 .
|
||||
#define FATTN_KQ_MAX_OFFSET (3.0f*0.6931f)
|
||||
|
||||
typedef void (* fattn_kernel_t)(
|
||||
const char * __restrict__ Q,
|
||||
|
||||
@@ -2853,9 +2853,9 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||
}
|
||||
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
|
||||
bool use_cuda_graph) {
|
||||
static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
|
||||
|
||||
bool use_cuda_graph = true;
|
||||
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
|
||||
|
||||
const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
|
||||
@@ -2915,41 +2915,41 @@ static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
|
||||
return use_cuda_graph;
|
||||
}
|
||||
|
||||
static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
|
||||
graph_node_properties->node_address = node->data;
|
||||
graph_node_properties->node_op = node->op;
|
||||
static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) {
|
||||
props->node_address = node->data;
|
||||
props->node_op = node->op;
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
graph_node_properties->ne[i] = node->ne[i];
|
||||
graph_node_properties->nb[i] = node->nb[i];
|
||||
props->ne[i] = node->ne[i];
|
||||
props->nb[i] = node->nb[i];
|
||||
}
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
|
||||
props->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
|
||||
}
|
||||
memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
|
||||
memcpy(props->op_params, node->op_params, GGML_MAX_OP_PARAMS);
|
||||
}
|
||||
|
||||
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
|
||||
if (node->data != graph_node_properties->node_address &&
|
||||
static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_graph_node_properties * props) {
|
||||
if (node->data != props->node_address &&
|
||||
node->op != GGML_OP_VIEW) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (node->op != graph_node_properties->node_op) {
|
||||
if (node->op != props->node_op) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (node->ne[i] != graph_node_properties->ne[i]) {
|
||||
if (node->ne[i] != props->ne[i]) {
|
||||
return false;
|
||||
}
|
||||
if (node->nb[i] != graph_node_properties->nb[i]) {
|
||||
if (node->nb[i] != props->nb[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (node->src[i] &&
|
||||
node->src[i]->data != graph_node_properties->src_address[i] &&
|
||||
node->src[i]->data != props->src_address[i] &&
|
||||
node->op != GGML_OP_VIEW
|
||||
) {
|
||||
return false;
|
||||
@@ -2957,44 +2957,55 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
|
||||
}
|
||||
|
||||
if ((node->op == GGML_OP_SCALE || node->op == GGML_OP_GLU) &&
|
||||
memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
|
||||
memcmp(props->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
|
||||
static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
|
||||
|
||||
bool cuda_graph_update_required = false;
|
||||
bool res = false;
|
||||
|
||||
if (cuda_ctx->cuda_graph->instance == nullptr) {
|
||||
cuda_graph_update_required = true;
|
||||
res = true;
|
||||
}
|
||||
|
||||
// Check if the graph size has changed
|
||||
if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
|
||||
cuda_graph_update_required = true;
|
||||
cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
|
||||
if (cuda_ctx->cuda_graph->props.size() != (size_t)cgraph->n_nodes + cgraph->n_leafs) {
|
||||
res = true;
|
||||
cuda_ctx->cuda_graph->props.resize(cgraph->n_nodes + cgraph->n_leafs);
|
||||
}
|
||||
|
||||
// Loop over nodes in GGML graph to determine if CUDA graph update is required
|
||||
// and store properties to allow this comparison for the next token
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
bool has_matching_properties = true;
|
||||
if (!cuda_graph_update_required) {
|
||||
has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
|
||||
bool props_match = true;
|
||||
if (!res) {
|
||||
props_match = ggml_cuda_graph_node_properties_match(cgraph->nodes[i], &cuda_ctx->cuda_graph->props[i]);
|
||||
}
|
||||
if (!has_matching_properties) {
|
||||
cuda_graph_update_required = true;
|
||||
if (!props_match) {
|
||||
res = true;
|
||||
}
|
||||
set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
|
||||
ggml_cuda_graph_node_set_properties(&cuda_ctx->cuda_graph->props[i], cgraph->nodes[i]);
|
||||
}
|
||||
|
||||
return cuda_graph_update_required;
|
||||
for (int i = 0; i < cgraph->n_leafs; i++) {
|
||||
bool props_match= true;
|
||||
if (!res) {
|
||||
props_match = ggml_cuda_graph_node_properties_match(cgraph->leafs[i], &cuda_ctx->cuda_graph->props[cgraph->n_nodes + i]);
|
||||
}
|
||||
if (!props_match) {
|
||||
res = true;
|
||||
}
|
||||
ggml_cuda_graph_node_set_properties(&cuda_ctx->cuda_graph->props[cgraph->n_nodes + i], cgraph->leafs[i]);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
|
||||
static void ggml_cuda_graph_update_executable(ggml_backend_cuda_context * cuda_ctx) {
|
||||
|
||||
#if CUDART_VERSION >= 12000
|
||||
cudaGraphExecUpdateResultInfo result_info;
|
||||
@@ -3225,10 +3236,11 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
||||
return false;
|
||||
}
|
||||
|
||||
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
|
||||
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
|
||||
static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, const bool use_cuda_graph, const bool cuda_graph_update_required) {
|
||||
bool graph_evaluated_or_captured = false;
|
||||
|
||||
// flag used to determine whether it is an integrated_gpu
|
||||
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
|
||||
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
|
||||
|
||||
ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
|
||||
bool is_concurrent_event_active = false;
|
||||
@@ -3698,7 +3710,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
|
||||
}
|
||||
if (cuda_graph_update_required) { // Update graph executable
|
||||
update_cuda_graph_executable(cuda_ctx);
|
||||
ggml_cuda_graph_update_executable(cuda_ctx);
|
||||
}
|
||||
// Launch graph
|
||||
CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
|
||||
@@ -3708,43 +3720,25 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
}
|
||||
}
|
||||
|
||||
static bool ggml_cuda_set_cuda_graph_enabled(ggml_backend_cuda_context * cuda_ctx) {
|
||||
static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx) {
|
||||
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
|
||||
|
||||
// Objects required for CUDA Graph
|
||||
if (cuda_ctx->cuda_graph == nullptr) {
|
||||
cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
|
||||
}
|
||||
|
||||
bool use_cuda_graph = true;
|
||||
|
||||
if (cuda_ctx->cuda_graph->graph == nullptr) {
|
||||
if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
|
||||
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
// Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
|
||||
// or previous graph capture failure.
|
||||
// Also disable for multi-gpu for now. TO DO investigate
|
||||
if (disable_cuda_graphs_due_to_env
|
||||
|| cuda_ctx->cuda_graph->disable_due_to_gpu_arch
|
||||
|| cuda_ctx->cuda_graph->disable_due_to_too_many_updates
|
||||
|| cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
|
||||
use_cuda_graph = false;
|
||||
}
|
||||
|
||||
cuda_ctx->cuda_graph->cuda_graphs_enabled = use_cuda_graph;
|
||||
return cuda_ctx->cuda_graph->is_enabled();
|
||||
#else
|
||||
bool use_cuda_graph = false;
|
||||
return false;
|
||||
#endif // USE_CUDA_GRAPH
|
||||
|
||||
return use_cuda_graph;
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
@@ -3755,30 +3749,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
bool use_cuda_graph = false;
|
||||
bool cuda_graph_update_required = false;
|
||||
|
||||
// graph_optimize calls set_cuda_graph_enabled, in-case it not called (i.e. graph_compute is directly called)
|
||||
// we call it here instead.
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
|
||||
use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx);
|
||||
|
||||
if (use_cuda_graph) {
|
||||
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
|
||||
if (cuda_ctx->cuda_graph->is_enabled()) {
|
||||
cuda_graph_update_required = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
|
||||
use_cuda_graph = ggml_cuda_graph_check_compability(cgraph);
|
||||
|
||||
use_cuda_graph = check_node_graph_compatibility(cgraph, use_cuda_graph);
|
||||
|
||||
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
|
||||
if (use_cuda_graph && cuda_graph_update_required) {
|
||||
cuda_ctx->cuda_graph->number_consecutive_updates++;
|
||||
} else {
|
||||
cuda_ctx->cuda_graph->number_consecutive_updates = 0;
|
||||
}
|
||||
|
||||
if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
|
||||
cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
|
||||
cuda_ctx->cuda_graph->cuda_graphs_enabled = false;
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
|
||||
#endif
|
||||
}
|
||||
cuda_ctx->cuda_graph->record_update(use_cuda_graph, cuda_graph_update_required);
|
||||
}
|
||||
#endif // USE_CUDA_GRAPH
|
||||
|
||||
@@ -3792,9 +3770,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
|
||||
}
|
||||
|
||||
bool graph_evaluated_or_captured = false;
|
||||
|
||||
evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
|
||||
ggml_cuda_graph_evaluate_and_capture(cuda_ctx, cgraph, use_cuda_graph, cuda_graph_update_required);
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -3827,7 +3803,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
|
||||
static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
|
||||
|
||||
const bool use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
|
||||
const bool use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx);
|
||||
|
||||
static bool enable_graph_optimization = [] {
|
||||
const char * env = getenv("GGML_CUDA_GRAPH_OPT");
|
||||
|
||||
@@ -34,13 +34,11 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
// CUDA_GRAPHS_DISABLED
|
||||
((ncols > 65536) &&
|
||||
((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
|
||||
ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
|
||||
ctx.cuda_graph->disable_due_to_failed_graph_capture)) ||
|
||||
ctx.cuda_graph->is_enabled())) ||
|
||||
// CUDA_GRAPHS ENABLED
|
||||
((ncols > 32768) &&
|
||||
!((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
|
||||
ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
|
||||
ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
|
||||
ctx.cuda_graph->is_enabled()))) {
|
||||
#else
|
||||
(ncols > 65536)) {
|
||||
#endif // USE_CUDA_GRAPH
|
||||
|
||||
@@ -333,6 +333,28 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
|
||||
}
|
||||
|
||||
if (amd_wmma_available(cc)) {
|
||||
// RDNA 4 is consistently worse on rocblas
|
||||
// https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
|
||||
if (GGML_CUDA_CC_IS_RDNA3(cc)) {
|
||||
// High expert counts almost always better on MMQ
|
||||
// due to a large amount of graph splits
|
||||
// https://github.com/ggml-org/llama.cpp/pull/18202
|
||||
if (n_experts >= 64) {
|
||||
return true;
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
// These quants are really bad on MMQ
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
// These quants are usually worse but not always
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
return ne11 <= 128;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -101,6 +101,10 @@ void main() {
|
||||
const uint lane = gl_SubgroupInvocationID;
|
||||
|
||||
float probs[experts_per_thread];
|
||||
[[unroll]]
|
||||
for (int i = 0; i < experts_per_thread; i++) {
|
||||
probs[i] = -INFINITY;
|
||||
}
|
||||
|
||||
[[unroll]]
|
||||
for (uint i = 0; i < n_experts; i += WARP_SIZE) {
|
||||
@@ -112,8 +116,9 @@ void main() {
|
||||
softmax_warp_inplace(probs, n_experts, lane, nexperts_use_push);
|
||||
} else if (gating_func == GATING_FUNC_SIGMOID) {
|
||||
[[unroll]]
|
||||
for (int i = 0; i < experts_per_thread; i++) {
|
||||
probs[i] = 1.f / (1.f + exp(-probs[i]));
|
||||
for (uint i = 0; i < n_experts; i += WARP_SIZE) {
|
||||
const uint expert = i + lane;
|
||||
probs[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? 1.f / (1.f + exp(-probs[i / WARP_SIZE])) : -INFINITY;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -150,11 +155,11 @@ void main() {
|
||||
uint max_expert = lane;
|
||||
|
||||
[[unroll]]
|
||||
for (int i = 1; i < experts_per_thread; i++) {
|
||||
const uint expert = lane + i * WARP_SIZE;
|
||||
if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && selection_probs[i] > max_val_s) {
|
||||
max_val = probs[i];
|
||||
max_val_s = selection_probs[i];
|
||||
for (uint i = WARP_SIZE; i < n_experts; i += WARP_SIZE) {
|
||||
const uint expert = i + lane;
|
||||
if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && selection_probs[i / WARP_SIZE] > max_val_s) {
|
||||
max_val = probs[i / WARP_SIZE];
|
||||
max_val_s = selection_probs[i / WARP_SIZE];
|
||||
max_expert = expert;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2273,6 +2273,16 @@ static void ggml_webgpu_init_unary_pipeline(webgpu_context & webgpu_ctx) {
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, wgsl_xielu_inplace_f32, "xielu_inplace_f32", constants);
|
||||
webgpu_ctx->unary_pipelines[GGML_UNARY_OP_XIELU][GGML_TYPE_F16][1] =
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, wgsl_xielu_inplace_f16, "xielu_inplace_f16", constants);
|
||||
|
||||
// CEIL
|
||||
webgpu_ctx->unary_pipelines[GGML_UNARY_OP_CEIL][GGML_TYPE_F32][0] =
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, wgsl_ceil_f32, "ceil_f32", constants);
|
||||
webgpu_ctx->unary_pipelines[GGML_UNARY_OP_CEIL][GGML_TYPE_F16][0] =
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, wgsl_ceil_f16, "ceil_f16", constants);
|
||||
webgpu_ctx->unary_pipelines[GGML_UNARY_OP_CEIL][GGML_TYPE_F32][1] =
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, wgsl_ceil_inplace_f32, "ceil_inplace_f32", constants);
|
||||
webgpu_ctx->unary_pipelines[GGML_UNARY_OP_CEIL][GGML_TYPE_F16][1] =
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->device, wgsl_ceil_inplace_f16, "ceil_inplace_f16", constants);
|
||||
}
|
||||
|
||||
static void ggml_webgpu_init_scale_pipeline(webgpu_context & webgpu_ctx) {
|
||||
@@ -2528,6 +2538,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_XIELU:
|
||||
case GGML_UNARY_OP_CEIL:
|
||||
supports_op = supports_op =
|
||||
(op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type);
|
||||
break;
|
||||
|
||||
@@ -16,7 +16,8 @@
|
||||
"HARDSWISH_FUNC": "{{MUTATE}}[dst_i] = src[src_i] * min(1.0, max(0.0, (src[src_i] + 3.0) / 6.0));",
|
||||
"GELU_FUNC": "{{MUTATE}}[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(sqrt(2.0 / 3.14159265) * (src[src_i] + 0.044715 * pow(src[src_i], 3.0)), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458",
|
||||
"GELU_QUICK_FUNC": "{{MUTATE}}[dst_i] = src[src_i] * 0.5 * (1.0 + tanh(clamp(0.79788456 * (src[src_i] + 0.044715 * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458",
|
||||
"GELU_ERF_FUNC": "{{MUTATE}}[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(0.79788456 * (src[src_i] + 0.044715 * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458"
|
||||
"GELU_ERF_FUNC": "{{MUTATE}}[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(0.79788456 * (src[src_i] + 0.044715 * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458",
|
||||
"CEIL_FUNC": "{{MUTATE}}[dst_i] = ceil(src[src_i]);"
|
||||
}
|
||||
|
||||
#end(REPL_TEMPLATES)
|
||||
@@ -357,6 +358,27 @@
|
||||
"SHADER_NAME": "gelu_erf_inplace_f16",
|
||||
"REPLS": { "TYPE": "f16", "FUNC": "GELU_ERF_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
|
||||
"DECLS": ["INPLACE"]
|
||||
},
|
||||
|
||||
{
|
||||
"SHADER_NAME": "ceil_f32",
|
||||
"REPLS": { "TYPE": "f32", "FUNC": "CEIL_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
|
||||
"DECLS": ["NOT_INPLACE"]
|
||||
},
|
||||
{
|
||||
"SHADER_NAME": "ceil_f16",
|
||||
"REPLS": { "TYPE": "f16", "FUNC": "CEIL_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
|
||||
"DECLS": ["NOT_INPLACE"]
|
||||
},
|
||||
{
|
||||
"SHADER_NAME": "ceil_inplace_f32",
|
||||
"REPLS": { "TYPE": "f32", "FUNC": "CEIL_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
|
||||
"DECLS": ["INPLACE"]
|
||||
},
|
||||
{
|
||||
"SHADER_NAME": "ceil_inplace_f16",
|
||||
"REPLS": { "TYPE": "f16", "FUNC": "CEIL_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
|
||||
"DECLS": ["INPLACE"]
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
@@ -53,13 +53,15 @@
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
// Needed for ggml_fp32_to_bf16_row()
|
||||
#if defined(__AVX512BF16__)
|
||||
#if defined(_MSC_VER)
|
||||
#define m512bh(p) p
|
||||
#define m512i(p) p
|
||||
#else
|
||||
#define m512bh(p) (__m512bh)(p)
|
||||
#include <immintrin.h>
|
||||
#define m512i(p) (__m512i)(p)
|
||||
#endif
|
||||
#endif // defined(_MSC_VER)
|
||||
#endif // defined(__AVX512BF16__)
|
||||
|
||||
#if defined(__linux__) || \
|
||||
defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
|
||||
|
||||
@@ -104,6 +104,7 @@ class Keys:
|
||||
VOCAB_SIZE = "{arch}.vocab_size"
|
||||
CONTEXT_LENGTH = "{arch}.context_length"
|
||||
EMBEDDING_LENGTH = "{arch}.embedding_length"
|
||||
EMBEDDING_LENGTH_OUT = "{arch}.embedding_length_out"
|
||||
FEATURES_LENGTH = "{arch}.features_length"
|
||||
BLOCK_COUNT = "{arch}.block_count"
|
||||
LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count"
|
||||
@@ -3038,6 +3039,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.DENSE_2_OUT, # LFM2-ColBert-350M
|
||||
],
|
||||
MODEL_ARCH.LFM2MOE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
|
||||
@@ -681,6 +681,9 @@ class GGUFWriter:
|
||||
def add_embedding_length(self, length: int) -> None:
|
||||
self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
def add_embedding_length_out(self, length: int) -> None:
|
||||
self.add_uint32(Keys.LLM.EMBEDDING_LENGTH_OUT.format(arch=self.arch), length)
|
||||
|
||||
def add_features_length(self, length: int) -> None:
|
||||
self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ python = ">=3.8"
|
||||
numpy = ">=1.17"
|
||||
tqdm = ">=4.27"
|
||||
pyyaml = ">=5.1"
|
||||
requests = ">=2.25"
|
||||
sentencepiece = { version = ">=0.1.98,<=0.2.0", optional = true }
|
||||
PySide6 = { version = "^6.9", python = ">=3.9,<3.14", optional = true }
|
||||
|
||||
|
||||
@@ -535,6 +535,7 @@ extern "C" {
|
||||
LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
|
||||
|
||||
@@ -152,6 +152,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
|
||||
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
|
||||
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
|
||||
{ LLM_KV_EMBEDDING_LENGTH_OUT, "%s.embedding_length_out" },
|
||||
{ LLM_KV_FEATURES_LENGTH, "%s.features_length" },
|
||||
{ LLM_KV_BLOCK_COUNT, "%s.block_count" },
|
||||
{ LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
|
||||
@@ -2075,6 +2076,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
||||
LLM_TENSOR_TOKEN_EMBD,
|
||||
LLM_TENSOR_OUTPUT_NORM_LFM2,
|
||||
LLM_TENSOR_OUTPUT,
|
||||
LLM_TENSOR_DENSE_2_OUT,
|
||||
};
|
||||
case LLM_ARCH_LFM2MOE:
|
||||
return {
|
||||
|
||||
@@ -156,6 +156,7 @@ enum llm_kv {
|
||||
LLM_KV_VOCAB_SIZE,
|
||||
LLM_KV_CONTEXT_LENGTH,
|
||||
LLM_KV_EMBEDDING_LENGTH,
|
||||
LLM_KV_EMBEDDING_LENGTH_OUT,
|
||||
LLM_KV_FEATURES_LENGTH,
|
||||
LLM_KV_BLOCK_COUNT,
|
||||
LLM_KV_LEADING_DENSE_BLOCK_COUNT,
|
||||
|
||||
@@ -758,7 +758,8 @@ float * llama_context::get_embeddings_ith(int32_t i) {
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
|
||||
}
|
||||
|
||||
return embd + j*model.hparams.n_embd;
|
||||
const uint32_t n_embd_out = model.hparams.get_n_embd_out();
|
||||
return embd + j*n_embd_out;
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
|
||||
#ifndef NDEBUG
|
||||
@@ -1194,9 +1195,10 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
||||
{
|
||||
// extract token embeddings
|
||||
GGML_ASSERT(embd != nullptr);
|
||||
const uint32_t n_embd_out = hparams.get_n_embd_out();
|
||||
|
||||
GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size);
|
||||
ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float));
|
||||
GGML_ASSERT(n_tokens*n_embd_out <= (int64_t) embd_size);
|
||||
ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd_out*sizeof(float));
|
||||
} break;
|
||||
case LLAMA_POOLING_TYPE_MEAN:
|
||||
case LLAMA_POOLING_TYPE_CLS:
|
||||
@@ -1600,12 +1602,13 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
||||
{
|
||||
// extract token embeddings
|
||||
GGML_ASSERT(embd != nullptr);
|
||||
float * embd_out = embd + n_outputs_prev*n_embd;
|
||||
const uint32_t n_embd_out = hparams.get_n_embd_out();
|
||||
float * embd_out = embd + n_outputs_prev*n_embd_out;
|
||||
|
||||
if (n_outputs) {
|
||||
GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
|
||||
GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_size);
|
||||
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float));
|
||||
GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd_out <= (int64_t) embd_size);
|
||||
ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd_out*sizeof(float));
|
||||
}
|
||||
} break;
|
||||
case LLAMA_POOLING_TYPE_MEAN:
|
||||
@@ -1730,9 +1733,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
|
||||
|
||||
const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());
|
||||
|
||||
const auto n_batch = cparams.n_batch;
|
||||
const auto n_vocab = vocab.n_tokens();
|
||||
const auto n_embd = hparams.n_embd;
|
||||
const auto n_batch = cparams.n_batch;
|
||||
const auto n_vocab = vocab.n_tokens();
|
||||
const auto n_embd_out = hparams.get_n_embd_out();
|
||||
|
||||
bool has_logits = true;
|
||||
bool has_embd = cparams.embeddings;
|
||||
@@ -1773,7 +1776,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
|
||||
|
||||
// Allocate CPU logits buffer only if needed by sequences in this batch
|
||||
logits_size = (has_logits && cpu_logits) ? n_vocab*n_outputs_max : 0;
|
||||
embd_size = has_embd ? n_embd*n_outputs_max : 0;
|
||||
embd_size = has_embd ? n_embd_out*n_outputs_max : 0;
|
||||
|
||||
// TODO: avoid this branching by working with the worst-case
|
||||
if (!has_sampling) {
|
||||
|
||||
@@ -1326,6 +1326,10 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
// make sure the produced embeddings are immediately materialized in the ggml graph
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/18599
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
@@ -2067,14 +2071,18 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
|
||||
void llm_graph_context::build_dense_out(
|
||||
ggml_tensor * dense_2,
|
||||
ggml_tensor * dense_3) const {
|
||||
if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
|
||||
if (!cparams.embeddings || !(dense_2 || dense_3)) {
|
||||
return;
|
||||
}
|
||||
ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
|
||||
GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
|
||||
|
||||
cur = ggml_mul_mat(ctx0, dense_2, cur);
|
||||
cur = ggml_mul_mat(ctx0, dense_3, cur);
|
||||
if (dense_2) {
|
||||
cur = ggml_mul_mat(ctx0, dense_2, cur);
|
||||
}
|
||||
if (dense_3) {
|
||||
cur = ggml_mul_mat(ctx0, dense_3, cur);
|
||||
}
|
||||
cb(cur, "result_embd_pooled", -1);
|
||||
res->t_embd_pooled = cur;
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
@@ -72,6 +72,10 @@ uint32_t llama_hparams::n_embd_inp() const {
|
||||
return n_embd_inp;
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::get_n_embd_out() const {
|
||||
return n_embd_out > 0 ? n_embd_out : n_embd;
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
|
||||
const uint32_t n_head_kv = this->n_head_kv(il);
|
||||
|
||||
|
||||
@@ -162,6 +162,9 @@ struct llama_hparams {
|
||||
// for Classifiers
|
||||
uint32_t n_cls_out = 1;
|
||||
|
||||
// output embedding dimension (0 = use n_embd)
|
||||
uint32_t n_embd_out = 0;
|
||||
|
||||
// llama4 smallthinker
|
||||
uint32_t n_moe_layer_step = 0;
|
||||
uint32_t n_no_rope_layer_step = 4;
|
||||
@@ -234,6 +237,9 @@ struct llama_hparams {
|
||||
// dimension of main + auxiliary input embeddings
|
||||
uint32_t n_embd_inp() const;
|
||||
|
||||
// dimension of output embeddings
|
||||
uint32_t get_n_embd_out() const;
|
||||
|
||||
// dimension of key embeddings across all k-v heads
|
||||
uint32_t n_embd_k_gqa(uint32_t il = 0) const;
|
||||
|
||||
|
||||
@@ -146,6 +146,9 @@ void llama_model_saver::add_kv_from_model() {
|
||||
add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
|
||||
add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
|
||||
add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
|
||||
if (hparams.n_embd_out > 0) {
|
||||
add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out);
|
||||
}
|
||||
add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
||||
add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
|
||||
|
||||
@@ -507,6 +507,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
|
||||
ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
|
||||
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
|
||||
ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out, false);
|
||||
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
||||
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
|
||||
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
|
||||
@@ -6469,6 +6470,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
|
||||
}
|
||||
}
|
||||
|
||||
// for LFM2-ColBert-350M
|
||||
dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED);
|
||||
} break;
|
||||
case LLM_ARCH_SMALLTHINKER:
|
||||
{
|
||||
@@ -8003,6 +8007,10 @@ int32_t llama_model_n_embd_inp(const llama_model * model) {
|
||||
return model->hparams.n_embd_inp();
|
||||
}
|
||||
|
||||
int32_t llama_model_n_embd_out(const llama_model * model) {
|
||||
return model->hparams.get_n_embd_out();
|
||||
}
|
||||
|
||||
int32_t llama_model_n_layer(const llama_model * model) {
|
||||
return model->hparams.n_layer;
|
||||
}
|
||||
|
||||
@@ -8184,6 +8184,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_topk_moe({71, 22, 1, 1}, 8, with_norm, bias_probs, gate, scale_w));
|
||||
test_cases.emplace_back(new test_topk_moe({128, 1, 1, 1}, 128, with_norm, bias_probs, gate, scale_w));
|
||||
test_cases.emplace_back(new test_topk_moe({129, 1, 1, 1}, 128, with_norm, bias_probs, gate, scale_w));
|
||||
test_cases.emplace_back(new test_topk_moe({160, 4, 1, 1}, 160, with_norm, bias_probs, gate, scale_w));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1505,9 +1505,9 @@ private:
|
||||
res->n_tokens = slot.task->n_tokens();
|
||||
res->res_type = slot.task->params.res_type;
|
||||
|
||||
const int n_embd = llama_model_n_embd(model);
|
||||
const int n_embd_out = llama_model_n_embd_out(model);
|
||||
|
||||
std::vector<float> embd_res(n_embd, 0.0f);
|
||||
std::vector<float> embd_res(n_embd_out, 0.0f);
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
|
||||
@@ -1524,18 +1524,18 @@ private:
|
||||
if (embd == nullptr) {
|
||||
SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
|
||||
|
||||
res->embedding.push_back(std::vector<float>(n_embd, 0.0f));
|
||||
res->embedding.push_back(std::vector<float>(n_embd_out, 0.0f));
|
||||
continue;
|
||||
}
|
||||
|
||||
// normalize only when there is pooling
|
||||
if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {
|
||||
common_embd_normalize(embd, embd_res.data(), n_embd, slot.task->params.embd_normalize);
|
||||
common_embd_normalize(embd, embd_res.data(), n_embd_out, slot.task->params.embd_normalize);
|
||||
res->embedding.push_back(embd_res);
|
||||
break;
|
||||
}
|
||||
|
||||
res->embedding.emplace_back(embd, embd + n_embd);
|
||||
res->embedding.emplace_back(embd, embd + n_embd_out);
|
||||
}
|
||||
|
||||
SLT_DBG(slot, "%s", "sending embeddings\n");
|
||||
|
||||
@@ -21,11 +21,13 @@
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <winsock2.h>
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <sys/socket.h>
|
||||
#include <netinet/in.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <unistd.h>
|
||||
extern char **environ;
|
||||
#endif
|
||||
|
||||
#if defined(__APPLE__) && defined(__MACH__)
|
||||
@@ -99,6 +101,49 @@ static void unset_reserved_args(common_preset & preset, bool unset_model_args) {
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
static std::string wide_to_utf8(const wchar_t * ws) {
|
||||
if (!ws || !*ws) {
|
||||
return {};
|
||||
}
|
||||
|
||||
const int len = static_cast<int>(std::wcslen(ws));
|
||||
const int bytes = WideCharToMultiByte(CP_UTF8, 0, ws, len, nullptr, 0, nullptr, nullptr);
|
||||
if (bytes == 0) {
|
||||
return {};
|
||||
}
|
||||
|
||||
std::string utf8(bytes, '\0');
|
||||
WideCharToMultiByte(CP_UTF8, 0, ws, len, utf8.data(), bytes, nullptr, nullptr);
|
||||
|
||||
return utf8;
|
||||
}
|
||||
#endif
|
||||
|
||||
static std::vector<std::string> get_environment() {
|
||||
std::vector<std::string> env;
|
||||
|
||||
#ifdef _WIN32
|
||||
LPWCH env_block = GetEnvironmentStringsW();
|
||||
if (!env_block) {
|
||||
return env;
|
||||
}
|
||||
for (LPWCH e = env_block; *e; e += wcslen(e) + 1) {
|
||||
env.emplace_back(wide_to_utf8(e));
|
||||
}
|
||||
FreeEnvironmentStringsW(env_block);
|
||||
#else
|
||||
if (environ == nullptr) {
|
||||
return env;
|
||||
}
|
||||
for (char ** e = environ; *e != nullptr; e++) {
|
||||
env.emplace_back(*e);
|
||||
}
|
||||
#endif
|
||||
|
||||
return env;
|
||||
}
|
||||
|
||||
void server_model_meta::update_args(common_preset_context & ctx_preset, std::string bin_path) {
|
||||
// update params
|
||||
unset_reserved_args(preset, false);
|
||||
@@ -117,14 +162,11 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str
|
||||
server_models::server_models(
|
||||
const common_params & params,
|
||||
int argc,
|
||||
char ** argv,
|
||||
char ** envp)
|
||||
char ** argv)
|
||||
: ctx_preset(LLAMA_EXAMPLE_SERVER),
|
||||
base_params(params),
|
||||
base_env(get_environment()),
|
||||
base_preset(ctx_preset.load_from_args(argc, argv)) {
|
||||
for (char ** env = envp; *env != nullptr; env++) {
|
||||
base_env.push_back(std::string(*env));
|
||||
}
|
||||
// clean up base preset
|
||||
unset_reserved_args(base_preset, true);
|
||||
// set binary path
|
||||
|
||||
@@ -105,7 +105,7 @@ private:
|
||||
void add_model(server_model_meta && meta);
|
||||
|
||||
public:
|
||||
server_models(const common_params & params, int argc, char ** argv, char ** envp);
|
||||
server_models(const common_params & params, int argc, char ** argv);
|
||||
|
||||
void load_models();
|
||||
|
||||
@@ -147,8 +147,8 @@ struct server_models_routes {
|
||||
common_params params;
|
||||
json webui_settings = json::object();
|
||||
server_models models;
|
||||
server_models_routes(const common_params & params, int argc, char ** argv, char ** envp)
|
||||
: params(params), models(params, argc, argv, envp) {
|
||||
server_models_routes(const common_params & params, int argc, char ** argv)
|
||||
: params(params), models(params, argc, argv) {
|
||||
if (!this->params.webui_config_json.empty()) {
|
||||
try {
|
||||
webui_settings = json::parse(this->params.webui_config_json);
|
||||
|
||||
@@ -814,6 +814,15 @@ json server_task_result_cmpl_final::to_json_anthropic() {
|
||||
msg.content = content;
|
||||
}
|
||||
|
||||
// thinking block comes first (Anthropic extended thinking format)
|
||||
if (!msg.reasoning_content.empty()) {
|
||||
content_blocks.push_back({
|
||||
{"type", "thinking"},
|
||||
{"thinking", msg.reasoning_content},
|
||||
{"signature", ""} // empty signature for local models (no cryptographic verification)
|
||||
});
|
||||
}
|
||||
|
||||
if (!msg.content.empty()) {
|
||||
content_blocks.push_back({
|
||||
{"type", "text"},
|
||||
@@ -862,20 +871,57 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
|
||||
stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use";
|
||||
}
|
||||
|
||||
bool has_text = !oaicompat_msg.content.empty();
|
||||
bool has_thinking = !oaicompat_msg.reasoning_content.empty();
|
||||
bool has_text = !oaicompat_msg.content.empty();
|
||||
size_t num_tool_calls = oaicompat_msg.tool_calls.size();
|
||||
|
||||
bool text_block_started = false;
|
||||
// content block indices: thinking (0) -> text (0 or 1) -> tool_use (n+)
|
||||
size_t thinking_block_index = 0;
|
||||
size_t text_block_index = has_thinking ? 1 : 0;
|
||||
|
||||
bool thinking_block_started = false;
|
||||
bool text_block_started = false;
|
||||
std::unordered_set<size_t> tool_calls_started;
|
||||
|
||||
for (const auto & diff : oaicompat_msg_diffs) {
|
||||
// handle thinking/reasoning content
|
||||
if (!diff.reasoning_content_delta.empty()) {
|
||||
if (!thinking_block_started) {
|
||||
events.push_back({
|
||||
{"event", "content_block_start"},
|
||||
{"data", {
|
||||
{"type", "content_block_start"},
|
||||
{"index", thinking_block_index},
|
||||
{"content_block", {
|
||||
{"type", "thinking"},
|
||||
{"thinking", ""}
|
||||
}}
|
||||
}}
|
||||
});
|
||||
thinking_block_started = true;
|
||||
}
|
||||
|
||||
events.push_back({
|
||||
{"event", "content_block_delta"},
|
||||
{"data", {
|
||||
{"type", "content_block_delta"},
|
||||
{"index", thinking_block_index},
|
||||
{"delta", {
|
||||
{"type", "thinking_delta"},
|
||||
{"thinking", diff.reasoning_content_delta}
|
||||
}}
|
||||
}}
|
||||
});
|
||||
}
|
||||
|
||||
// handle regular text content
|
||||
if (!diff.content_delta.empty()) {
|
||||
if (!text_block_started) {
|
||||
events.push_back({
|
||||
{"event", "content_block_start"},
|
||||
{"data", {
|
||||
{"type", "content_block_start"},
|
||||
{"index", 0},
|
||||
{"index", text_block_index},
|
||||
{"content_block", {
|
||||
{"type", "text"},
|
||||
{"text", ""}
|
||||
@@ -889,7 +935,7 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
|
||||
{"event", "content_block_delta"},
|
||||
{"data", {
|
||||
{"type", "content_block_delta"},
|
||||
{"index", 0},
|
||||
{"index", text_block_index},
|
||||
{"delta", {
|
||||
{"type", "text_delta"},
|
||||
{"text", diff.content_delta}
|
||||
@@ -898,8 +944,9 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
|
||||
});
|
||||
}
|
||||
|
||||
// handle tool calls
|
||||
if (diff.tool_call_index != std::string::npos) {
|
||||
size_t content_block_index = (has_text ? 1 : 0) + diff.tool_call_index;
|
||||
size_t content_block_index = (has_thinking ? 1 : 0) + (has_text ? 1 : 0) + diff.tool_call_index;
|
||||
|
||||
if (tool_calls_started.find(diff.tool_call_index) == tool_calls_started.end()) {
|
||||
const auto & full_tool_call = oaicompat_msg.tool_calls[diff.tool_call_index];
|
||||
@@ -935,18 +982,42 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
|
||||
}
|
||||
}
|
||||
|
||||
// close content blocks in order
|
||||
if (has_thinking) {
|
||||
// Anthropic API requires a signature_delta before closing thinking blocks
|
||||
// We use an empty signature since we can't generate a cryptographic signature for local models
|
||||
events.push_back({
|
||||
{"event", "content_block_delta"},
|
||||
{"data", {
|
||||
{"type", "content_block_delta"},
|
||||
{"index", thinking_block_index},
|
||||
{"delta", {
|
||||
{"type", "signature_delta"},
|
||||
{"signature", ""}
|
||||
}}
|
||||
}}
|
||||
});
|
||||
events.push_back({
|
||||
{"event", "content_block_stop"},
|
||||
{"data", {
|
||||
{"type", "content_block_stop"},
|
||||
{"index", thinking_block_index}
|
||||
}}
|
||||
});
|
||||
}
|
||||
|
||||
if (has_text) {
|
||||
events.push_back({
|
||||
{"event", "content_block_stop"},
|
||||
{"data", {
|
||||
{"type", "content_block_stop"},
|
||||
{"index", 0}
|
||||
{"index", text_block_index}
|
||||
}}
|
||||
});
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < num_tool_calls; i++) {
|
||||
size_t content_block_index = (has_text ? 1 : 0) + i;
|
||||
size_t content_block_index = (has_thinking ? 1 : 0) + (has_text ? 1 : 0) + i;
|
||||
events.push_back({
|
||||
{"event", "content_block_stop"},
|
||||
{"data", {
|
||||
@@ -1154,11 +1225,10 @@ json server_task_result_rerank::to_json() {
|
||||
json server_task_result_cmpl_partial::to_json_anthropic() {
|
||||
json events = json::array();
|
||||
bool first = (n_decoded == 1);
|
||||
bool text_block_started = false;
|
||||
// use member variables to track block state across streaming calls
|
||||
// (anthropic_thinking_block_started, anthropic_text_block_started)
|
||||
|
||||
if (first) {
|
||||
text_block_started = false;
|
||||
|
||||
events.push_back({
|
||||
{"event", "message_start"},
|
||||
{"data", {
|
||||
@@ -1180,28 +1250,69 @@ json server_task_result_cmpl_partial::to_json_anthropic() {
|
||||
});
|
||||
}
|
||||
|
||||
// content block indices: thinking (0) -> text (0 or 1) -> tool_use (n+)
|
||||
size_t thinking_block_index = 0;
|
||||
// use anthropic_has_reasoning (set in update()) to know if ANY reasoning was generated
|
||||
size_t text_block_index = anthropic_has_reasoning ? 1 : 0;
|
||||
|
||||
// use local copies of streaming state (copied from task_result_state in update())
|
||||
// these reflect the state BEFORE this chunk was processed
|
||||
bool thinking_started = anthropic_thinking_block_started;
|
||||
bool text_started = anthropic_text_block_started;
|
||||
|
||||
for (const auto & diff : oaicompat_msg_diffs) {
|
||||
if (!diff.content_delta.empty()) {
|
||||
if (!text_block_started) {
|
||||
// handle thinking/reasoning content
|
||||
if (!diff.reasoning_content_delta.empty()) {
|
||||
if (!thinking_started) {
|
||||
events.push_back({
|
||||
{"event", "content_block_start"},
|
||||
{"data", {
|
||||
{"type", "content_block_start"},
|
||||
{"index", 0},
|
||||
{"index", thinking_block_index},
|
||||
{"content_block", {
|
||||
{"type", "text"},
|
||||
{"text", ""}
|
||||
{"type", "thinking"},
|
||||
{"thinking", ""}
|
||||
}}
|
||||
}}
|
||||
});
|
||||
text_block_started = true;
|
||||
thinking_started = true;
|
||||
}
|
||||
|
||||
events.push_back({
|
||||
{"event", "content_block_delta"},
|
||||
{"data", {
|
||||
{"type", "content_block_delta"},
|
||||
{"index", 0},
|
||||
{"index", thinking_block_index},
|
||||
{"delta", {
|
||||
{"type", "thinking_delta"},
|
||||
{"thinking", diff.reasoning_content_delta}
|
||||
}}
|
||||
}}
|
||||
});
|
||||
}
|
||||
|
||||
// handle regular text content
|
||||
if (!diff.content_delta.empty()) {
|
||||
if (!text_started) {
|
||||
events.push_back({
|
||||
{"event", "content_block_start"},
|
||||
{"data", {
|
||||
{"type", "content_block_start"},
|
||||
{"index", text_block_index},
|
||||
{"content_block", {
|
||||
{"type", "text"},
|
||||
{"text", ""}
|
||||
}}
|
||||
}}
|
||||
});
|
||||
text_started = true;
|
||||
}
|
||||
|
||||
events.push_back({
|
||||
{"event", "content_block_delta"},
|
||||
{"data", {
|
||||
{"type", "content_block_delta"},
|
||||
{"index", text_block_index},
|
||||
{"delta", {
|
||||
{"type", "text_delta"},
|
||||
{"text", diff.content_delta}
|
||||
@@ -1210,8 +1321,10 @@ json server_task_result_cmpl_partial::to_json_anthropic() {
|
||||
});
|
||||
}
|
||||
|
||||
// handle tool calls
|
||||
if (diff.tool_call_index != std::string::npos) {
|
||||
size_t content_block_index = (text_block_started ? 1 : 0) + diff.tool_call_index;
|
||||
// use anthropic_has_reasoning for thinking block count (persists across calls)
|
||||
size_t content_block_index = (anthropic_has_reasoning ? 1 : 0) + (text_started ? 1 : 0) + diff.tool_call_index;
|
||||
|
||||
if (!diff.tool_call_delta.name.empty()) {
|
||||
events.push_back({
|
||||
|
||||
@@ -96,6 +96,10 @@ struct task_result_state {
|
||||
std::string generated_text; // append new chunks of generated text here
|
||||
std::vector<std::string> generated_tool_call_ids;
|
||||
|
||||
// for Anthropic API streaming: track content block state across chunks
|
||||
bool anthropic_thinking_block_started = false;
|
||||
bool anthropic_text_block_started = false;
|
||||
|
||||
task_result_state(const common_chat_syntax & oaicompat_chat_syntax)
|
||||
: oaicompat_chat_syntax(oaicompat_chat_syntax) {}
|
||||
|
||||
@@ -337,6 +341,12 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
|
||||
bool is_updated = false;
|
||||
|
||||
// for Anthropic API: track if any reasoning content has been generated
|
||||
bool anthropic_has_reasoning = false;
|
||||
// Streaming state copied from task_result_state for this chunk
|
||||
bool anthropic_thinking_block_started = false;
|
||||
bool anthropic_text_block_started = false;
|
||||
|
||||
virtual bool is_stop() override {
|
||||
return false; // in stream mode, partial responses are not considered stop
|
||||
}
|
||||
@@ -346,6 +356,22 @@ struct server_task_result_cmpl_partial : server_task_result {
|
||||
virtual void update(task_result_state & state) override {
|
||||
is_updated = true;
|
||||
state.update_chat_msg(content, true, oaicompat_msg_diffs);
|
||||
// track if the accumulated message has any reasoning content
|
||||
anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();
|
||||
|
||||
// Copy current state for use in to_json_anthropic() (reflects state BEFORE this chunk)
|
||||
anthropic_thinking_block_started = state.anthropic_thinking_block_started;
|
||||
anthropic_text_block_started = state.anthropic_text_block_started;
|
||||
|
||||
// Pre-compute state updates based on diffs (for next chunk)
|
||||
for (const auto & diff : oaicompat_msg_diffs) {
|
||||
if (!diff.reasoning_content_delta.empty() && !state.anthropic_thinking_block_started) {
|
||||
state.anthropic_thinking_block_started = true;
|
||||
}
|
||||
if (!diff.content_delta.empty() && !state.anthropic_text_block_started) {
|
||||
state.anthropic_text_block_started = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
json to_json_non_oaicompat();
|
||||
|
||||
@@ -66,7 +66,7 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t
|
||||
};
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv, char ** envp) {
|
||||
int main(int argc, char ** argv) {
|
||||
// own arguments required by this example
|
||||
common_params params;
|
||||
|
||||
@@ -126,7 +126,7 @@ int main(int argc, char ** argv, char ** envp) {
|
||||
if (is_router_server) {
|
||||
// setup server instances manager
|
||||
try {
|
||||
models_routes.emplace(params, argc, argv, envp);
|
||||
models_routes.emplace(params, argc, argv);
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("%s: failed to initialize router models: %s\n", __func__, e.what());
|
||||
return 1;
|
||||
|
||||
@@ -805,3 +805,92 @@ def test_anthropic_vs_openai_different_response_format():
|
||||
assert "input_tokens" in anthropic_res.body["usage"]
|
||||
assert "completion_tokens" in openai_res.body["usage"]
|
||||
assert "output_tokens" in anthropic_res.body["usage"]
|
||||
|
||||
|
||||
# Extended thinking tests with reasoning models
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.parametrize("stream", [False, True])
|
||||
def test_anthropic_thinking_with_reasoning_model(stream):
|
||||
"""Test that thinking content blocks are properly returned for reasoning models"""
|
||||
global server
|
||||
server = ServerProcess()
|
||||
server.model_hf_repo = "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF"
|
||||
server.model_hf_file = "DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf"
|
||||
server.reasoning_format = "deepseek"
|
||||
server.jinja = True
|
||||
server.n_ctx = 8192
|
||||
server.n_predict = 1024
|
||||
server.server_port = 8084
|
||||
server.start(timeout_seconds=600) # large model needs time to download
|
||||
|
||||
if stream:
|
||||
res = server.make_stream_request("POST", "/v1/messages", data={
|
||||
"model": "test",
|
||||
"max_tokens": 1024,
|
||||
"thinking": {
|
||||
"type": "enabled",
|
||||
"budget_tokens": 500
|
||||
},
|
||||
"messages": [
|
||||
{"role": "user", "content": "What is 2+2?"}
|
||||
],
|
||||
"stream": True
|
||||
})
|
||||
|
||||
events = list(res)
|
||||
|
||||
# should have thinking content block events
|
||||
thinking_starts = [e for e in events if
|
||||
e.get("type") == "content_block_start" and
|
||||
e.get("content_block", {}).get("type") == "thinking"]
|
||||
assert len(thinking_starts) > 0, "Should have thinking content_block_start event"
|
||||
assert thinking_starts[0]["index"] == 0, "Thinking block should be at index 0"
|
||||
|
||||
# should have thinking_delta events
|
||||
thinking_deltas = [e for e in events if
|
||||
e.get("type") == "content_block_delta" and
|
||||
e.get("delta", {}).get("type") == "thinking_delta"]
|
||||
assert len(thinking_deltas) > 0, "Should have thinking_delta events"
|
||||
|
||||
# should have signature_delta event before thinking block closes (Anthropic API requirement)
|
||||
signature_deltas = [e for e in events if
|
||||
e.get("type") == "content_block_delta" and
|
||||
e.get("delta", {}).get("type") == "signature_delta"]
|
||||
assert len(signature_deltas) > 0, "Should have signature_delta event for thinking block"
|
||||
|
||||
# should have text block after thinking
|
||||
text_starts = [e for e in events if
|
||||
e.get("type") == "content_block_start" and
|
||||
e.get("content_block", {}).get("type") == "text"]
|
||||
assert len(text_starts) > 0, "Should have text content_block_start event"
|
||||
assert text_starts[0]["index"] == 1, "Text block should be at index 1 (after thinking)"
|
||||
else:
|
||||
res = server.make_request("POST", "/v1/messages", data={
|
||||
"model": "test",
|
||||
"max_tokens": 1024,
|
||||
"thinking": {
|
||||
"type": "enabled",
|
||||
"budget_tokens": 500
|
||||
},
|
||||
"messages": [
|
||||
{"role": "user", "content": "What is 2+2?"}
|
||||
]
|
||||
})
|
||||
|
||||
assert res.status_code == 200
|
||||
assert res.body["type"] == "message"
|
||||
|
||||
content = res.body["content"]
|
||||
assert len(content) >= 2, "Should have at least thinking and text blocks"
|
||||
|
||||
# first block should be thinking
|
||||
thinking_blocks = [b for b in content if b.get("type") == "thinking"]
|
||||
assert len(thinking_blocks) > 0, "Should have thinking content block"
|
||||
assert "thinking" in thinking_blocks[0], "Thinking block should have 'thinking' field"
|
||||
assert len(thinking_blocks[0]["thinking"]) > 0, "Thinking content should not be empty"
|
||||
assert "signature" in thinking_blocks[0], "Thinking block should have 'signature' field (Anthropic API requirement)"
|
||||
|
||||
# should also have text block
|
||||
text_blocks = [b for b in content if b.get("type") == "text"]
|
||||
assert len(text_blocks) > 0, "Should have text content block"
|
||||
|
||||
Reference in New Issue
Block a user