mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-21 08:24:06 +00:00
Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7538246e7c | ||
|
|
b32efad2bc | ||
|
|
a19b5cef16 | ||
|
|
78a1ba0a4f | ||
|
|
2dabf759e7 | ||
|
|
1d343b4069 | ||
|
|
8ca6e1c3a4 | ||
|
|
656babd6c2 | ||
|
|
a226bc7a9a | ||
|
|
1466621e73 | ||
|
|
82974011f3 | ||
|
|
4ccea213bc |
@@ -163,6 +163,8 @@ struct common_hf_file_res {
|
||||
# if !defined(PATH_MAX)
|
||||
# define PATH_MAX MAX_PATH
|
||||
# endif
|
||||
#elif defined(_AIX)
|
||||
#include <sys/limits.h>
|
||||
#else
|
||||
#include <sys/syslimits.h>
|
||||
#endif
|
||||
|
||||
@@ -714,6 +714,9 @@ class Model:
|
||||
if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
|
||||
# ref: https://huggingface.co/inclusionAI/Ling-lite
|
||||
res = "bailingmoe"
|
||||
if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
|
||||
# ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
|
||||
res = "llama4"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -1608,6 +1611,7 @@ class StableLMModel(Model):
|
||||
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
|
||||
class LlamaModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||
undo_permute = True
|
||||
|
||||
def set_vocab(self):
|
||||
try:
|
||||
@@ -1672,10 +1676,11 @@ class LlamaModel(Model):
|
||||
n_head = self.hparams["num_attention_heads"]
|
||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
|
||||
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||
if self.undo_permute:
|
||||
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||
|
||||
# process the experts separately
|
||||
if name.find("block_sparse_moe.experts") != -1:
|
||||
@@ -1752,6 +1757,61 @@ class LlamaModel(Model):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@Model.register("Llama4ForConditionalGeneration")
|
||||
class Llama4Model(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA4
|
||||
has_vision: bool = False
|
||||
undo_permute = False
|
||||
|
||||
# TODO @ngxson : avoid duplicate this code everywhere by at least support "text_config"
|
||||
# same with llama, but we need to merge the text_config into the root level of hparams
|
||||
def __init__(self, *args, **kwargs):
|
||||
hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
|
||||
if "text_config" in hparams:
|
||||
hparams = {**hparams, **hparams["text_config"]}
|
||||
kwargs["hparams"] = hparams
|
||||
super().__init__(*args, **kwargs)
|
||||
if "vision_config" in hparams:
|
||||
logger.info("Has vision encoder, but it will be ignored")
|
||||
self.has_vision = True
|
||||
# IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this
|
||||
self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"]
|
||||
self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"]
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_gpt2()
|
||||
self.gguf_writer.add_add_bos_token(True)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
|
||||
self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
||||
name = name.replace("language_model.", "")
|
||||
name = name.replace("feed_forward.", "mlp.") # a bit hacky for now
|
||||
name = name.replace(".router.weight", ".gate.weight") # a bit hacky for now
|
||||
|
||||
# split the gate_up into gate and up
|
||||
if "gate_up_proj" in name:
|
||||
name_up = name.replace("gate_up_proj", "up_proj.weight")
|
||||
name_gate = name.replace("gate_up_proj", "gate_proj.weight")
|
||||
dim_half = data_torch.shape[-1] // 2
|
||||
gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2)
|
||||
return [
|
||||
(self.map_tensor_name(name_gate), gate_proj_weight),
|
||||
(self.map_tensor_name(name_up), up_proj_weight)
|
||||
]
|
||||
|
||||
if name.endswith("down_proj"):
|
||||
name += ".weight"
|
||||
data_torch = data_torch.transpose(-1, -2)
|
||||
|
||||
if "multi_modal_projector" in name or "vision_model" in name:
|
||||
return []
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@Model.register("Mistral3ForConditionalGeneration")
|
||||
class Mistral3Model(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||
|
||||
@@ -113,6 +113,7 @@ models = [
|
||||
{"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
|
||||
{"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
|
||||
{"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
|
||||
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -380,6 +380,7 @@ struct clip_ctx {
|
||||
if (backend_cpu != backend) {
|
||||
ggml_backend_free(backend_cpu);
|
||||
}
|
||||
clip_image_size_free(load_image_size);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1618,6 +1619,12 @@ struct clip_image_f32 * clip_image_f32_init() {
|
||||
return new clip_image_f32();
|
||||
}
|
||||
|
||||
void clip_image_size_free(struct clip_image_size * load_image_size) {
|
||||
if (load_image_size == nullptr) {
|
||||
return;
|
||||
}
|
||||
delete load_image_size;
|
||||
}
|
||||
void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
|
||||
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
|
||||
void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
|
||||
@@ -2270,6 +2277,9 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
|
||||
}
|
||||
|
||||
void clip_free(clip_ctx * ctx) {
|
||||
if (ctx == nullptr) {
|
||||
return;
|
||||
}
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
@@ -2840,10 +2850,19 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
|
||||
bool clip_is_glm(const struct clip_ctx * ctx) {
|
||||
return ctx->has_glm_projector;
|
||||
}
|
||||
|
||||
bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
|
||||
return ctx->has_qwen2vl_merger;
|
||||
}
|
||||
|
||||
bool clip_is_llava(const struct clip_ctx * ctx) {
|
||||
return ctx->has_llava_projector;
|
||||
}
|
||||
|
||||
bool clip_is_gemma3(const struct clip_ctx * ctx) {
|
||||
return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
|
||||
}
|
||||
|
||||
// Determine the number of encoder layers to iterate over
|
||||
int get_deepest_feature_layer(const struct clip_ctx * ctx) {
|
||||
// Get the index of the second to last layer; this is the
|
||||
|
||||
@@ -77,6 +77,7 @@ CLIP_API struct clip_image_size * clip_image_size_init();
|
||||
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
|
||||
CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
||||
|
||||
CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
|
||||
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
|
||||
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
|
||||
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
|
||||
@@ -106,6 +107,8 @@ CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out
|
||||
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
|
||||
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
|
||||
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
|
||||
CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
|
||||
CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);
|
||||
|
||||
|
||||
@@ -851,7 +851,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
|
||||
|
||||
LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
|
||||
|
||||
LOG("\ntask\tacc_norm\n");
|
||||
LOG("\ntask\tacc_norm\t95%% confidence interval\n");
|
||||
|
||||
double acc = 0.0f;
|
||||
|
||||
@@ -985,8 +985,22 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
|
||||
acc += 1.0;
|
||||
}
|
||||
|
||||
// Print the accumulated accuracy mean x 100
|
||||
LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
|
||||
double freq = acc / double(i + 1);
|
||||
|
||||
const double za = 1.95996398454;
|
||||
|
||||
// // Wald normal approx
|
||||
// double conf =za*sqrt(freq*(1-freq)/double(i + 1));
|
||||
// LOG("%zu\t%.8lf +/- %.8lf\n", i + 1, freq*100.0, conf*100.0);
|
||||
|
||||
// Wilson score interval, more accurate
|
||||
double z = za * za / double(i + 1);
|
||||
double cnf = z * sqrt(double(i + 1) * (4.0 * freq * (1 - freq) + z)) / (za + za);
|
||||
double a = (freq + z * 0.5 - cnf) / (1.0 + z);
|
||||
double b = (freq + z * 0.5 + cnf) / (1.0 + z);
|
||||
|
||||
// Print the accumulated accuracy mean x 100 and confidence interval
|
||||
LOG("%zu\t%3.8lf%%\t[%3.4lf%%, %3.4lf%%]\n", i + 1, freq * 100.0, a * 100.0, b * 100.0);
|
||||
}
|
||||
|
||||
i0 = i1 - 1;
|
||||
|
||||
Binary file not shown.
@@ -1705,6 +1705,8 @@ private:
|
||||
};
|
||||
|
||||
struct server_response {
|
||||
bool running = true;
|
||||
|
||||
// for keeping track of all tasks waiting for the result
|
||||
std::unordered_set<int> waiting_task_ids;
|
||||
|
||||
@@ -1759,6 +1761,10 @@ struct server_response {
|
||||
while (true) {
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
condition_results.wait(lock, [&]{
|
||||
if (!running) {
|
||||
SRV_DBG("%s : queue result stop\n", __func__);
|
||||
std::terminate(); // we cannot return here since the caller is HTTP code
|
||||
}
|
||||
return !queue_results.empty();
|
||||
});
|
||||
|
||||
@@ -1789,6 +1795,10 @@ struct server_response {
|
||||
}
|
||||
|
||||
std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
|
||||
if (!running) {
|
||||
SRV_DBG("%s : queue result stop\n", __func__);
|
||||
std::terminate(); // we cannot return here since the caller is HTTP code
|
||||
}
|
||||
if (cr_res == std::cv_status::timeout) {
|
||||
return nullptr;
|
||||
}
|
||||
@@ -1818,6 +1828,12 @@ struct server_response {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// terminate the waiting loop
|
||||
void terminate() {
|
||||
running = false;
|
||||
condition_results.notify_all();
|
||||
}
|
||||
};
|
||||
|
||||
struct server_context {
|
||||
@@ -4491,9 +4507,10 @@ int main(int argc, char ** argv) {
|
||||
svr->new_task_queue = [¶ms] { return new httplib::ThreadPool(params.n_threads_http); };
|
||||
|
||||
// clean up function, to be called before exit
|
||||
auto clean_up = [&svr]() {
|
||||
auto clean_up = [&svr, &ctx_server]() {
|
||||
SRV_INF("%s: cleaning up before exit...\n", __func__);
|
||||
svr->stop();
|
||||
ctx_server.queue_results.terminate();
|
||||
llama_backend_free();
|
||||
};
|
||||
|
||||
@@ -4534,7 +4551,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (!ctx_server.load_model(params)) {
|
||||
clean_up();
|
||||
// t.join(); // FIXME: see below
|
||||
t.join();
|
||||
LOG_ERR("%s: exiting due to model loading error\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -4582,7 +4599,7 @@ int main(int argc, char ** argv) {
|
||||
ctx_server.queue_tasks.start_loop();
|
||||
|
||||
clean_up();
|
||||
// t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
|
||||
t.join();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -49,6 +49,26 @@ def test_embedding_multiple():
|
||||
assert len(d['embedding']) > 1
|
||||
|
||||
|
||||
def test_embedding_multiple_with_fa():
|
||||
server = ServerPreset.bert_bge_small_with_fa()
|
||||
server.pooling = 'last'
|
||||
server.start()
|
||||
# one of these should trigger the FA branch (i.e. context size % 256 == 0)
|
||||
res = server.make_request("POST", "/v1/embeddings", data={
|
||||
"input": [
|
||||
"a "*253,
|
||||
"b "*254,
|
||||
"c "*255,
|
||||
"d "*256,
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert len(res.body['data']) == 4
|
||||
for d in res.body['data']:
|
||||
assert 'embedding' in d
|
||||
assert len(d['embedding']) > 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"input,is_multi_prompt",
|
||||
[
|
||||
|
||||
@@ -323,6 +323,21 @@ class ServerPreset:
|
||||
server.server_embeddings = True
|
||||
return server
|
||||
|
||||
@staticmethod
|
||||
def bert_bge_small_with_fa() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
|
||||
server.model_alias = "bert-bge-small"
|
||||
server.n_ctx = 1024
|
||||
server.n_batch = 300
|
||||
server.n_ubatch = 300
|
||||
server.n_slots = 2
|
||||
server.fa = True
|
||||
server.seed = 42
|
||||
server.server_embeddings = True
|
||||
return server
|
||||
|
||||
@staticmethod
|
||||
def tinyllama_infill() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { useEffect, useMemo, useRef, useState } from 'react';
|
||||
import { useEffect, useMemo, useState } from 'react';
|
||||
import { CallbackGeneratedChunk, useAppContext } from '../utils/app.context';
|
||||
import ChatMessage from './ChatMessage';
|
||||
import { CanvasType, Message, PendingMessage } from '../utils/types';
|
||||
@@ -6,6 +6,7 @@ import { classNames, cleanCurrentUrl, throttle } from '../utils/misc';
|
||||
import CanvasPyInterpreter from './CanvasPyInterpreter';
|
||||
import StorageUtils from '../utils/storage';
|
||||
import { useVSCodeContext } from '../utils/llama-vscode';
|
||||
import { useChatTextarea, ChatTextareaApi } from './useChatTextarea.ts';
|
||||
|
||||
/**
|
||||
* A message display is a message node with additional information for rendering.
|
||||
@@ -99,7 +100,8 @@ export default function ChatScreen() {
|
||||
canvasData,
|
||||
replaceMessageAndGenerate,
|
||||
} = useAppContext();
|
||||
const textarea = useOptimizedTextarea(prefilledMsg.content());
|
||||
|
||||
const textarea: ChatTextareaApi = useChatTextarea(prefilledMsg.content());
|
||||
|
||||
const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
|
||||
// TODO: improve this when we have "upload file" feature
|
||||
@@ -248,14 +250,16 @@ export default function ChatScreen() {
|
||||
</div>
|
||||
|
||||
{/* chat input */}
|
||||
<div className="flex flex-row items-center pt-8 pb-6 sticky bottom-0 bg-base-100">
|
||||
<div className="flex flex-row items-end pt-8 pb-6 sticky bottom-0 bg-base-100">
|
||||
<textarea
|
||||
className="textarea textarea-bordered w-full"
|
||||
// Default (mobile): Enable vertical resize, overflow auto for scrolling if needed
|
||||
// Large screens (lg:): Disable manual resize, apply max-height for autosize limit
|
||||
className="textarea textarea-bordered w-full resize-vertical lg:resize-none lg:max-h-48 lg:overflow-y-auto" // Adjust lg:max-h-48 as needed (e.g., lg:max-h-60)
|
||||
placeholder="Type a message (Shift+Enter to add a new line)"
|
||||
ref={textarea.ref}
|
||||
onInput={textarea.onInput} // Hook's input handler (will only resize height on lg+ screens)
|
||||
onKeyDown={(e) => {
|
||||
if (e.nativeEvent.isComposing || e.keyCode === 229) return;
|
||||
if (e.key === 'Enter' && e.shiftKey) return;
|
||||
if (e.key === 'Enter' && !e.shiftKey) {
|
||||
e.preventDefault();
|
||||
sendNewMessage();
|
||||
@@ -263,7 +267,11 @@ export default function ChatScreen() {
|
||||
}}
|
||||
id="msg-input"
|
||||
dir="auto"
|
||||
// Set a base height of 2 rows for mobile views
|
||||
// On lg+ screens, the hook will calculate and set the initial height anyway
|
||||
rows={2}
|
||||
></textarea>
|
||||
|
||||
{isGenerating(currConvId ?? '') ? (
|
||||
<button
|
||||
className="btn btn-neutral ml-2"
|
||||
@@ -286,43 +294,3 @@ export default function ChatScreen() {
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
export interface OptimizedTextareaValue {
|
||||
value: () => string;
|
||||
setValue: (value: string) => void;
|
||||
focus: () => void;
|
||||
ref: React.RefObject<HTMLTextAreaElement>;
|
||||
}
|
||||
|
||||
// This is a workaround to prevent the textarea from re-rendering when the inner content changes
|
||||
// See https://github.com/ggml-org/llama.cpp/pull/12299
|
||||
function useOptimizedTextarea(initValue: string): OptimizedTextareaValue {
|
||||
const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
|
||||
const textareaRef = useRef<HTMLTextAreaElement>(null);
|
||||
|
||||
useEffect(() => {
|
||||
if (textareaRef.current && savedInitValue) {
|
||||
textareaRef.current.value = savedInitValue;
|
||||
setSavedInitValue('');
|
||||
}
|
||||
}, [textareaRef, savedInitValue, setSavedInitValue]);
|
||||
|
||||
return {
|
||||
value: () => {
|
||||
return textareaRef.current?.value ?? savedInitValue;
|
||||
},
|
||||
setValue: (value: string) => {
|
||||
if (textareaRef.current) {
|
||||
textareaRef.current.value = value;
|
||||
}
|
||||
},
|
||||
focus: () => {
|
||||
if (textareaRef.current) {
|
||||
// focus and move the cursor to the end
|
||||
textareaRef.current.focus();
|
||||
textareaRef.current.selectionStart = textareaRef.current.value.length;
|
||||
}
|
||||
},
|
||||
ref: textareaRef,
|
||||
};
|
||||
}
|
||||
|
||||
96
examples/server/webui/src/components/useChatTextarea.ts
Normal file
96
examples/server/webui/src/components/useChatTextarea.ts
Normal file
@@ -0,0 +1,96 @@
|
||||
import { useEffect, useRef, useState, useCallback } from 'react';
|
||||
|
||||
// Media Query for detecting "large" screens (matching Tailwind's lg: breakpoint)
|
||||
const LARGE_SCREEN_MQ = '(min-width: 1024px)';
|
||||
|
||||
// Calculates and sets the textarea height based on its scrollHeight
|
||||
const adjustTextareaHeight = (textarea: HTMLTextAreaElement | null) => {
|
||||
if (!textarea) return;
|
||||
|
||||
// Only perform auto-sizing on large screens
|
||||
if (!window.matchMedia(LARGE_SCREEN_MQ).matches) {
|
||||
// On small screens, reset inline height and max-height styles.
|
||||
// This allows CSS (e.g., `rows` attribute or classes) to control the height,
|
||||
// and enables manual resizing if `resize-vertical` is set.
|
||||
textarea.style.height = ''; // Use 'auto' or '' to reset
|
||||
textarea.style.maxHeight = '';
|
||||
return; // Do not adjust height programmatically on small screens
|
||||
}
|
||||
|
||||
const computedStyle = window.getComputedStyle(textarea);
|
||||
// Get the max-height specified by CSS (e.g., from `lg:max-h-48`)
|
||||
const currentMaxHeight = computedStyle.maxHeight;
|
||||
|
||||
// Temporarily remove max-height to allow scrollHeight to be calculated correctly
|
||||
textarea.style.maxHeight = 'none';
|
||||
// Reset height to 'auto' to measure the actual scrollHeight needed
|
||||
textarea.style.height = 'auto';
|
||||
// Set the height to the calculated scrollHeight
|
||||
textarea.style.height = `${textarea.scrollHeight}px`;
|
||||
// Re-apply the original max-height from CSS to enforce the limit
|
||||
textarea.style.maxHeight = currentMaxHeight;
|
||||
};
|
||||
|
||||
// Interface describing the API returned by the hook
|
||||
export interface ChatTextareaApi {
|
||||
value: () => string;
|
||||
setValue: (value: string) => void;
|
||||
focus: () => void;
|
||||
ref: React.RefObject<HTMLTextAreaElement>;
|
||||
onInput: (event: React.FormEvent<HTMLTextAreaElement>) => void; // Input handler
|
||||
}
|
||||
|
||||
// This is a workaround to prevent the textarea from re-rendering when the inner content changes
|
||||
// See https://github.com/ggml-org/llama.cpp/pull/12299
|
||||
// combined now with auto-sizing logic.
|
||||
export function useChatTextarea(initValue: string): ChatTextareaApi {
|
||||
const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
|
||||
const textareaRef = useRef<HTMLTextAreaElement>(null);
|
||||
|
||||
// Effect to set initial value and height on mount or when initValue changes
|
||||
useEffect(() => {
|
||||
const textarea = textareaRef.current;
|
||||
if (textarea) {
|
||||
if (typeof savedInitValue === 'string' && savedInitValue.length > 0) {
|
||||
textarea.value = savedInitValue;
|
||||
// Call adjustTextareaHeight - it will check screen size internally
|
||||
setTimeout(() => adjustTextareaHeight(textarea), 0);
|
||||
setSavedInitValue(''); // Reset after applying
|
||||
} else {
|
||||
// Adjust height even if there's no initial value (for initial render)
|
||||
setTimeout(() => adjustTextareaHeight(textarea), 0);
|
||||
}
|
||||
}
|
||||
}, [textareaRef, savedInitValue]); // Depend on ref and savedInitValue
|
||||
|
||||
const handleInput = useCallback(
|
||||
(event: React.FormEvent<HTMLTextAreaElement>) => {
|
||||
// Call adjustTextareaHeight on every input - it will decide whether to act
|
||||
adjustTextareaHeight(event.currentTarget);
|
||||
},
|
||||
[]
|
||||
);
|
||||
|
||||
return {
|
||||
// Method to get the current value directly from the textarea
|
||||
value: () => {
|
||||
return textareaRef.current?.value ?? '';
|
||||
},
|
||||
// Method to programmatically set the value and trigger height adjustment
|
||||
setValue: (value: string) => {
|
||||
const textarea = textareaRef.current;
|
||||
if (textarea) {
|
||||
textarea.value = value;
|
||||
// Call adjustTextareaHeight - it will check screen size internally
|
||||
setTimeout(() => adjustTextareaHeight(textarea), 0);
|
||||
}
|
||||
},
|
||||
focus: () => {
|
||||
if (textareaRef.current) {
|
||||
textareaRef.current.focus();
|
||||
}
|
||||
},
|
||||
ref: textareaRef,
|
||||
onInput: handleInput,
|
||||
};
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
import { useEffect, useState } from 'react';
|
||||
import { MessageExtraContext } from './types';
|
||||
import { OptimizedTextareaValue } from '../components/ChatScreen';
|
||||
import { ChatTextareaApi } from '../components/useChatTextarea.ts';
|
||||
|
||||
// Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe
|
||||
// Ref: https://github.com/ggml-org/llama.cpp/pull/11940
|
||||
@@ -15,7 +15,7 @@ interface SetTextEvData {
|
||||
* window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n return 123' }, '*');
|
||||
*/
|
||||
|
||||
export const useVSCodeContext = (textarea: OptimizedTextareaValue) => {
|
||||
export const useVSCodeContext = (textarea: ChatTextareaApi) => {
|
||||
const [extraContext, setExtraContext] = useState<MessageExtraContext | null>(
|
||||
null
|
||||
);
|
||||
|
||||
@@ -15,7 +15,7 @@ async def main():
|
||||
model_url = "http://127.0.0.1:6900"
|
||||
responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
|
||||
url= f"{model_url}/embedding",
|
||||
json= {"content": str(0)*1024}
|
||||
json= {"content": "a "*1022}
|
||||
) for i in range(n)])
|
||||
|
||||
for response in responses:
|
||||
|
||||
@@ -6721,8 +6721,8 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
||||
ggml_vec_dot_t const kq_vec_dot = ggml_get_type_traits_cpu(k->type)->vec_dot;
|
||||
ggml_to_float_t const v_to_float = ggml_get_type_traits(v->type)->to_float;
|
||||
|
||||
GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type");
|
||||
GGML_ASSERT(v_to_float && "fattn: unsupported V-type");
|
||||
GGML_ASSERT(( q_to_vec_dot) && "fattn: unsupported K-type");
|
||||
GGML_ASSERT((v->type == GGML_TYPE_F32 || v_to_float ) && "fattn: unsupported V-type");
|
||||
|
||||
// loop over n_batch and n_head
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
@@ -6818,10 +6818,14 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
||||
vs = expf(s - M);
|
||||
}
|
||||
|
||||
v_to_float(v_data, V32, DV);
|
||||
|
||||
// V += v*expf(s - M)
|
||||
ggml_vec_mad_f32(DV, VKQ32, V32, vs);
|
||||
if (v_to_float) {
|
||||
v_to_float(v_data, V32, DV);
|
||||
ggml_vec_mad_f32(DV, VKQ32, V32, vs);
|
||||
} else {
|
||||
// V is F32
|
||||
ggml_vec_mad_f32(DV, VKQ32, (const float *) v_data, vs);
|
||||
}
|
||||
}
|
||||
|
||||
S = S*ms + vs; // scale and increment sum with partial sum
|
||||
|
||||
@@ -10,6 +10,13 @@ static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
nv_bfloat16 * dsti = (nv_bfloat16 *) cdsti;
|
||||
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
|
||||
const float * xi = (const float *) cxi;
|
||||
half * dsti = (half *) cdsti;
|
||||
@@ -386,6 +393,16 @@ static void ggml_cpy_f32_f32_cuda(
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_bf16_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_f16_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
@@ -581,6 +598,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
|
||||
ggml_cpy_f32_bf16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||
@@ -634,6 +653,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
||||
return nullptr;
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f32_f32>;
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f32_bf16>;
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||
|
||||
@@ -3079,6 +3079,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_BF16) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1345,6 +1345,11 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_OP_ARANGE:
|
||||
return true;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
if (op->src[0]->ne[0] == 32) {
|
||||
// head size == 32 (e.g. bert-bge-small)
|
||||
// TODO: not sure if it is worth adding kernels for this size
|
||||
return false;
|
||||
}
|
||||
if (op->src[1]->type != op->src[2]->type) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -415,6 +415,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
unsigned number;
|
||||
cl_device_type type;
|
||||
char name[128];
|
||||
char version[128];
|
||||
};
|
||||
|
||||
enum { NPLAT = 16, NDEV = 16 };
|
||||
@@ -455,6 +456,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
d->platform = p;
|
||||
CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
|
||||
CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
|
||||
CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_VERSION, sizeof(d->version), &d->version, NULL));
|
||||
|
||||
if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
|
||||
p->default_device = d;
|
||||
@@ -547,7 +549,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
|
||||
GGML_LOG_INFO("ggml_opencl: selecting device: '%s'\n", default_device->name);
|
||||
GGML_LOG_INFO("ggml_opencl: selecting device: '%s (%s)'\n", default_device->name, default_device->version);
|
||||
if (default_device->type != CL_DEVICE_TYPE_GPU) {
|
||||
GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
|
||||
}
|
||||
@@ -556,9 +558,15 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
dev_ctx->device = default_device->id;
|
||||
backend_ctx->device = default_device->id;
|
||||
|
||||
if (strstr(default_device->name, "Adreno")) {
|
||||
if (strstr(default_device->name, "Adreno") ||
|
||||
strstr(default_device->name, "Qualcomm") ||
|
||||
strstr(default_device->version, "Adreno")) {
|
||||
backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
|
||||
backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
|
||||
// Usually device version contains the detailed device name
|
||||
backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->version);
|
||||
if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
|
||||
backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
|
||||
}
|
||||
|
||||
// Use wave size of 64 for all Adreno GPUs.
|
||||
backend_ctx->adreno_wave_size = 64;
|
||||
|
||||
@@ -372,9 +372,14 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
|
||||
// Note: Use host buffer to save the data from mmap(), then copy to device. It's workaround for mmap() issue on PVC GPU.
|
||||
// This function will be called during load model from disk. Use memory buffer replace dynamic won't save more time and brings potential memory leak risk here.
|
||||
char* host_buf = (char*)malloc(size);
|
||||
memcpy(host_buf, data, size);
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, data, size)
|
||||
CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size)
|
||||
.wait()));
|
||||
free(host_buf);
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
|
||||
@@ -116,6 +116,7 @@ class Keys:
|
||||
RESIDUAL_SCALE = "{arch}.residual_scale"
|
||||
EMBEDDING_SCALE = "{arch}.embedding_scale"
|
||||
TOKEN_SHIFT_COUNT = "{arch}.token_shift_count"
|
||||
INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "{arch}.attention.head_count"
|
||||
@@ -227,6 +228,7 @@ class GGUFType:
|
||||
|
||||
class MODEL_ARCH(IntEnum):
|
||||
LLAMA = auto()
|
||||
LLAMA4 = auto()
|
||||
DECI = auto()
|
||||
FALCON = auto()
|
||||
BAICHUAN = auto()
|
||||
@@ -431,6 +433,7 @@ class MODEL_TENSOR(IntEnum):
|
||||
|
||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.LLAMA: "llama",
|
||||
MODEL_ARCH.LLAMA4: "llama4",
|
||||
MODEL_ARCH.DECI: "deci",
|
||||
MODEL_ARCH.FALCON: "falcon",
|
||||
MODEL_ARCH.BAICHUAN: "baichuan",
|
||||
@@ -654,6 +657,29 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
MODEL_ARCH.LLAMA4: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP,
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||
],
|
||||
MODEL_ARCH.DECI: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
|
||||
@@ -746,6 +746,9 @@ class GGUFWriter:
|
||||
def add_token_shift_count(self, count: int) -> None:
|
||||
self.add_uint32(Keys.LLM.TOKEN_SHIFT_COUNT.format(arch=self.arch), count)
|
||||
|
||||
def add_interleave_moe_layer_step(self, value: int) -> None:
|
||||
self.add_uint32(Keys.LLM.INTERLEAVE_MOE_LAYER_STEP.format(arch=self.arch), value)
|
||||
|
||||
def add_layer_norm_eps(self, value: float) -> None:
|
||||
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
|
||||
|
||||
|
||||
@@ -139,6 +139,16 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||
|
||||
if isinstance(res, cls._tensor_type):
|
||||
return cls(meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn)
|
||||
elif isinstance(res, tuple) and all(isinstance(t, cls._tensor_type) for t in res):
|
||||
# share the evaluation between lazy tuple elements
|
||||
shared_args: list = [args, None]
|
||||
|
||||
def eager_tuple_element(a: list[Any], i: int = 0, /, **kw) -> LazyBase:
|
||||
assert len(a) == 2
|
||||
if a[1] is None:
|
||||
a[1] = fn(*a[0], **kw)
|
||||
return a[1][i]
|
||||
return tuple(cls(meta=cls.eager_to_meta(res[i]), args=(shared_args, i), kwargs=kwargs, func=eager_tuple_element) for i in range(len(res)))
|
||||
else:
|
||||
del res # not needed
|
||||
# non-tensor return likely relies on the contents of the args
|
||||
|
||||
@@ -110,6 +110,7 @@ extern "C" {
|
||||
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
|
||||
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
|
||||
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
||||
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
||||
};
|
||||
|
||||
enum llama_rope_type {
|
||||
|
||||
112
models/ggml-vocab-llama4.gguf.inp
Normal file
112
models/ggml-vocab-llama4.gguf.inp
Normal file
@@ -0,0 +1,112 @@
|
||||
ied 4 ½ months
|
||||
__ggml_vocab_test__
|
||||
Führer
|
||||
__ggml_vocab_test__
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
|
||||
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
__ggml_vocab_test__
|
||||
Hello world
|
||||
__ggml_vocab_test__
|
||||
Hello world
|
||||
__ggml_vocab_test__
|
||||
Hello World
|
||||
__ggml_vocab_test__
|
||||
Hello World
|
||||
__ggml_vocab_test__
|
||||
Hello World!
|
||||
__ggml_vocab_test__
|
||||
Hello, world!
|
||||
__ggml_vocab_test__
|
||||
Hello, world!
|
||||
__ggml_vocab_test__
|
||||
this is 🦙.cpp
|
||||
__ggml_vocab_test__
|
||||
w048 7tuijk dsdfhu
|
||||
__ggml_vocab_test__
|
||||
нещо на Български
|
||||
__ggml_vocab_test__
|
||||
កាន់តែពិសេសអាចខលចេញ
|
||||
__ggml_vocab_test__
|
||||
🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
Hello
|
||||
Hello
|
||||
__ggml_vocab_test__
|
||||
(
|
||||
__ggml_vocab_test__
|
||||
|
||||
=
|
||||
__ggml_vocab_test__
|
||||
' era
|
||||
__ggml_vocab_test__
|
||||
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
|
||||
__ggml_vocab_test__
|
||||
!!!!!!
|
||||
__ggml_vocab_test__
|
||||
3
|
||||
__ggml_vocab_test__
|
||||
33
|
||||
__ggml_vocab_test__
|
||||
333
|
||||
__ggml_vocab_test__
|
||||
3333
|
||||
__ggml_vocab_test__
|
||||
33333
|
||||
__ggml_vocab_test__
|
||||
333333
|
||||
__ggml_vocab_test__
|
||||
3333333
|
||||
__ggml_vocab_test__
|
||||
33333333
|
||||
__ggml_vocab_test__
|
||||
333333333
|
||||
__ggml_vocab_test__
|
||||
Cửa Việt
|
||||
__ggml_vocab_test__
|
||||
discards
|
||||
__ggml_vocab_test__
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
|
||||
__ggml_vocab_test__
|
||||
46
models/ggml-vocab-llama4.gguf.out
Normal file
46
models/ggml-vocab-llama4.gguf.out
Normal file
@@ -0,0 +1,46 @@
|
||||
1190 220 32 220 18215 7112
|
||||
50 16800 258
|
||||
|
||||
220
|
||||
256
|
||||
277
|
||||
197
|
||||
198
|
||||
368
|
||||
2946
|
||||
3271
|
||||
19873 3817
|
||||
39715 3817
|
||||
19873 7353
|
||||
39715 7353
|
||||
39715 7353 13
|
||||
19873 24 3817 13
|
||||
39715 24 3817 13
|
||||
544 373 9522 112 247 26 36315
|
||||
99 39923 220 35 9607 21498 21470 3679 9433
|
||||
1595 7653 633 79829 34051 1636
|
||||
8755 102595 115960 21125 148305 96819 102816 39048 14105 22528 160234
|
||||
114590 222 330 14879 21 51358 127 12817 93293 117 24204 330 68239 881 120327 170428 21 89101 330 7384 88230 511 947 1492 3742 7233 21
|
||||
19873
|
||||
39715
|
||||
220 39715
|
||||
256 39715
|
||||
277 39715
|
||||
277 39715 198 277 39715
|
||||
330
|
||||
198 319
|
||||
19 7359
|
||||
19873 24 386 87799 13 2403 583 650 51358 223 1663 155736 1522 42056 7544 13336 28785 29 4412 20645
|
||||
17931 4959
|
||||
31
|
||||
1922
|
||||
12325
|
||||
12325 31
|
||||
12325 1922
|
||||
12325 12325
|
||||
12325 12325 31
|
||||
12325 12325 1922
|
||||
12325 12325 12325
|
||||
47 19811 12077
|
||||
3260 3579
|
||||
198 7283 51499 191231 20192 3271 3322 9287 2143 17860 114590 222 330 14879 21 51358 127 12817 93293 117 24204 330 68239 881 120327 170428 21 89101 9522 112 247 172394 247 220 31 220 1922 220 12325 220 12325 31 220 12325 1922 220 12325 12325 220 12325 12325 31 220 12325 12325 1922 220 31 26 31 220 31 396 31 220 31 1043 31 117131 102595 115960 21125 148305 96819 102816 80883 223 1663 155736 1522 42056 7544 13336 28785 29 4412 20645 79745 150278 117079 633 79829 34051 1636 25611 41990 109428 1488 91054 24072 17931 4959 29795 9296 16517 1806 481 96 1386 36633 1609 24 481 1109 650 5074 43 481 57 702 5074 27088 2170 536 24 481 48 650 1933 1696 30262 43 1665 19 32818 262 27236 56
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_LLAMA, "llama" },
|
||||
{ LLM_ARCH_LLAMA4, "llama4" },
|
||||
{ LLM_ARCH_DECI, "deci" },
|
||||
{ LLM_ARCH_FALCON, "falcon" },
|
||||
{ LLM_ARCH_GROK, "grok" },
|
||||
@@ -114,6 +115,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
|
||||
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
|
||||
{ LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
|
||||
{ LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
|
||||
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
||||
@@ -233,6 +235,35 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_LLAMA4,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
|
||||
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_DECI,
|
||||
{
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
|
||||
enum llm_arch {
|
||||
LLM_ARCH_LLAMA,
|
||||
LLM_ARCH_LLAMA4,
|
||||
LLM_ARCH_DECI,
|
||||
LLM_ARCH_FALCON,
|
||||
LLM_ARCH_BAICHUAN,
|
||||
@@ -118,6 +119,7 @@ enum llm_kv {
|
||||
LLM_KV_RESIDUAL_SCALE,
|
||||
LLM_KV_EMBEDDING_SCALE,
|
||||
LLM_KV_TOKEN_SHIFT_COUNT,
|
||||
LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
|
||||
|
||||
LLM_KV_ATTENTION_HEAD_COUNT,
|
||||
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
||||
|
||||
@@ -61,6 +61,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
|
||||
{ "yandex", LLM_CHAT_TEMPLATE_YANDEX },
|
||||
{ "bailing", LLM_CHAT_TEMPLATE_BAILING },
|
||||
{ "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
|
||||
};
|
||||
|
||||
llm_chat_template llm_chat_template_from_str(const std::string & name) {
|
||||
@@ -174,6 +175,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
return LLM_CHAT_TEMPLATE_YANDEX;
|
||||
} else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("'HUMAN'")) {
|
||||
return LLM_CHAT_TEMPLATE_BAILING;
|
||||
} else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
|
||||
return LLM_CHAT_TEMPLATE_LLAMA4;
|
||||
}
|
||||
return LLM_CHAT_TEMPLATE_UNKNOWN;
|
||||
}
|
||||
@@ -608,7 +611,16 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<role>ASSISTANT</role>";
|
||||
}
|
||||
} else {
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA4) {
|
||||
// Llama 4
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
ss << "<|header_start|>" << role << "<|header_end|>\n\n" << trim(message->content) << "<|eot|>";
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|header_start|>assistant<|header_end|>\n\n";
|
||||
}
|
||||
} else {
|
||||
// template not supported
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -40,6 +40,7 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_MEGREZ,
|
||||
LLM_CHAT_TEMPLATE_YANDEX,
|
||||
LLM_CHAT_TEMPLATE_BAILING,
|
||||
LLM_CHAT_TEMPLATE_LLAMA4,
|
||||
LLM_CHAT_TEMPLATE_UNKNOWN,
|
||||
};
|
||||
|
||||
|
||||
@@ -59,6 +59,22 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
|
||||
if (ubatch->pos && attn_scale) {
|
||||
const int64_t n_tokens = ubatch->n_tokens;
|
||||
|
||||
std::vector<float> attn_scale_data(n_tokens, 0.0f);
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
const float pos = ubatch->pos[i];
|
||||
attn_scale_data[i] = std::log(
|
||||
std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0
|
||||
) * f_attn_temp_scale + 1.0;
|
||||
}
|
||||
|
||||
ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
|
||||
}
|
||||
}
|
||||
|
||||
void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
|
||||
if (pos_bucket) {
|
||||
const int64_t n_tokens = ubatch->n_tokens;
|
||||
@@ -458,9 +474,17 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
|
||||
}
|
||||
|
||||
// may need to cut off old tokens for sliding window
|
||||
// TODO @ngxson : we are currently re-using the swa logic to store the chunked mask, we should rename SWA to something more generic like "aux mask"
|
||||
if (data_swa) {
|
||||
if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
|
||||
f = -INFINITY;
|
||||
if (hparams.n_attn_chunk) {
|
||||
llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
|
||||
if (kv_self->cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
|
||||
f = -INFINITY;
|
||||
}
|
||||
} else {
|
||||
if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
|
||||
f = -INFINITY;
|
||||
}
|
||||
}
|
||||
data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
|
||||
}
|
||||
@@ -812,8 +836,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
float w_scale,
|
||||
llama_expert_gating_func_type gating_op,
|
||||
int il) const {
|
||||
int64_t n_embd = cur->ne[0];
|
||||
int64_t n_tokens = cur->ne[1];
|
||||
const int64_t n_embd = cur->ne[0];
|
||||
const int64_t n_tokens = cur->ne[1];
|
||||
const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
|
||||
|
||||
ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
|
||||
cb(logits, "ffn_moe_logits", il);
|
||||
@@ -841,6 +866,12 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
cb(selection_probs, "ffn_moe_probs_biased", il);
|
||||
}
|
||||
|
||||
// llama4 doesn't have exp_probs_b, and sigmoid is only used after top_k
|
||||
// see: https://github.com/meta-llama/llama-models/blob/699a02993512fb36936b1b0741e13c06790bcf98/models/llama4/moe.py#L183-L198
|
||||
if (arch == LLM_ARCH_LLAMA4) {
|
||||
selection_probs = logits;
|
||||
}
|
||||
|
||||
// select experts
|
||||
ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
|
||||
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
||||
@@ -867,6 +898,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
}
|
||||
|
||||
cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
|
||||
|
||||
if (weight_before_ffn) {
|
||||
// TODO: this is a workaround as we don't yet have a repeat op that takes custom dim (ggml_repeat_4d)
|
||||
ggml_tensor * repeated = ggml_new_tensor_3d(ctx0, cur->type, n_embd, n_expert_used, n_tokens);
|
||||
repeated = ggml_repeat(ctx0, cur, repeated); // [n_embd, n_expert_used, n_tokens]
|
||||
cur = ggml_mul(ctx0, repeated, weights);
|
||||
cb(cur, "ffn_moe_weighted", il);
|
||||
}
|
||||
|
||||
ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(up, "ffn_moe_up", il);
|
||||
|
||||
@@ -894,7 +934,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
||||
cb(experts, "ffn_moe_down", il);
|
||||
|
||||
experts = ggml_mul(ctx0, experts, weights);
|
||||
if (!weight_before_ffn) {
|
||||
experts = ggml_mul(ctx0, experts, weights);
|
||||
cb(cur, "ffn_moe_weighted", il);
|
||||
}
|
||||
|
||||
// aggregate experts
|
||||
ggml_tensor * moe_out = nullptr;
|
||||
@@ -914,6 +957,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
moe_out = ggml_cont(ctx0, moe_out);
|
||||
}
|
||||
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
||||
return moe_out;
|
||||
}
|
||||
|
||||
@@ -981,6 +1026,19 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
|
||||
return cur;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
|
||||
auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
|
||||
|
||||
auto & cur = inp->attn_scale;
|
||||
|
||||
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
|
||||
ggml_set_input(cur);
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_out_ids() const {
|
||||
auto inp = std::make_unique<llm_graph_input_out_ids>(hparams, cparams, n_outputs);
|
||||
|
||||
@@ -1157,6 +1215,15 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
v = ggml_transpose(ctx0, v);
|
||||
}
|
||||
|
||||
// this can happen when KV cache is not used (e.g. an embedding model with non-causal attn)
|
||||
if (k->type == GGML_TYPE_F32) {
|
||||
k = ggml_cast(ctx0, k, GGML_TYPE_F16);
|
||||
}
|
||||
|
||||
if (v->type == GGML_TYPE_F32) {
|
||||
v = ggml_cast(ctx0, v, GGML_TYPE_F16);
|
||||
}
|
||||
|
||||
cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
|
||||
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
|
||||
|
||||
|
||||
@@ -100,6 +100,23 @@ public:
|
||||
const int64_t n_pos_per_token = 1;
|
||||
};
|
||||
|
||||
// temperature tuning, used by llama4
|
||||
class llm_graph_input_attn_temp : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
|
||||
: n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
|
||||
virtual ~llm_graph_input_attn_temp() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
|
||||
|
||||
const int64_t n_pos_per_token = 1;
|
||||
|
||||
const uint32_t n_attn_temp_floor_scale;
|
||||
const float f_attn_temp_scale;
|
||||
};
|
||||
|
||||
class llm_graph_input_pos_bucket : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {}
|
||||
@@ -470,6 +487,7 @@ struct llm_graph_context {
|
||||
|
||||
ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
|
||||
ggml_tensor * build_inp_pos() const;
|
||||
ggml_tensor * build_inp_attn_scale() const;
|
||||
ggml_tensor * build_inp_out_ids() const;
|
||||
ggml_tensor * build_inp_mean() const;
|
||||
ggml_tensor * build_inp_cls() const;
|
||||
|
||||
@@ -112,6 +112,14 @@ struct llama_hparams {
|
||||
bool use_alibi = false;
|
||||
bool attn_soft_cap = false;
|
||||
|
||||
uint32_t n_moe_layer_step = 0;
|
||||
bool use_kq_norm = true;
|
||||
uint32_t n_attn_chunk = 0;
|
||||
// values below seems to be fixed on llama4
|
||||
uint32_t n_no_rope_layer_step = 4;
|
||||
uint32_t n_attn_temp_floor_scale = 8192;
|
||||
float f_attn_temp_scale = 0.1;
|
||||
|
||||
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/8141
|
||||
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
|
||||
|
||||
@@ -90,6 +90,8 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_57B_A14B: return "57B.A14B";
|
||||
case LLM_TYPE_27B: return "27B";
|
||||
case LLM_TYPE_290B: return "290B";
|
||||
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
|
||||
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
|
||||
default: return "?B";
|
||||
}
|
||||
}
|
||||
@@ -550,6 +552,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_LLAMA4:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
|
||||
hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
|
||||
hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
|
||||
hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later
|
||||
|
||||
switch (hparams.n_expert) {
|
||||
case 16: type = LLM_TYPE_17B_16E; break;
|
||||
case 128: type = LLM_TYPE_17B_128E; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
if (type == LLM_TYPE_17B_128E) {
|
||||
hparams.use_kq_norm = false;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DECI:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
@@ -1690,6 +1711,56 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_LLAMA4:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
// output
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
// if output is NULL, init from the input tok embed
|
||||
if (output == NULL) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
|
||||
|
||||
auto & layer = layers[i];
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
||||
|
||||
if (is_moe_layer) {
|
||||
int n_ff_exp = hparams.n_ff_exp;
|
||||
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
||||
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
|
||||
|
||||
// Shared expert
|
||||
const int64_t n_ff_shexp = n_ff_exp;
|
||||
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
|
||||
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
|
||||
} else {
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DECI:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
@@ -4203,12 +4274,22 @@ struct llm_build_llama : public llm_graph_context {
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// temperature tuning
|
||||
ggml_tensor * inp_attn_scale = nullptr;
|
||||
if (arch == LLM_ARCH_LLAMA4) {
|
||||
inp_attn_scale = build_inp_attn_scale();
|
||||
}
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
bool use_rope = arch == LLM_ARCH_LLAMA4
|
||||
? (il + 1) % hparams.n_no_rope_layer_step != 0
|
||||
: true;
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
@@ -4246,25 +4327,38 @@ struct llm_build_llama : public llm_graph_context {
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
if (use_rope) {
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
} else if (inp_attn_scale) {
|
||||
Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
|
||||
}
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
|
||||
// Llama4TextL2Norm
|
||||
Qcur = ggml_rms_norm(ctx0, Qcur, 1e-6);
|
||||
Kcur = ggml_rms_norm(ctx0, Kcur, 1e-6);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
cb(Kcur, "Kcur_normed", il);
|
||||
}
|
||||
|
||||
cur = build_attn(inp_attn, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
@@ -4282,7 +4376,7 @@ struct llm_build_llama : public llm_graph_context {
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
// feed-forward network (non-MoE)
|
||||
if (model.layers[il].ffn_gate_inp == nullptr) {
|
||||
|
||||
cur = build_norm(ffn_inp,
|
||||
@@ -4297,6 +4391,38 @@ struct llm_build_llama : public llm_graph_context {
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
} else if (arch == LLM_ARCH_LLAMA4) {
|
||||
// llama4 MoE
|
||||
ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, false,
|
||||
false, 0.0,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
|
||||
il);
|
||||
|
||||
// Shared experts
|
||||
ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
|
||||
model.layers[il].ffn_up_shexp, NULL, NULL,
|
||||
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
||||
model.layers[il].ffn_down_shexp, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(shexp_out, "ffn_moe_shexp", il);
|
||||
|
||||
cur = ggml_add(ctx0, moe_out, shexp_out);
|
||||
cb(cur, "ffn_moe_out_merged", il);
|
||||
|
||||
} else {
|
||||
// MoE branch
|
||||
cur = build_norm(ffn_inp,
|
||||
@@ -12091,6 +12217,7 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
|
||||
switch (arch) {
|
||||
case LLM_ARCH_LLAMA:
|
||||
case LLM_ARCH_LLAMA4:
|
||||
case LLM_ARCH_MINICPM:
|
||||
case LLM_ARCH_GRANITE:
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
@@ -12440,6 +12567,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
|
||||
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
||||
case LLM_ARCH_LLAMA:
|
||||
case LLM_ARCH_LLAMA4:
|
||||
case LLM_ARCH_DECI:
|
||||
case LLM_ARCH_BAICHUAN:
|
||||
case LLM_ARCH_STARCODER:
|
||||
|
||||
@@ -86,6 +86,8 @@ enum llm_type {
|
||||
LLM_TYPE_57B_A14B,
|
||||
LLM_TYPE_27B,
|
||||
LLM_TYPE_290B,
|
||||
LLM_TYPE_17B_16E, // llama4 Scout
|
||||
LLM_TYPE_17B_128E, // llama4 Maverick
|
||||
};
|
||||
|
||||
struct llama_layer_posnet {
|
||||
|
||||
@@ -1616,7 +1616,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
tokenizer_pre == "megrez") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
|
||||
} else if (
|
||||
tokenizer_pre == "gpt-4o") {
|
||||
tokenizer_pre == "gpt-4o" ||
|
||||
tokenizer_pre == "llama4") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
|
||||
Reference in New Issue
Block a user