mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-02 23:24:06 +00:00
Compare commits
14 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a68247439b | ||
|
|
26b79b6cb3 | ||
|
|
1e8659e65a | ||
|
|
a3c30846e4 | ||
|
|
1701d4c54f | ||
|
|
bef8176387 | ||
|
|
34b7c0439e | ||
|
|
f3101a8cc6 | ||
|
|
1c49c70d07 | ||
|
|
a8ea03d8ad | ||
|
|
05f6ac6283 | ||
|
|
bc583e3c63 | ||
|
|
72b090da2c | ||
|
|
7fe03e7446 |
@@ -432,6 +432,9 @@ class ModelBase:
|
||||
if "llm_config" in config:
|
||||
# rename for InternVL
|
||||
config["text_config"] = config["llm_config"]
|
||||
if "thinker_config" in config:
|
||||
# rename for Qwen2.5-Omni
|
||||
config["text_config"] = config["thinker_config"]["text_config"]
|
||||
return config
|
||||
|
||||
@classmethod
|
||||
@@ -1121,18 +1124,21 @@ class MmprojModel(ModelBase):
|
||||
preprocessor_config: dict[str, Any]
|
||||
global_config: dict[str, Any]
|
||||
|
||||
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
|
||||
|
||||
has_vision_encoder: bool = True # by default
|
||||
has_audio_encoder: bool = False
|
||||
|
||||
# for models having multiple encoders, we need to separate their hparams
|
||||
hparams_vision: dict[str, Any] | None = None
|
||||
hparams_audio: dict[str, Any] | None = None
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
|
||||
raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
|
||||
|
||||
if self.has_vision_encoder and self.has_audio_encoder:
|
||||
raise NotImplementedError("both vision + audio not supported yet")
|
||||
|
||||
# get n_embd of the text model
|
||||
if "text_config" not in self.hparams:
|
||||
self.hparams["text_config"] = {}
|
||||
@@ -1143,22 +1149,32 @@ class MmprojModel(ModelBase):
|
||||
assert self.n_embd_text > 0, "n_embd not found in hparams"
|
||||
|
||||
# move vision config to the top level, while preserving the original hparams in global_config
|
||||
self.global_config = self.hparams
|
||||
import copy
|
||||
self.global_config = copy.deepcopy(self.hparams)
|
||||
self.hparams_vision = self.get_vision_config()
|
||||
self.hparams_audio = self.get_audio_config()
|
||||
|
||||
if "vision_config" in self.hparams:
|
||||
self.hparams = self.hparams["vision_config"]
|
||||
elif "audio_config" in self.hparams:
|
||||
self.hparams = self.hparams["audio_config"]
|
||||
else:
|
||||
if self.hparams_vision is None and self.hparams_audio is None:
|
||||
raise ValueError("vision_config / audio_config not found in hparams")
|
||||
|
||||
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
|
||||
# for compat with vision-only models
|
||||
self.hparams = self.hparams_vision or self.hparams_audio or self.hparams
|
||||
|
||||
# TODO @ngxson : this is a hack to support both vision and audio encoders
|
||||
have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
|
||||
self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
|
||||
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
|
||||
|
||||
# load preprocessor config
|
||||
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
|
||||
self.preprocessor_config = json.load(f)
|
||||
|
||||
def get_vision_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config.get("vision_config")
|
||||
|
||||
def get_audio_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config.get("audio_config")
|
||||
|
||||
def set_type(self):
|
||||
self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
|
||||
|
||||
@@ -1170,26 +1186,26 @@ class MmprojModel(ModelBase):
|
||||
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
|
||||
|
||||
# vision config
|
||||
self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
|
||||
self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
|
||||
self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
|
||||
self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
|
||||
self.gguf_writer.add_vision_block_count(self.block_count)
|
||||
self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
|
||||
self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
|
||||
self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
|
||||
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
|
||||
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
|
||||
self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
|
||||
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
|
||||
|
||||
# preprocessor config
|
||||
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
|
||||
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
|
||||
|
||||
elif self.has_audio_encoder:
|
||||
if self.has_audio_encoder:
|
||||
self.gguf_writer.add_clip_has_audio_encoder(True)
|
||||
self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
|
||||
|
||||
# audio config
|
||||
self.gguf_writer.add_audio_embedding_length(self.find_hparam(["hidden_size"]))
|
||||
self.gguf_writer.add_audio_feed_forward_length(self.find_hparam(["intermediate_size"]))
|
||||
self.gguf_writer.add_audio_block_count(self.block_count)
|
||||
self.gguf_writer.add_audio_head_count(self.find_hparam(["num_attention_heads"]))
|
||||
self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"]))
|
||||
self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"]))
|
||||
self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
|
||||
self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
|
||||
|
||||
else:
|
||||
raise ValueError("MmprojModel must have either vision or audio encoder")
|
||||
@@ -1197,6 +1213,22 @@ class MmprojModel(ModelBase):
|
||||
def write_vocab(self):
|
||||
raise ValueError("MmprojModel does not support vocab writing")
|
||||
|
||||
def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any:
|
||||
assert self.hparams_vision is not None
|
||||
return self._find_param(self.hparams_vision, keys, optional)
|
||||
|
||||
def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any:
|
||||
assert self.hparams_audio is not None
|
||||
return self._find_param(self.hparams_audio, keys, optional)
|
||||
|
||||
def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
|
||||
key = next((k for k in keys if k in obj), None)
|
||||
if key is not None:
|
||||
return obj[key]
|
||||
if optional:
|
||||
return None
|
||||
raise KeyError(f"could not find any of: {keys}")
|
||||
|
||||
|
||||
@ModelBase.register("GPTNeoXForCausalLM")
|
||||
class GPTNeoXModel(TextModel):
|
||||
@@ -2137,6 +2169,9 @@ class Llama4VisionModel(MmprojModel):
|
||||
# process vision tensors
|
||||
if "positional_embedding_vlm" in name and ".weight" not in name:
|
||||
name += ".weight"
|
||||
if "multi_modal_projector.linear_1" in name:
|
||||
# despite the name with number postfix, this is a single fully connected layer
|
||||
return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
return []
|
||||
|
||||
@@ -2674,7 +2709,12 @@ class Qwen2Model(TextModel):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
|
||||
@ModelBase.register(
|
||||
"Qwen2VLModel",
|
||||
"Qwen2VLForConditionalGeneration",
|
||||
"Qwen2_5_VLForConditionalGeneration",
|
||||
"Qwen2_5OmniModel",
|
||||
)
|
||||
class Qwen2VLModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN2VL
|
||||
|
||||
@@ -2692,8 +2732,11 @@ class Qwen2VLModel(TextModel):
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
if name.startswith("visual."):
|
||||
# skip visual tensors
|
||||
if name.startswith("thinker."):
|
||||
name = name.replace("thinker.", "")
|
||||
if name.startswith("visual") or name.startswith("audio") or \
|
||||
name.startswith("talker") or name.startswith("token2wav"):
|
||||
# skip multimodal tensors
|
||||
return []
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
@@ -2702,21 +2745,27 @@ class Qwen2VLModel(TextModel):
|
||||
class Qwen2VLVisionModel(MmprojModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.hparams["image_size"] = self.hparams.get("image_size", 560)
|
||||
assert self.hparams_vision is not None
|
||||
self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
|
||||
# rename config.json values
|
||||
self.hparams["num_attention_heads"] = self.hparams.get("num_heads")
|
||||
self.hparams["num_hidden_layers"] = self.hparams.get("depth")
|
||||
if "embed_dim" in self.hparams: # qwen2vl
|
||||
self.hparams["intermediate_size"] = self.hparams.get("hidden_size")
|
||||
self.hparams["hidden_size"] = self.hparams.get("embed_dim")
|
||||
self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
|
||||
self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
|
||||
if "embed_dim" in self.hparams_vision: # qwen2vl
|
||||
self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size")
|
||||
self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
if self.global_config['model_type'] == 'qwen2_vl':
|
||||
assert self.hparams_vision is not None
|
||||
hparams = self.hparams_vision
|
||||
model_type = self.global_config['model_type']
|
||||
if model_type == 'qwen2_vl':
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
|
||||
elif self.global_config['model_type'] == 'qwen2_5_vl':
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
|
||||
elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni':
|
||||
if model_type == 'qwen2_5_omni':
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
|
||||
else:
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
|
||||
self.gguf_writer.add_vision_use_silu(True)
|
||||
# find n_wa_pattern (window attention pattern)
|
||||
fullatt_block_indexes = hparams.get("fullatt_block_indexes")
|
||||
@@ -2774,6 +2823,66 @@ class Qwen2VLVisionModel(MmprojModel):
|
||||
return [] # skip other tensors
|
||||
|
||||
|
||||
@ModelBase.register("Qwen2_5OmniModel")
|
||||
class Qwen25OmniModel(Qwen2VLVisionModel):
|
||||
has_vision_encoder = True
|
||||
has_audio_encoder = True
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_audio is not None
|
||||
self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
|
||||
self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"]
|
||||
self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"]
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
assert self.hparams_audio is not None
|
||||
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
|
||||
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))
|
||||
|
||||
def get_vision_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config["thinker_config"].get("vision_config")
|
||||
|
||||
def get_audio_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config["thinker_config"].get("audio_config")
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
# SinusoidsPositionEmbedding
|
||||
assert self.hparams_audio is not None
|
||||
max_timescale = 10000
|
||||
length = 1500
|
||||
channels = self.hparams_audio["hidden_size"]
|
||||
log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
|
||||
inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
|
||||
scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
|
||||
pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32)
|
||||
yield ("audio_tower.embed_positions.weight", pos_embd)
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
del bid, new_name, n_dims # unused
|
||||
if ".conv" in name and ".weight" in name:
|
||||
return gguf.GGMLQuantizationType.F16
|
||||
return False
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name.startswith("thinker."):
|
||||
name = name.replace("thinker.", "")
|
||||
|
||||
if name.startswith("audio_tower"):
|
||||
# process audio tensors
|
||||
if "conv1.bias" in name or "conv2.bias" in name:
|
||||
# transpose conv1 and conv2 bias
|
||||
data_torch = data_torch.unsqueeze(-1)
|
||||
if "audio_bos_eos_token" in name:
|
||||
# this tensor is left unused in transformers code
|
||||
# https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
|
||||
return []
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("InternVisionModel")
|
||||
class InternVisionModel(MmprojModel):
|
||||
def set_gguf_parameters(self):
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
[chat.h](../common/chat.h) (https://github.com/ggml-org/llama.cpp/pull/9639) adds support for [OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) and is used in:
|
||||
- `llama-server` when started w/ `--jinja` flag
|
||||
- `llama-cli` (WIP: https://github.com/ggml-org/llama.cpp/pull/11556)
|
||||
|
||||
## Universal support w/ Native & Generic handlers
|
||||
|
||||
|
||||
@@ -98,3 +98,12 @@ NOTE: some models may require large context window, for example: `-c 8192`
|
||||
# note: no pre-quantized GGUF this model, as they have very poor result
|
||||
# ref: https://github.com/ggml-org/llama.cpp/pull/13760
|
||||
```
|
||||
|
||||
**Mixed modalities**:
|
||||
|
||||
```sh
|
||||
# Qwen2.5 Omni
|
||||
# Capabilities: audio input, vision input
|
||||
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
|
||||
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
|
||||
```
|
||||
|
||||
@@ -129,6 +129,7 @@ option(GGML_LASX "ggml: enable lasx" ON)
|
||||
option(GGML_LSX "ggml: enable lsx" ON)
|
||||
option(GGML_RVV "ggml: enable rvv" ON)
|
||||
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
|
||||
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
|
||||
option(GGML_VXE "ggml: enable vxe" ON)
|
||||
|
||||
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
|
||||
@@ -176,7 +177,6 @@ option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks"
|
||||
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
||||
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
|
||||
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
|
||||
option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
|
||||
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
|
||||
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
||||
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
|
||||
|
||||
@@ -935,6 +935,15 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// repeat a to the specified shape
|
||||
GGML_API struct ggml_tensor * ggml_repeat_4d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
int64_t ne3);
|
||||
|
||||
// sums repetitions in a into shape of b
|
||||
GGML_API struct ggml_tensor * ggml_repeat_back(
|
||||
struct ggml_context * ctx,
|
||||
|
||||
@@ -30,6 +30,7 @@ string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower
|
||||
string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
|
||||
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
|
||||
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
|
||||
message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}")
|
||||
|
||||
if (CANN_INSTALL_DIR)
|
||||
# Only Support Linux.
|
||||
|
||||
@@ -299,6 +299,25 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_BACKEND_DL)
|
||||
if (GGML_NATIVE)
|
||||
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
|
||||
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
|
||||
endif()
|
||||
|
||||
# The feature detection code is compiled as a separate target so that
|
||||
# it can be built without the architecture flags
|
||||
# Since multiple variants of the CPU backend may be included in the same
|
||||
# build, using set_source_files_properties() to set the arch flags is not possible
|
||||
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
|
||||
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
|
||||
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
|
||||
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
|
||||
endif()
|
||||
elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
|
||||
message(STATUS "PowerPC detected")
|
||||
if (GGML_NATIVE)
|
||||
@@ -338,8 +357,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
|
||||
message(STATUS "RISC-V detected")
|
||||
if (GGML_RVV)
|
||||
if (GGML_RV_ZFH)
|
||||
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d)
|
||||
if (GGML_XTHEADVECTOR)
|
||||
list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
|
||||
elseif (GGML_RV_ZFH)
|
||||
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
|
||||
else()
|
||||
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
|
||||
endif()
|
||||
@@ -477,25 +498,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
||||
|
||||
if (GGML_BACKEND_DL)
|
||||
if (GGML_NATIVE)
|
||||
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
|
||||
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
|
||||
endif()
|
||||
|
||||
# The feature detection code is compiled as a separate target so that
|
||||
# it can be built without the architecture flags
|
||||
# Since multiple variants of the CPU backend may be included in the same
|
||||
# build, using set_source_files_properties() to set the arch flags is not possible
|
||||
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
|
||||
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
|
||||
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
|
||||
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
|
||||
endif()
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
|
||||
endif()
|
||||
|
||||
@@ -1191,7 +1191,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||
}
|
||||
}
|
||||
return;
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined __riscv_v
|
||||
if (__riscv_vlenb() >= QK4_0) {
|
||||
const size_t vl = QK4_0;
|
||||
|
||||
@@ -3783,7 +3783,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
|
||||
}
|
||||
return;
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined __riscv_v
|
||||
if (__riscv_vlenb() >= QK4_0) {
|
||||
const size_t vl = QK4_0;
|
||||
|
||||
|
||||
@@ -320,21 +320,17 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
|
||||
|
||||
#ifdef __wasm_simd128__
|
||||
#include <wasm_simd128.h>
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef __POWER9_VECTOR__
|
||||
#include <altivec.h>
|
||||
#else
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
|
||||
#if !defined(__riscv)
|
||||
#elif defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __riscv_v_intrinsic
|
||||
#include <riscv_vector.h>
|
||||
|
||||
@@ -883,7 +883,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
|
||||
#endif
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined(__riscv_v)
|
||||
|
||||
size_t vl = QK8_0;
|
||||
|
||||
@@ -1221,7 +1221,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
|
||||
#endif
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined(__riscv_v)
|
||||
|
||||
size_t vl = QK8_1;
|
||||
|
||||
@@ -2384,7 +2384,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined(__riscv_v)
|
||||
size_t vl = qk / 2;
|
||||
|
||||
for (; ib < nb; ++ib) {
|
||||
@@ -2774,7 +2774,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(acc) + summs;
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined(__riscv_v)
|
||||
size_t vl = qk / 2;
|
||||
|
||||
for (; ib < nb; ++ib) {
|
||||
@@ -3121,7 +3121,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(acc);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined(__riscv_v)
|
||||
size_t vl;
|
||||
size_t vlenb = __riscv_vlenb();
|
||||
|
||||
@@ -3460,7 +3460,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(acc) + summs;
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined(__riscv_v)
|
||||
size_t vl;
|
||||
size_t vlenb = __riscv_vlenb();
|
||||
|
||||
@@ -3897,7 +3897,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(accum);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#elif defined(__riscv_v)
|
||||
size_t vl = qk;
|
||||
|
||||
for (; ib < nb; ++ib) {
|
||||
@@ -5100,14 +5100,111 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
#elif defined __riscv_xtheadvector
|
||||
|
||||
float sumf = 0;
|
||||
uint8_t atmp[16];
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * q2 = x[i].qs;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * sc = x[i].scales;
|
||||
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
uint8_t *patmp = atmp;
|
||||
int vsums;
|
||||
int tmp;
|
||||
__asm__ __volatile__(
|
||||
"th.vsetvli zero, %[vl16], e8, m1\n\t"
|
||||
"th.vmv.v.x v8, zero\n\t"
|
||||
"th.vlb.v v1, (%[sc])\n\t"
|
||||
"th.vand.vi v0, v1, 0xF\n\t"
|
||||
"th.vsrl.vi v1, v1, 4\n\t"
|
||||
"th.vsb.v v0, (%[scale])\n\t"
|
||||
"th.vwaddu.vx v16, v1, zero\n\t"
|
||||
"th.vsetvli zero, %[vl16], e16, m2\n\t"
|
||||
"th.vlh.v v2, (%[bsums])\n\t"
|
||||
"th.vwmul.vv v4, v16, v2\n\t"
|
||||
"th.vsetvli zero, %[vl16], e32, m4\n\t"
|
||||
"th.vredsum.vs v8, v4, v8\n\t"
|
||||
"th.vmv.x.s %[vsums], v8"
|
||||
: [tmp] "=&r" (tmp), [vsums] "=&r" (vsums)
|
||||
: [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums)
|
||||
, [vl16] "r" (16)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
sumf += dmin * vsums;
|
||||
int isum = 0;
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
__asm__ __volatile__(
|
||||
"th.vsetvli zero, %[vl32], e8, m2\n\t"
|
||||
"th.vlb.v v0, (%[q2])\n\t"
|
||||
"th.vsrl.vi v2, v0, 2\n\t"
|
||||
"th.vsrl.vi v4, v0, 4\n\t"
|
||||
"th.vsrl.vi v6, v0, 6\n\t"
|
||||
"th.vand.vi v0, v0, 0x3\n\t"
|
||||
"th.vand.vi v2, v2, 0x3\n\t"
|
||||
"th.vand.vi v4, v4, 0x3\n\t"
|
||||
"th.vsetvli zero, %[vl128], e8, m8\n\t"
|
||||
"th.vlb.v v8, (%[q8])\n\t"
|
||||
"th.vsetvli zero, %[vl64], e8, m4\n\t"
|
||||
"th.vwmul.vv v16, v0, v8\n\t"
|
||||
"th.vwmul.vv v24, v4, v12\n\t"
|
||||
"th.vsetvli zero, %[vl16], e16, m2\n\t"
|
||||
"th.vmv.v.x v0, zero\n\t"
|
||||
"th.vwredsum.vs v10, v16, v0\n\t"
|
||||
"th.vwredsum.vs v9, v18, v0\n\t"
|
||||
"th.vwredsum.vs v8, v20, v0\n\t"
|
||||
"th.vwredsum.vs v7, v22, v0\n\t"
|
||||
"th.vwredsum.vs v11, v24, v0\n\t"
|
||||
"th.vwredsum.vs v12, v26, v0\n\t"
|
||||
"th.vwredsum.vs v13, v28, v0\n\t"
|
||||
"th.vwredsum.vs v14, v30, v0\n\t"
|
||||
"li %[tmp], 4\n\t"
|
||||
"th.vsetvli zero, %[tmp], e32, m1\n\t"
|
||||
"th.vslideup.vi v10, v9, 1\n\t"
|
||||
"th.vslideup.vi v8, v7, 1\n\t"
|
||||
"th.vslideup.vi v11, v12, 1\n\t"
|
||||
"th.vslideup.vi v13, v14, 1\n\t"
|
||||
"th.vslideup.vi v10, v8, 2\n\t"
|
||||
"th.vslideup.vi v11, v13, 2\n\t"
|
||||
"li %[tmp], 8\n\t"
|
||||
"th.vsetvli zero, %[tmp], e32, m2\n\t"
|
||||
"th.vlbu.v v12, (%[scale])\n\t"
|
||||
"th.vmul.vv v10, v10, v12\n\t"
|
||||
"th.vredsum.vs v0, v10, v0\n\t"
|
||||
"th.vmv.x.s %[tmp], v0\n\t"
|
||||
"add %[isum], %[isum], %[tmp]"
|
||||
: [tmp] "=&r" (tmp), [isum] "+&r" (isum)
|
||||
: [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
|
||||
, [vl16] "r" (16), [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
q2 += 32; q8 += 128; patmp += 8;
|
||||
}
|
||||
|
||||
sumf += dall * isum;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __riscv_v
|
||||
|
||||
float sumf = 0;
|
||||
uint8_t atmp[16];
|
||||
|
||||
const int vector_length = __riscv_vlenb() * 8;
|
||||
float sumf = 0;
|
||||
|
||||
uint8_t temp_01[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
|
||||
uint8_t atmp[16];
|
||||
|
||||
switch (vector_length) {
|
||||
case 256:
|
||||
@@ -6137,14 +6234,141 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
#elif defined __riscv_xtheadvector
|
||||
|
||||
uint32_t aux[3];
|
||||
uint32_t utmp[4];
|
||||
|
||||
const int vector_length = __riscv_vlenb() * 8;
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const uint8_t * restrict q3 = x[i].qs;
|
||||
const uint8_t * restrict qh = x[i].hmask;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
int8_t * scale = (int8_t *)utmp;
|
||||
int tmp;
|
||||
__asm__ __volatile__(
|
||||
"li %[tmp], 12\n\t"
|
||||
"th.vsetvli zero, %[tmp], e8, m1\n\t"
|
||||
"th.vlb.v v0, (%[s6b])\n\t"
|
||||
"th.vmv.v.v v2, v0\n\t"
|
||||
"li %[tmp], 2\n\t"
|
||||
"th.vsetvli zero, %[tmp], e64, m1\n\t"
|
||||
"th.vmv.v.x v9, %[sh]\n\t"\
|
||||
"th.vslidedown.vi v1, v0, 1\n\t"
|
||||
"th.vslide1up.vx v8, v9, zero\n\t" // {0, 0, 4, 4}
|
||||
"th.vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]}
|
||||
"li %[tmp], 4\n\t"
|
||||
"th.vsetvli zero, %[tmp], e32, m1\n\t"
|
||||
"th.vid.v v9\n\t"
|
||||
"th.vmv.x.s %[tmp], v1\n\t"
|
||||
"th.vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6}
|
||||
"th.vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]}
|
||||
"th.vsrl.vv v4, v1, v9\n\t"
|
||||
"th.vsrl.vv v2, v0, v8\n\t"
|
||||
"th.vand.vx v5, v4, %[kmask1]\n\t"
|
||||
"th.vand.vx v3, v2, %[kmask2]\n\t"
|
||||
"th.vsll.vi v6, v5, 4\n\t"
|
||||
"th.vor.vv v7, v6, v3\n\t"
|
||||
"li %[tmp], 16\n\t"
|
||||
"th.vsetvli zero, %[tmp], e8, m1\n\t"
|
||||
"th.vsub.vx v0, v7, %[c]\n\t"
|
||||
"th.vsb.v v0, (%[scale])"
|
||||
: [tmp] "=&r" (tmp)
|
||||
: [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32)
|
||||
, [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
|
||||
uint8_t m = 1;
|
||||
int isum = 0;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
__asm__ __volatile__(
|
||||
// fixme: use v0p7 mask layout directly
|
||||
"th.vsetvli zero, %[vl32], e8, m2\n\t"
|
||||
"th.vlb.v v8, (%[q3])\n\t"
|
||||
"th.vsrl.vi v10, v8, 2\n\t"
|
||||
"th.vsrl.vi v12, v8, 4\n\t"
|
||||
"th.vsrl.vi v14, v8, 6\n\t"
|
||||
"th.vand.vi v8, v8, 3\n\t"
|
||||
"th.vand.vi v10, v10, 3\n\t"
|
||||
"th.vand.vi v12, v12, 3\n\t"
|
||||
"th.vlb.v v2, (%[qh])\n\t"
|
||||
"th.vand.vx v4, v2, %[m]\n\t"
|
||||
"slli %[m], %[m], 1\n\t"
|
||||
"th.vmseq.vx v0, v4, zero\n\t"
|
||||
"th.vadd.vi v8, v8, -4, v0.t\n\t"
|
||||
"th.vand.vx v4, v2, %[m]\n\t"
|
||||
"slli %[m], %[m], 1\n\t"
|
||||
"th.vmseq.vx v0, v4, zero\n\t"
|
||||
"th.vadd.vi v10, v10, -4, v0.t\n\t"
|
||||
"th.vand.vx v4, v2, %[m]\n\t"
|
||||
"slli %[m], %[m], 1\n\t"
|
||||
"th.vmseq.vx v0, v4, zero\n\t"
|
||||
"th.vadd.vi v12, v12, -4, v0.t\n\t"
|
||||
"th.vand.vx v4, v2, %[m]\n\t"
|
||||
"slli %[m], %[m], 1\n\t"
|
||||
"th.vmseq.vx v0, v4, zero\n\t"
|
||||
"th.vadd.vi v14, v14, -4, v0.t\n\t"
|
||||
"th.vsetvli zero, %[vl128], e8, m8\n\t"
|
||||
"th.vlb.v v0, (%[q8])\n\t"
|
||||
"th.vsetvli zero, %[vl64], e8, m4\n\t"
|
||||
"th.vwmul.vv v16, v0, v8\n\t"
|
||||
"th.vwmul.vv v24, v4, v12\n\t"
|
||||
"li %[tmp], 16\n\t"
|
||||
"th.vsetvli zero, %[tmp], e16, m2\n\t"
|
||||
"th.vmv.v.x v0, zero\n\t"
|
||||
"th.vwredsum.vs v10, v16, v0\n\t"
|
||||
"th.vwredsum.vs v9, v18, v0\n\t"
|
||||
"th.vwredsum.vs v8, v20, v0\n\t"
|
||||
"th.vwredsum.vs v7, v22, v0\n\t"
|
||||
"th.vwredsum.vs v11, v24, v0\n\t"
|
||||
"th.vwredsum.vs v12, v26, v0\n\t"
|
||||
"th.vwredsum.vs v13, v28, v0\n\t"
|
||||
"th.vwredsum.vs v14, v30, v0\n\t"
|
||||
"li %[tmp], 4\n\t"
|
||||
"th.vsetvli zero, %[tmp], e32, m1\n\t"
|
||||
"th.vslideup.vi v10, v9, 1\n\t"
|
||||
"th.vslideup.vi v8, v7, 1\n\t"
|
||||
"th.vslideup.vi v11, v12, 1\n\t"
|
||||
"th.vslideup.vi v13, v14, 1\n\t"
|
||||
"th.vslideup.vi v10, v8, 2\n\t"
|
||||
"th.vslideup.vi v11, v13, 2\n\t"
|
||||
"li %[tmp], 8\n\t"
|
||||
"th.vsetvli zero, %[tmp], e32, m2\n\t"
|
||||
"th.vlb.v v12, (%[scale])\n\t"
|
||||
"th.vmul.vv v10, v10, v12\n\t"
|
||||
"th.vredsum.vs v0, v10, v0\n\t"
|
||||
"th.vmv.x.s %[tmp], v0\n\t"
|
||||
"add %[isum], %[isum], %[tmp]"
|
||||
: [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum)
|
||||
: [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
|
||||
, [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
q3 += 32; q8 += 128; scale += 8;
|
||||
}
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
sumf += d * isum;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __riscv_v
|
||||
|
||||
uint32_t utmp[4];
|
||||
float sumf = 0;
|
||||
uint32_t aux[3];
|
||||
const int vector_length = __riscv_vlenb() * 8;
|
||||
|
||||
switch (vector_length) {
|
||||
case 256:
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
@@ -6331,7 +6555,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
"vslideup.vi v13, v14, 1\n\t"
|
||||
"vslideup.vi v10, v8, 2\n\t"
|
||||
"vslideup.vi v11, v13, 2\n\t"
|
||||
"vsetivli zero, 8, e32, m2\n\t"\
|
||||
"vsetivli zero, 8, e32, m2\n\t"
|
||||
"vle8.v v15, (%[scale])\n\t"
|
||||
"vsext.vf4 v12, v15\n\t"
|
||||
"vmul.vv v10, v10, v12\n\t"
|
||||
@@ -7180,14 +7404,130 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
*s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
#elif defined __riscv_xtheadvector
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
const int vector_length = __riscv_vlenb() * 8;
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
int tmp, tmp2, sumi;
|
||||
__asm__ __volatile__(
|
||||
"li %[t1], 12\n\t"
|
||||
"th.vsetvli zero, %[t1], e8, m1\n\t"
|
||||
"th.vlb.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]}
|
||||
"li %[t1], 4\n\t"
|
||||
"th.vsetvli zero, %[t1], e32, m1\n\t"
|
||||
"th.vslidedown.vi v2, v1, 2\n\t"
|
||||
"th.vmv.v.v v3, v2\n\t"
|
||||
"th.vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]}
|
||||
"li %[t1], 2\n\t"
|
||||
"th.vsetvli zero, %[t1], e32, m1\n\t"
|
||||
"th.vmv.v.i v4, 4\n\t"
|
||||
"th.vand.vx v8, v1, %[kmask1]\n\t"
|
||||
"th.vslide1up.vx v5, v4, zero\n\t" // {0, 4}
|
||||
"th.vsrl.vi v6, v1, 6\n\t"
|
||||
"th.vsrl.vv v7, v2, v5\n\t"
|
||||
"th.vand.vx v0, v6, %[kmask3]\n\t"
|
||||
"th.vand.vx v2, v7, %[kmask2]\n\t"
|
||||
"th.vsll.vi v6, v0, 4\n\t"
|
||||
"li %[t2], 8\n\t"
|
||||
"addi %[t1], %[utmp], 4\n\t"
|
||||
"th.vor.vv v1, v6, v2\n\t"
|
||||
"th.vssw.v v8, (%[utmp]), %[t2]\n\t"
|
||||
"th.vssw.v v1, (%[t1]), %[t2]\n\t"
|
||||
"th.vsetvli zero, zero, e32, m2\n\t" // vl == 8
|
||||
"th.vlw.v v2, (%[bsums])\n\t"
|
||||
"th.vsetvli zero, %[t2], e16, m1\n\t"
|
||||
"th.vnsrl.vi v0, v2, 0\n\t"
|
||||
"th.vnsrl.vi v1, v2, 16\n\t"
|
||||
"th.vadd.vv v2, v0, v1\n\t"
|
||||
"th.vlbu.v v4, (%[mins])\n\t"
|
||||
"th.vwmul.vv v6, v4, v2\n\t"
|
||||
"th.vmv.v.x v0, zero\n\t"
|
||||
"th.vsetvli zero, %[t2], e32, m2\n\t"
|
||||
"th.vredsum.vs v0, v6, v0\n\t"
|
||||
"th.vmv.x.s %[sumi], v0"
|
||||
: [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi)
|
||||
: [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
|
||||
, [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1)
|
||||
, [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
sumf -= dmin * sumi;
|
||||
|
||||
const uint8_t * restrict q4 = x[i].qs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
sumi = 0;
|
||||
const uint8_t * scale = scales;
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
int vl128 = 128, vl64 = 64, vl32 = 32;
|
||||
__asm__ __volatile__(
|
||||
"th.vsetvli zero, %[vl128], e8, m8\n\t"
|
||||
"th.vlb.v v8, (%[q8])\n\t"
|
||||
"th.vsetvli zero, %[vl64], e8, m4\n\t"
|
||||
"th.vlb.v v0, (%[q4])\n\t"
|
||||
"th.vsrl.vi v4, v0, 4\n\t"
|
||||
"th.vand.vi v0, v0, 0xF\n\t"
|
||||
"th.vsetvli zero, %[vl32], e8, m2\n\t"
|
||||
"th.vwmul.vv v28, v6, v14\n\t"
|
||||
"th.vwmul.vv v20, v4, v10\n\t"
|
||||
"th.vwmul.vv v24, v2, v12\n\t"
|
||||
"th.vwmul.vv v16, v0, v8\n\t"
|
||||
"li %[tmp], 4\n\t"
|
||||
"th.vsetvli zero, %[tmp], e32, m1\n\t"
|
||||
"th.vlbu.v v1, (%[scale])\n\t"
|
||||
"th.vmv.v.x v0, zero\n\t"
|
||||
"th.vsetvli zero, %[vl32], e16, m4\n\t"
|
||||
"th.vwredsum.vs v6, v24, v0\n\t"
|
||||
"th.vwredsum.vs v7, v28, v0\n\t"
|
||||
"th.vwredsum.vs v4, v16, v0\n\t"
|
||||
"th.vwredsum.vs v5, v20, v0\n\t"
|
||||
"th.vsetvli zero, %[tmp], e32, m1\n\t"
|
||||
"th.vslideup.vi v6, v7, 1\n\t"
|
||||
"th.vslideup.vi v4, v5, 1\n\t"
|
||||
"th.vslideup.vi v4, v6, 2\n\t"
|
||||
"th.vmul.vv v8, v4, v1\n\t"
|
||||
"th.vredsum.vs v0, v8, v0\n\t"
|
||||
"th.vmv.x.s %[tmp], v0\n\t"
|
||||
"add %[sumi], %[sumi], %[tmp]"
|
||||
: [tmp] "=&r" (tmp), [sumi] "+&r" (sumi)
|
||||
: [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32)
|
||||
, [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
|
||||
q4 += 64; q8 += 128; scale += 4;
|
||||
}
|
||||
|
||||
sumf += d * sumi;
|
||||
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __riscv_v
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
|
||||
float sumf = 0;
|
||||
const int vector_length = __riscv_vlenb() * 8;
|
||||
|
||||
switch (vector_length) {
|
||||
case 256:
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
@@ -8074,7 +8414,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
#elif defined __riscv_v
|
||||
|
||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||
const uint8_t * mins = (const uint8_t*)&utmp[2];
|
||||
@@ -9232,11 +9572,92 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __riscv_v_intrinsic
|
||||
#elif defined __riscv_xtheadvector
|
||||
|
||||
const int vector_length = __riscv_vlenb() * 8;
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
|
||||
const uint8_t * restrict q6 = x[i].ql;
|
||||
const uint8_t * restrict qh = x[i].qh;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
const int8_t * restrict scale = x[i].scales;
|
||||
|
||||
int sum_t = 0;
|
||||
int t0;
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
__asm__ __volatile__(
|
||||
"th.vsetvli zero, %[vl32], e8, m2\n\t" // vl == 32
|
||||
"th.vlb.v v4, (%[qh])\n\t"
|
||||
"th.vsll.vi v0, v4, 4\n\t"
|
||||
"th.vsll.vi v2, v4, 2\n\t"
|
||||
"th.vsrl.vi v6, v4, 2\n\t"
|
||||
"th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64
|
||||
"th.vlb.v v8, (%[q6])\n\t"
|
||||
"th.vsrl.vi v12, v8, 4\n\t"
|
||||
"th.vand.vi v8, v8, 0xF\n\t"
|
||||
"th.vsetvli zero, %[vl128], e8, m8\n\t" // vl == 128
|
||||
"th.vand.vx v0, v0, %[mask]\n\t"
|
||||
"th.vor.vv v8, v8, v0\n\t"
|
||||
"th.vlb.v v0, (%[q8])\n\t"
|
||||
"th.vsub.vx v8, v8, %[vl32]\n\t"
|
||||
"th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64
|
||||
"th.vwmul.vv v16, v0, v8\n\t"
|
||||
"th.vwmul.vv v24, v4, v12\n\t"
|
||||
"li %[t0], 16\n\t"
|
||||
"th.vsetvli zero, %[t0], e16, m2\n\t" // vl == 16
|
||||
"th.vmv.v.x v0, zero\n\t"
|
||||
"th.vwredsum.vs v10, v16, v0\n\t"
|
||||
"th.vwredsum.vs v9, v18, v0\n\t"
|
||||
"th.vwredsum.vs v8, v20, v0\n\t"
|
||||
"th.vwredsum.vs v7, v22, v0\n\t"
|
||||
"th.vwredsum.vs v11, v24, v0\n\t"
|
||||
"th.vwredsum.vs v12, v26, v0\n\t"
|
||||
"th.vwredsum.vs v13, v28, v0\n\t"
|
||||
"th.vwredsum.vs v14, v30, v0\n\t"
|
||||
"li %[t0], 4\n\t"
|
||||
"th.vsetvli zero, %[t0], e32, m1\n\t" // vl == 4
|
||||
"th.vslideup.vi v10, v9, 1\n\t"
|
||||
"th.vslideup.vi v8, v7, 1\n\t"
|
||||
"th.vslideup.vi v11, v12, 1\n\t"
|
||||
"th.vslideup.vi v13, v14, 1\n\t"
|
||||
"th.vslideup.vi v10, v8, 2\n\t"
|
||||
"th.vslideup.vi v11, v13, 2\n\t"
|
||||
"li %[t0], 8\n\t"
|
||||
"th.vsetvli zero, %[t0], e32, m2\n\t" // vl == 8
|
||||
"th.vlb.v v4, (%[scale])\n\t"
|
||||
"th.vmul.vv v2, v4, v10\n\t"
|
||||
"th.vredsum.vs v0, v2, v0\n\t"
|
||||
"th.vmv.x.s %[t0], v0\n\t"
|
||||
"add %[sumi], %[sumi], %[t0]"
|
||||
: [sumi] "+&r" (sum_t), [t0] "=&r" (t0)
|
||||
: [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale)
|
||||
, [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
|
||||
, [mask] "r" (0x30)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
|
||||
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
|
||||
);
|
||||
q6 += 64; qh += 32; q8 += 128; scale += 8;
|
||||
}
|
||||
|
||||
sumf += d * sum_t;
|
||||
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __riscv_v
|
||||
|
||||
float sumf = 0;
|
||||
const int vector_length = __riscv_vlenb() * 8;
|
||||
|
||||
switch (vector_length) {
|
||||
case 256:
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
@@ -623,8 +623,8 @@ static __global__ void flash_attn_combine_results(
|
||||
__builtin_assume(tid < D);
|
||||
|
||||
extern __shared__ float2 meta[];
|
||||
if (tid < 2*parallel_blocks) {
|
||||
((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid];
|
||||
for (int i = tid; i < 2*parallel_blocks; i += D) {
|
||||
((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
@@ -386,7 +386,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
||||
return r;
|
||||
}
|
||||
|
||||
#elif defined(__riscv) && defined(GGML_RV_ZFH)
|
||||
#elif defined(__riscv) && defined(__riscv_zfhmin)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
float f;
|
||||
|
||||
@@ -55,14 +55,17 @@ endfunction()
|
||||
|
||||
set(GGML_OPENCL_KERNELS
|
||||
add
|
||||
argsort
|
||||
clamp
|
||||
cpy
|
||||
cvt
|
||||
diag_mask_inf
|
||||
div
|
||||
gelu
|
||||
gemv_noshuffle_general
|
||||
gemv_noshuffle
|
||||
get_rows
|
||||
group_norm
|
||||
im2col_f32
|
||||
im2col_f16
|
||||
mul_mat_Ab_Bi_8x4
|
||||
@@ -83,11 +86,14 @@ set(GGML_OPENCL_KERNELS
|
||||
rms_norm
|
||||
rope
|
||||
scale
|
||||
sigmoid
|
||||
silu
|
||||
softmax_4_f32
|
||||
softmax_4_f16
|
||||
softmax_f32
|
||||
softmax_f16
|
||||
sub
|
||||
sum_rows
|
||||
transpose
|
||||
)
|
||||
|
||||
|
||||
@@ -299,27 +299,37 @@ struct ggml_backend_opencl_context {
|
||||
cl_program program_mul_mv_f16_f32;
|
||||
cl_program program_mul_mv_f32_f32;
|
||||
cl_program program_mul;
|
||||
cl_program program_div;
|
||||
cl_program program_sub;
|
||||
cl_program program_norm;
|
||||
cl_program program_relu;
|
||||
cl_program program_rms_norm;
|
||||
cl_program program_group_norm;
|
||||
cl_program program_rope;
|
||||
cl_program program_scale;
|
||||
cl_program program_silu;
|
||||
cl_program program_sigmoid;
|
||||
cl_program program_softmax_f32;
|
||||
cl_program program_softmax_f16;
|
||||
cl_program program_softmax_4_f32;
|
||||
cl_program program_softmax_4_f16;
|
||||
cl_program program_argsort_f32_i32;
|
||||
cl_program program_sum_rows_f32;
|
||||
|
||||
cl_kernel kernel_add, kernel_add_row;
|
||||
cl_kernel kernel_mul, kernel_mul_row;
|
||||
cl_kernel kernel_div, kernel_div_row;
|
||||
cl_kernel kernel_sub, kernel_sub_row;
|
||||
cl_kernel kernel_scale;
|
||||
cl_kernel kernel_silu, kernel_silu_4;
|
||||
cl_kernel kernel_gelu, kernel_gelu_4;
|
||||
cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
|
||||
cl_kernel kernel_relu;
|
||||
cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
|
||||
cl_kernel kernel_clamp;
|
||||
cl_kernel kernel_norm;
|
||||
cl_kernel kernel_rms_norm;
|
||||
cl_kernel kernel_group_norm;
|
||||
cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
|
||||
cl_kernel kernel_soft_max, kernel_soft_max_4;
|
||||
cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
|
||||
@@ -339,6 +349,8 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
|
||||
cl_kernel kernel_mul_mv_q6_K_f32;
|
||||
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
|
||||
cl_kernel kernel_argsort_f32_i32;
|
||||
cl_kernel kernel_sum_rows_f32;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// Transpose kernels
|
||||
@@ -986,6 +998,105 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// argsort
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "argsort.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("argsort.cl");
|
||||
#endif
|
||||
backend_ctx->program_argsort_f32_i32 =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// div
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "div.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("div.cl");
|
||||
#endif
|
||||
backend_ctx->program_div =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// sub
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "sub.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("sub.cl");
|
||||
#endif
|
||||
backend_ctx->program_sub =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// sum_rows
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "sum_rows.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("sum_rows.cl");
|
||||
#endif
|
||||
backend_ctx->program_sum_rows_f32 =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// sigmoid
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "sigmoid.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("sigmoid.cl");
|
||||
#endif
|
||||
backend_ctx->program_sigmoid =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_sigmoid_f32 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_sigmoid_f16 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f16", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// group_norm
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "group_norm.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("group_norm.cl");
|
||||
#endif
|
||||
backend_ctx->program_group_norm =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// Adreno kernels
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// transpose
|
||||
@@ -1856,6 +1967,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_SUB:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
@@ -1863,7 +1976,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -1873,11 +1988,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_RMS_NORM:
|
||||
return true;
|
||||
case GGML_OP_GROUP_NORM:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_MUL_MAT:
|
||||
if (op->src[0]->type == GGML_TYPE_F16) {
|
||||
return true;
|
||||
} else if (op->src[0]->type == GGML_TYPE_F32) {
|
||||
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
||||
return op->src[1]->type == GGML_TYPE_F32;
|
||||
} else if (op->src[0]->type == GGML_TYPE_Q4_0 ||
|
||||
op->src[0]->type == GGML_TYPE_Q6_K) {
|
||||
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
||||
@@ -1912,6 +2029,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
}
|
||||
case GGML_OP_IM2COL:
|
||||
return true;
|
||||
case GGML_OP_ARGSORT:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_SUM_ROWS:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -3238,6 +3359,256 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const cl_ulong nb00 = src0->nb[0];
|
||||
const cl_ulong nb01 = src0->nb[1];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const int ne10 = src1->ne[0];
|
||||
const int ne11 = src1->ne[1];
|
||||
const int ne12 = src1->ne[2];
|
||||
const int ne13 = src1->ne[3];
|
||||
|
||||
const cl_ulong nb10 = src1->nb[0];
|
||||
const cl_ulong nb11 = src1->nb[1];
|
||||
const cl_ulong nb12 = src1->nb[2];
|
||||
const cl_ulong nb13 = src1->nb[3];
|
||||
|
||||
const int ne0 = dst->ne[0];
|
||||
|
||||
const cl_ulong nb0 = dst->nb[0];
|
||||
const cl_ulong nb1 = dst->nb[1];
|
||||
const cl_ulong nb2 = dst->nb[2];
|
||||
const cl_ulong nb3 = dst->nb[3];
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
bool bcast_row = false;
|
||||
cl_kernel kernel;
|
||||
|
||||
if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
// src1 is a row
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
|
||||
bcast_row = true;
|
||||
int ne = ne00 / 4;
|
||||
kernel = backend_ctx->kernel_div_row;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_div;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
|
||||
}
|
||||
|
||||
if (bcast_row) {
|
||||
int n = ggml_nelements(dst)/4;
|
||||
size_t global_work_size[] = {(size_t)n, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
#endif
|
||||
} else {
|
||||
unsigned int nth = MIN(64, ne0);
|
||||
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {nth, 1, 1};
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const cl_ulong nb00 = src0->nb[0];
|
||||
const cl_ulong nb01 = src0->nb[1];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const int ne10 = src1->ne[0];
|
||||
const int ne11 = src1->ne[1];
|
||||
const int ne12 = src1->ne[2];
|
||||
const int ne13 = src1->ne[3];
|
||||
|
||||
const cl_ulong nb10 = src1->nb[0];
|
||||
const cl_ulong nb11 = src1->nb[1];
|
||||
const cl_ulong nb12 = src1->nb[2];
|
||||
const cl_ulong nb13 = src1->nb[3];
|
||||
|
||||
const int ne0 = dst->ne[0];
|
||||
|
||||
const cl_ulong nb0 = dst->nb[0];
|
||||
const cl_ulong nb1 = dst->nb[1];
|
||||
const cl_ulong nb2 = dst->nb[2];
|
||||
const cl_ulong nb3 = dst->nb[3];
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
bool bcast_row = false;
|
||||
cl_kernel kernel;
|
||||
|
||||
if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
// src1 is a row
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
|
||||
bcast_row = true;
|
||||
int ne = ne00 / 4;
|
||||
kernel = backend_ctx->kernel_sub_row;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_sub;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
|
||||
}
|
||||
|
||||
if (bcast_row) {
|
||||
int n = ggml_nelements(dst)/4;
|
||||
size_t global_work_size[] = {(size_t)n, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
#endif
|
||||
} else {
|
||||
unsigned int nth = MIN(64, ne0);
|
||||
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {nth, 1, 1};
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -3429,6 +3800,58 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
UNUSED(src1);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
cl_kernel kernel;
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_sigmoid_f32;
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
|
||||
kernel = backend_ctx->kernel_sigmoid_f16;
|
||||
} else {
|
||||
GGML_ASSERT(false && "Unsupported data types for sigmoid (input and output must be both f32 or f16)");
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
size_t global_work_size[] = {(size_t)n, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
size_t * local_work_size_ptr = local_work_size;
|
||||
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
|
||||
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -3626,6 +4049,65 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
UNUSED(src1);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
int32_t n_groups = ((const int32_t *) dst->op_params)[0];
|
||||
int32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + n_groups - 1) / n_groups);
|
||||
float eps = ((const float *) dst->op_params)[1];
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne = ne00*ne01*ne02;
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_group_norm;
|
||||
|
||||
size_t sgs = 64;
|
||||
if (backend_ctx->gpu_family == ADRENO) {
|
||||
sgs = 64;
|
||||
} else if (backend_ctx->gpu_family == INTEL) {
|
||||
sgs = 32;
|
||||
} else {
|
||||
GGML_ASSERT(false && "Unsupported GPU");
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &group_size));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
|
||||
|
||||
size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
|
||||
size_t local_work_size[] = {(size_t)sgs, 1, 1};
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -4975,6 +5457,124 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
GGML_UNUSED(src1);
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int nrows = ggml_nrows(src0);
|
||||
|
||||
int ne00_padded = 1;
|
||||
while (ne00_padded < ne00) {
|
||||
ne00_padded *= 2;
|
||||
}
|
||||
|
||||
int order = (enum ggml_sort_order) dst->op_params[0];
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00_padded));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &order));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, ne00_padded*sizeof(int), NULL));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
|
||||
size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
GGML_UNUSED(src1);
|
||||
|
||||
GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const cl_ulong nb01 = src0->nb[1];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const cl_ulong nb1 = dst->nb[1];
|
||||
const cl_ulong nb2 = dst->nb[2];
|
||||
const cl_ulong nb3 = dst->nb[3];
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_sum_rows_f32;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)64, 1, 1};
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Op offloading
|
||||
//------------------------------------------------------------------------------
|
||||
@@ -5023,6 +5623,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_mul;
|
||||
break;
|
||||
case GGML_OP_DIV:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_div;
|
||||
break;
|
||||
case GGML_OP_SUB:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_sub;
|
||||
break;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(tensor)) {
|
||||
case GGML_UNARY_OP_GELU:
|
||||
@@ -5049,6 +5661,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_relu;
|
||||
break;
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_sigmoid;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
} break;
|
||||
@@ -5070,6 +5688,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_rms_norm;
|
||||
break;
|
||||
case GGML_OP_GROUP_NORM:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_group_norm;
|
||||
break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
|
||||
return false;
|
||||
@@ -5115,6 +5739,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_im2col;
|
||||
break;
|
||||
case GGML_OP_ARGSORT:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_argsort;
|
||||
break;
|
||||
case GGML_OP_SUM_ROWS:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_sum_rows;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
86
ggml/src/ggml-opencl/kernels/argsort.cl
Normal file
86
ggml/src/ggml-opencl/kernels/argsort.cl
Normal file
@@ -0,0 +1,86 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; }
|
||||
|
||||
enum ggml_sort_order {
|
||||
GGML_SORT_ORDER_ASC,
|
||||
GGML_SORT_ORDER_DESC,
|
||||
};
|
||||
|
||||
kernel void kernel_argsort_f32_i32(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global int * dst,
|
||||
ulong offsetd,
|
||||
const int ne00,
|
||||
const int ne00_pad,
|
||||
const int order,
|
||||
local int * dst_row
|
||||
) {
|
||||
// bitonic sort
|
||||
int col = get_local_id(0);
|
||||
int row = get_group_id(1);
|
||||
|
||||
if (col >= ne00_pad) {
|
||||
return;
|
||||
}
|
||||
|
||||
src0 = (global char *)((global char *)src0 + offset0);
|
||||
dst = (global float *)((global char *)dst + offsetd);
|
||||
|
||||
global float * x_row = src0 + row * ne00;
|
||||
|
||||
// initialize indices
|
||||
dst_row[col] = col;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (int k = 2; k <= ne00_pad; k *= 2) {
|
||||
for (int j = k / 2; j > 0; j /= 2) {
|
||||
int ixj = col ^ j;
|
||||
if (ixj > col) {
|
||||
if ((col & k) == 0) {
|
||||
if (dst_row[col] >= ne00 ||
|
||||
(dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ?
|
||||
x_row[dst_row[col]] > x_row[dst_row[ixj]] :
|
||||
x_row[dst_row[col]] < x_row[dst_row[ixj]]))
|
||||
) {
|
||||
SWAP(dst_row[col], dst_row[ixj], int);
|
||||
}
|
||||
} else {
|
||||
if (dst_row[ixj] >= ne00 ||
|
||||
(dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ?
|
||||
x_row[dst_row[col]] < x_row[dst_row[ixj]] :
|
||||
x_row[dst_row[col]] > x_row[dst_row[ixj]]))
|
||||
) {
|
||||
SWAP(dst_row[col], dst_row[ixj], int);
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
}
|
||||
|
||||
// copy the result to dst without the padding
|
||||
if (col < ne00) {
|
||||
dst[row * ne00 + col] = dst_row[col];
|
||||
}
|
||||
}
|
||||
72
ggml/src/ggml-opencl/kernels/div.cl
Normal file
72
ggml/src/ggml-opencl/kernels/div.cl
Normal file
@@ -0,0 +1,72 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// div
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_div(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
int ne0,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
int i13 = i03 % ne13;
|
||||
int i12 = i02 % ne12;
|
||||
int i11 = i01 % ne11;
|
||||
|
||||
global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
|
||||
global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
|
||||
global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const int i10 = i0 % ne10;
|
||||
*((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) / *((global float *)(src1_ptr + i10*nb10));
|
||||
}
|
||||
}
|
||||
|
||||
// assumption: src1 is a row
|
||||
// broadcast src1 into src0
|
||||
kernel void kernel_div_row(
|
||||
global float4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * src1,
|
||||
ulong offset1,
|
||||
global float4 * dst,
|
||||
ulong offsetd,
|
||||
int ne
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
src1 = (global float4*)((global char*)src1 + offset1);
|
||||
dst = (global float4*)((global char*)dst + offsetd);
|
||||
|
||||
// This performs better than using %.
|
||||
uint gid = get_global_id(0);
|
||||
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
||||
dst[gid] = src0[gid] / src1[idx1];
|
||||
}
|
||||
72
ggml/src/ggml-opencl/kernels/group_norm.cl
Normal file
72
ggml/src/ggml-opencl/kernels/group_norm.cl
Normal file
@@ -0,0 +1,72 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
// Workgroup must be a subgroup
|
||||
#ifdef INTEL_GPU
|
||||
REQD_SUBGROUP_SIZE_32
|
||||
#elif defined (ADRENO_GPU)
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_group_norm(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne,
|
||||
int group_size,
|
||||
float eps
|
||||
) {
|
||||
src0 = (global float *)((global char *)src0 + offset0);
|
||||
dst = (global float *)((global char *)dst + offsetd);
|
||||
|
||||
int start = get_group_id(0) * group_size;
|
||||
int end = start + group_size;
|
||||
|
||||
start += get_local_id(0);
|
||||
|
||||
if (end >= ne) {
|
||||
end = ne;
|
||||
}
|
||||
|
||||
float tmp = 0.0f;
|
||||
|
||||
for (int j = start; j < end; j += get_local_size(0)) {
|
||||
tmp += src0[j];
|
||||
}
|
||||
|
||||
tmp = sub_group_reduce_add(tmp);
|
||||
|
||||
const float mean = tmp / group_size;
|
||||
tmp = 0.0f;
|
||||
|
||||
for (int j = start; j < end; j += get_local_size(0)) {
|
||||
float xi = src0[j] - mean;
|
||||
dst[j] = xi;
|
||||
tmp += xi * xi;
|
||||
}
|
||||
|
||||
tmp = sub_group_reduce_add(tmp);
|
||||
|
||||
const float variance = tmp / group_size;
|
||||
const float scale = 1.0f/sqrt(variance + eps);
|
||||
for (int j = start; j < end; j += get_local_size(0)) {
|
||||
dst[j] *= scale;
|
||||
}
|
||||
}
|
||||
29
ggml/src/ggml-opencl/kernels/sigmoid.cl
Normal file
29
ggml/src/ggml-opencl/kernels/sigmoid.cl
Normal file
@@ -0,0 +1,29 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// sigmoid
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
kernel void kernel_sigmoid_f32(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
|
||||
}
|
||||
|
||||
kernel void kernel_sigmoid_f16(
|
||||
global half * src0,
|
||||
ulong offset0,
|
||||
global half * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global half*)((global char*)src0 + offset0);
|
||||
dst = (global half*)((global char*)dst + offsetd);
|
||||
|
||||
dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
|
||||
}
|
||||
72
ggml/src/ggml-opencl/kernels/sub.cl
Normal file
72
ggml/src/ggml-opencl/kernels/sub.cl
Normal file
@@ -0,0 +1,72 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// div
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_sub(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
int ne0,
|
||||
ulong nb0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
int i13 = i03 % ne13;
|
||||
int i12 = i02 % ne12;
|
||||
int i11 = i01 % ne11;
|
||||
|
||||
global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
|
||||
global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
|
||||
global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const int i10 = i0 % ne10;
|
||||
*((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) - *((global float *)(src1_ptr + i10*nb10));
|
||||
}
|
||||
}
|
||||
|
||||
// assumption: src1 is a row
|
||||
// broadcast src1 into src0
|
||||
kernel void kernel_sub_row(
|
||||
global float4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * src1,
|
||||
ulong offset1,
|
||||
global float4 * dst,
|
||||
ulong offsetd,
|
||||
int ne
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
src1 = (global float4*)((global char*)src1 + offset1);
|
||||
dst = (global float4*)((global char*)dst + offsetd);
|
||||
|
||||
// This performs better than using %.
|
||||
uint gid = get_global_id(0);
|
||||
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
|
||||
dst[gid] = src0[gid] - src1[idx1];
|
||||
}
|
||||
39
ggml/src/ggml-opencl/kernels/sum_rows.cl
Normal file
39
ggml/src/ggml-opencl/kernels/sum_rows.cl
Normal file
@@ -0,0 +1,39 @@
|
||||
|
||||
kernel void kernel_sum_rows_f32(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = (global float *)((global char *)src0 + offset0);
|
||||
dst = (global float *)((global char *)dst + offsetd);
|
||||
|
||||
int i3 = get_global_id(2);
|
||||
int i2 = get_global_id(1);
|
||||
int i1 = get_global_id(0);
|
||||
|
||||
if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
|
||||
global float * dst_row = (global float *) ((global char *) dst + i1*nb1 + i2*nb2 + i3*nb3);
|
||||
|
||||
float row_sum = 0;
|
||||
|
||||
for (int i0 = 0; i0 < ne00; i0++) {
|
||||
row_sum += src_row[i0];
|
||||
}
|
||||
|
||||
dst_row[0] = row_sum;
|
||||
}
|
||||
@@ -84,6 +84,15 @@ static void gelu_quick(const T *x, T *dst, int k,
|
||||
dst[i] = x[i] * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF * x[i])));
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void gelu_erf(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
|
||||
const T SQRT_2_INV = static_cast<T>(0.70710678118654752440084436210484f);
|
||||
for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
|
||||
auto x_i = x[i];
|
||||
dst[i] = static_cast<T>(0.5f) * x_i * (static_cast<T>(1.0f) + sycl::erf(x_i * SQRT_2_INV));
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void tanh(const T *x, T *dst, int k,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
@@ -400,6 +409,20 @@ static void gelu_quick_sycl(const T *x, T *dst, const int k,
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
static void gelu_erf_sycl(const T *x, T *dst, const int k,
|
||||
queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
|
||||
sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
gelu_erf(x, dst, k, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void tanh_sycl(const T *x, T *dst, const int k,
|
||||
queue_ptr stream) {
|
||||
@@ -816,6 +839,38 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor
|
||||
}
|
||||
}
|
||||
|
||||
inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
#if defined (GGML_SYCL_F16)
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
#else
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
#endif
|
||||
GGML_ASSERT(dst->src[0]->type == dst->type);
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
||||
switch (dst->type) {
|
||||
#if defined (GGML_SYCL_F16)
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
auto data_pts = cast_data<sycl::half>(dst);
|
||||
gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
auto data_pts = cast_data<float>(dst);
|
||||
gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("GGML tensor type not supported!\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
#if defined (GGML_SYCL_F16)
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
||||
@@ -1425,6 +1480,11 @@ void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_op_gelu_quick(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_gelu_erf(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_tanh(ctx, dst);
|
||||
|
||||
@@ -38,6 +38,8 @@ void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -3543,6 +3543,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
ggml_sycl_gelu_quick(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
ggml_sycl_gelu_erf(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_TANH:
|
||||
ggml_sycl_tanh(ctx, dst);
|
||||
break;
|
||||
@@ -4096,6 +4099,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
case GGML_UNARY_OP_HARDSWISH:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_SGN:
|
||||
|
||||
@@ -109,10 +109,6 @@ if (Vulkan_FOUND)
|
||||
add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
|
||||
endif()
|
||||
|
||||
if (GGML_VULKAN_PERF)
|
||||
add_compile_definitions(GGML_VULKAN_PERF)
|
||||
endif()
|
||||
|
||||
if (GGML_VULKAN_VALIDATE)
|
||||
add_compile_definitions(GGML_VULKAN_VALIDATE)
|
||||
endif()
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "ggml-vulkan.h"
|
||||
#include <vulkan/vulkan_core.h>
|
||||
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) || defined(GGML_VULKAN_CHECK_RESULTS)
|
||||
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
|
||||
#include <chrono>
|
||||
#include "ggml-cpu.h"
|
||||
#endif
|
||||
@@ -184,9 +184,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
|
||||
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
||||
class vk_memory_logger;
|
||||
#endif
|
||||
#ifdef GGML_VULKAN_PERF
|
||||
class vk_perf_logger;
|
||||
#endif
|
||||
static void ggml_vk_destroy_buffer(vk_buffer& buf);
|
||||
|
||||
static constexpr uint32_t mul_mat_vec_max_cols = 8;
|
||||
@@ -442,9 +440,11 @@ struct vk_device_struct {
|
||||
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
||||
std::unique_ptr<vk_memory_logger> memory_logger;
|
||||
#endif
|
||||
#ifdef GGML_VULKAN_PERF
|
||||
|
||||
// for GGML_VK_PERF_LOGGER
|
||||
std::unique_ptr<vk_perf_logger> perf_logger;
|
||||
#endif
|
||||
vk::QueryPool query_pool;
|
||||
uint32_t num_queries;
|
||||
|
||||
~vk_device_struct() {
|
||||
VK_LOG_DEBUG("destroy device " << name);
|
||||
@@ -828,8 +828,6 @@ private:
|
||||
#define VK_LOG_MEMORY(msg) ((void) 0)
|
||||
#endif // GGML_VULKAN_MEMORY_DEBUG
|
||||
|
||||
#if defined(GGML_VULKAN_PERF)
|
||||
|
||||
class vk_perf_logger {
|
||||
public:
|
||||
void print_timings() {
|
||||
@@ -839,7 +837,7 @@ public:
|
||||
for (const auto& time : t.second) {
|
||||
total += time;
|
||||
}
|
||||
std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl;
|
||||
std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
|
||||
}
|
||||
|
||||
timings.clear();
|
||||
@@ -868,7 +866,6 @@ public:
|
||||
private:
|
||||
std::map<std::string, std::vector<uint64_t>> timings;
|
||||
};
|
||||
#endif // GGML_VULKAN_PERF
|
||||
|
||||
struct ggml_backend_vk_context {
|
||||
std::string name;
|
||||
@@ -958,6 +955,8 @@ struct vk_instance_t {
|
||||
static bool vk_instance_initialized = false;
|
||||
static vk_instance_t vk_instance;
|
||||
|
||||
static bool vk_perf_logger_enabled = false;
|
||||
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
static size_t vk_skip_checks;
|
||||
static size_t vk_output_tensor;
|
||||
@@ -2757,9 +2756,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
#ifdef GGML_VULKAN_MEMORY_DEBUG
|
||||
device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
|
||||
#endif
|
||||
#ifdef GGML_VULKAN_PERF
|
||||
device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
|
||||
#endif
|
||||
if (vk_perf_logger_enabled) {
|
||||
device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
|
||||
}
|
||||
|
||||
size_t dev_num = vk_instance.device_indices[idx];
|
||||
|
||||
@@ -3547,6 +3546,8 @@ static void ggml_vk_instance_init() {
|
||||
vk_instance.instance = vk::createInstance(instance_create_info);
|
||||
vk_instance_initialized = true;
|
||||
|
||||
vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
|
||||
|
||||
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
|
||||
|
||||
// Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
|
||||
@@ -8885,7 +8886,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
|
||||
ctx->tensor_ctxs[node_idx] = compute_ctx;
|
||||
|
||||
#if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF)
|
||||
#if defined(GGML_VULKAN_CHECK_RESULTS)
|
||||
// Force context reset on each node so that each tensor ends up in its own context
|
||||
// and can be run and compared to its CPU equivalent separately
|
||||
last_node = true;
|
||||
@@ -9505,6 +9506,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
bool first_node_in_batch = true; // true if next node will be first node in a batch
|
||||
int submit_node_idx = 0; // index to first node in a batch
|
||||
|
||||
vk_context compute_ctx;
|
||||
if (vk_perf_logger_enabled) {
|
||||
// allocate/resize the query pool
|
||||
if (ctx->device->num_queries < cgraph->n_nodes + 1) {
|
||||
if (ctx->device->query_pool) {
|
||||
ctx->device->device.destroyQueryPool(ctx->device->query_pool);
|
||||
}
|
||||
VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
|
||||
query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
|
||||
query_create_info.queryCount = cgraph->n_nodes + 100;
|
||||
ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
|
||||
ctx->device->num_queries = query_create_info.queryCount;
|
||||
}
|
||||
|
||||
ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
|
||||
|
||||
GGML_ASSERT(ctx->compute_ctx.expired());
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
|
||||
}
|
||||
|
||||
// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
|
||||
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
|
||||
// (and scaled down based on model size, so smaller models submit earlier).
|
||||
@@ -9532,6 +9556,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
|
||||
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
|
||||
|
||||
if (vk_perf_logger_enabled) {
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
} else {
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
}
|
||||
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+1);
|
||||
}
|
||||
|
||||
if (enqueued) {
|
||||
++submitted_nodes;
|
||||
|
||||
@@ -9553,9 +9588,27 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_VULKAN_PERF
|
||||
ctx->device->perf_logger->print_timings();
|
||||
#endif
|
||||
if (vk_perf_logger_enabled) {
|
||||
// End the command buffer and submit/wait
|
||||
GGML_ASSERT(!ctx->compute_ctx.expired());
|
||||
compute_ctx = ctx->compute_ctx.lock();
|
||||
ggml_vk_ctx_end(compute_ctx);
|
||||
|
||||
ggml_vk_submit(compute_ctx, ctx->device->fence);
|
||||
VK_CHECK(ctx->device->device.waitForFences({ ctx->device->fence }, true, UINT64_MAX), "GGML_VULKAN_PERF waitForFences");
|
||||
ctx->device->device.resetFences({ ctx->device->fence });
|
||||
|
||||
// Get the results and pass them to the logger
|
||||
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
|
||||
ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
if (!ggml_vk_is_empty(cgraph->nodes[i])) {
|
||||
ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
|
||||
}
|
||||
}
|
||||
|
||||
ctx->device->perf_logger->print_timings();
|
||||
}
|
||||
|
||||
ggml_vk_graph_cleanup(ctx);
|
||||
|
||||
|
||||
@@ -2312,6 +2312,26 @@ struct ggml_tensor * ggml_repeat(
|
||||
return result;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_repeat_4d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
||||
const bool can_repeat = ggml_is_empty(a) || (
|
||||
(ne0 % a->ne[0] == 0) &&
|
||||
(ne1 % a->ne[1] == 0) &&
|
||||
(ne2 % a->ne[2] == 0) &&
|
||||
(ne3 % a->ne[3] == 0)
|
||||
);
|
||||
GGML_ASSERT(can_repeat);
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
|
||||
|
||||
result->op = GGML_OP_REPEAT;
|
||||
result->src[0] = a;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_repeat_back
|
||||
|
||||
struct ggml_tensor * ggml_repeat_back(
|
||||
|
||||
@@ -2260,6 +2260,7 @@ class VisionProjectorType:
|
||||
ULTRAVOX = "ultravox"
|
||||
INTERNVL = "internvl"
|
||||
QWEN2A = "qwen2a" # audio
|
||||
QWEN25O = "qwen2.5o" # omni
|
||||
|
||||
|
||||
# Items here are (block size, type size)
|
||||
|
||||
@@ -902,7 +902,6 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.V_MMPROJ_FC: (
|
||||
"model.connector.modality_projection.proj", # SmolVLM
|
||||
"multi_modal_projector.linear_1", # llama 4
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MMPROJ_MLP: (
|
||||
@@ -1125,6 +1124,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.A_POST_NORM: (
|
||||
"audio_tower.layer_norm", # ultravox
|
||||
"audio_tower.ln_post", # qwen2omni
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_Q: (
|
||||
@@ -1161,12 +1161,16 @@ class TensorNameMap:
|
||||
"audio_tower.layers.{bid}.fc2", # ultravox
|
||||
),
|
||||
|
||||
# note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
|
||||
# this prefix is added in the conversion code in modify_tensors()
|
||||
|
||||
MODEL_TENSOR.A_MMPROJ: (
|
||||
"audio.multi_modal_projector.linear_{bid}", # ultravox
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_MMPROJ_FC: (
|
||||
"audio.multi_modal_projector.linear", # qwen2audio
|
||||
"audio_tower.proj", # qwen2omni
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_MM_NORM_PRE: (
|
||||
|
||||
@@ -1 +1 @@
|
||||
7c06c10c532a6cda913c17fc56341e8880ae341d
|
||||
06b715f4c170232af261425240914fa49c44f982
|
||||
|
||||
@@ -14,6 +14,7 @@ add_library(llama
|
||||
llama-batch.cpp
|
||||
llama-chat.cpp
|
||||
llama-context.cpp
|
||||
llama-cparams.cpp
|
||||
llama-grammar.cpp
|
||||
llama-graph.cpp
|
||||
llama-hparams.cpp
|
||||
|
||||
@@ -130,6 +130,7 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_INTERNVL,
|
||||
PROJECTOR_TYPE_LLAMA4,
|
||||
PROJECTOR_TYPE_QWEN2A,
|
||||
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -148,6 +149,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
|
||||
{ PROJECTOR_TYPE_LLAMA4, "llama4"},
|
||||
{ PROJECTOR_TYPE_QWEN2A, "qwen2a"},
|
||||
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -17,12 +17,22 @@ struct clip_image_f32;
|
||||
struct clip_image_u8_batch;
|
||||
struct clip_image_f32_batch;
|
||||
|
||||
enum clip_modality {
|
||||
CLIP_MODALITY_VISION,
|
||||
CLIP_MODALITY_AUDIO,
|
||||
};
|
||||
|
||||
struct clip_context_params {
|
||||
bool use_gpu;
|
||||
enum ggml_log_level verbosity;
|
||||
};
|
||||
|
||||
struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
|
||||
struct clip_init_result {
|
||||
struct clip_ctx * ctx_v; // vision context
|
||||
struct clip_ctx * ctx_a; // audio context
|
||||
};
|
||||
|
||||
struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params);
|
||||
|
||||
void clip_free(struct clip_ctx * ctx);
|
||||
|
||||
|
||||
@@ -284,7 +284,9 @@ int main(int argc, char ** argv) {
|
||||
if (is_single_turn) {
|
||||
g_is_generating = true;
|
||||
if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
|
||||
params.prompt += mtmd_default_marker();
|
||||
for (size_t i = 0; i < params.image.size(); i++) {
|
||||
params.prompt += mtmd_default_marker();
|
||||
}
|
||||
}
|
||||
common_chat_msg msg;
|
||||
msg.role = "user";
|
||||
|
||||
@@ -66,7 +66,8 @@ struct decode_embd_batch {
|
||||
}
|
||||
}
|
||||
|
||||
void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
|
||||
// M-RoPE for image
|
||||
void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
|
||||
GGML_ASSERT(n_pos_per_embd == 4);
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int y = 0; y < ny; y++) {
|
||||
@@ -85,6 +86,23 @@ struct decode_embd_batch {
|
||||
}
|
||||
}
|
||||
|
||||
// M-RoPE for audio
|
||||
void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) {
|
||||
GGML_ASSERT(n_pos_per_embd == 4);
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
pos[i ] = pos_0 + i;
|
||||
pos[i + batch.n_tokens ] = pos_0 + i;
|
||||
pos[i + batch.n_tokens * 2] = pos_0 + i;
|
||||
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
|
||||
}
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch get_view(int offset, int n_tokens) {
|
||||
llama_pos * pos_ptr;
|
||||
pos_view.clear();
|
||||
@@ -146,18 +164,20 @@ int32_t mtmd_helper_decode_image_chunk(
|
||||
decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
|
||||
|
||||
if (mtmd_decode_use_mrope(ctx)) {
|
||||
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
if (chunk_type != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
LOG_ERR("failed to decode chunk: M-RoPE only accepts image chunk\n");
|
||||
return -1;
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
if (!image_tokens) {
|
||||
LOG_ERR("failed to decode chunk: image tokens are null\n");
|
||||
return -1;
|
||||
}
|
||||
const int nx = mtmd_image_tokens_get_nx(image_tokens);
|
||||
const int ny = mtmd_image_tokens_get_ny(image_tokens);
|
||||
batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
batch_embd.set_position_mrope_1d(n_past, seq_id);
|
||||
} else {
|
||||
GGML_ABORT("invalid chunk type for M-RoPE");
|
||||
}
|
||||
if (!image_tokens) {
|
||||
LOG_ERR("failed to decode chunk: image tokens are null\n");
|
||||
return -1;
|
||||
}
|
||||
const int nx = mtmd_image_tokens_get_nx(image_tokens);
|
||||
const int ny = mtmd_image_tokens_get_ny(image_tokens);
|
||||
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
|
||||
} else {
|
||||
batch_embd.set_position_normal(n_past, seq_id);
|
||||
}
|
||||
|
||||
@@ -95,15 +95,21 @@ mtmd_context_params mtmd_context_params_default() {
|
||||
}
|
||||
|
||||
struct mtmd_context {
|
||||
struct clip_ctx * ctx_clip;
|
||||
struct clip_ctx * ctx_v; // vision
|
||||
struct clip_ctx * ctx_a; // audio
|
||||
const struct llama_model * text_model;
|
||||
std::vector<float> image_embd_v; // image embedding vector
|
||||
|
||||
bool print_timings;
|
||||
int n_threads;
|
||||
std::string media_marker;
|
||||
bool has_vision;
|
||||
bool has_audio;
|
||||
const int n_embd_text;
|
||||
|
||||
// these are not token, but strings used to mark the beginning and end of image/audio embeddings
|
||||
std::string img_beg;
|
||||
std::string img_end;
|
||||
std::string aud_beg;
|
||||
std::string aud_end;
|
||||
|
||||
// for llava-uhd style models, we need special tokens in-between slices
|
||||
// minicpmv calls them "slices", llama 4 calls them "tiles"
|
||||
@@ -132,33 +138,61 @@ struct mtmd_context {
|
||||
text_model (text_model),
|
||||
print_timings(ctx_params.print_timings),
|
||||
n_threads (ctx_params.n_threads),
|
||||
media_marker (ctx_params.media_marker)
|
||||
media_marker (ctx_params.media_marker),
|
||||
n_embd_text (llama_model_n_embd(text_model))
|
||||
{
|
||||
if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
|
||||
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
|
||||
}
|
||||
|
||||
if (media_marker.empty()) {
|
||||
throw std::runtime_error("media_marker must not be empty");
|
||||
}
|
||||
|
||||
clip_context_params ctx_clip_params;
|
||||
ctx_clip_params.use_gpu = ctx_params.use_gpu;
|
||||
ctx_clip_params.verbosity = ctx_params.verbosity;
|
||||
ctx_clip = clip_init(mmproj_fname, ctx_clip_params);
|
||||
if (!ctx_clip) {
|
||||
auto res = clip_init(mmproj_fname, ctx_clip_params);
|
||||
ctx_v = res.ctx_v;
|
||||
ctx_a = res.ctx_a;
|
||||
if (!ctx_v && !ctx_a) {
|
||||
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
|
||||
}
|
||||
|
||||
if (llama_model_n_embd(text_model) != clip_n_mmproj_embd(ctx_clip)) {
|
||||
// if both vision and audio mmproj are present, we need to validate their n_embd
|
||||
if (ctx_v && ctx_a) {
|
||||
int n_embd_v = clip_n_mmproj_embd(ctx_v);
|
||||
int n_embd_a = clip_n_mmproj_embd(ctx_a);
|
||||
if (n_embd_v != n_embd_a) {
|
||||
throw std::runtime_error(string_format(
|
||||
"mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n",
|
||||
n_embd_v, n_embd_a));
|
||||
}
|
||||
}
|
||||
|
||||
// since we already validate n_embd of vision and audio mmproj,
|
||||
// we can safely assume that they are the same
|
||||
int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
|
||||
if (n_embd_text != n_embd_clip) {
|
||||
throw std::runtime_error(string_format(
|
||||
"mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
|
||||
"hint: you may be using wrong mmproj\n",
|
||||
llama_model_n_embd(text_model), clip_n_mmproj_embd(ctx_clip)));
|
||||
n_embd_text, n_embd_clip));
|
||||
}
|
||||
if (ctx_v) {
|
||||
init_vision();
|
||||
}
|
||||
if (ctx_a) {
|
||||
init_audio();
|
||||
}
|
||||
}
|
||||
|
||||
has_vision = clip_has_vision_encoder(ctx_clip);
|
||||
has_audio = clip_has_audio_encoder(ctx_clip);
|
||||
use_mrope = clip_is_qwen2vl(ctx_clip);
|
||||
void init_vision() {
|
||||
GGML_ASSERT(ctx_v != nullptr);
|
||||
use_mrope = clip_is_qwen2vl(ctx_v);
|
||||
|
||||
projector_type proj = clip_get_projector_type(ctx_clip);
|
||||
int minicpmv_version = clip_is_minicpmv(ctx_clip);
|
||||
projector_type proj = clip_get_projector_type(ctx_v);
|
||||
int minicpmv_version = clip_is_minicpmv(ctx_v);
|
||||
if (minicpmv_version == 2) {
|
||||
// minicpmv 2.5 format:
|
||||
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
|
||||
@@ -203,24 +237,82 @@ struct mtmd_context {
|
||||
ov_img_first = false; // overview image is last
|
||||
}
|
||||
|
||||
if (clip_has_whisper_encoder(ctx_clip)) {
|
||||
// set boi/eoi
|
||||
if (proj == PROJECTOR_TYPE_GEMMA3) {
|
||||
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
||||
img_beg = "<start_of_image>";
|
||||
img_end = "<end_of_image>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_IDEFICS3) {
|
||||
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
||||
img_beg = "<fake_token_around_image><global-img>";
|
||||
img_end = "<fake_token_around_image>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_PIXTRAL) {
|
||||
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
||||
img_end = "[IMG_END]";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL) {
|
||||
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
||||
img_beg = "<|vision_start|>";
|
||||
img_end = "<|vision_end|>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_LLAMA4) {
|
||||
// (more details in mtmd_context constructor)
|
||||
img_beg = "<|image_start|>";
|
||||
img_end = "<|image_end|>";
|
||||
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
|
||||
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_INTERNVL) {
|
||||
// <img> ... (image embeddings) ... </img>
|
||||
img_beg = "<img>";
|
||||
img_end = "</img>";
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
void init_audio() {
|
||||
GGML_ASSERT(ctx_a != nullptr);
|
||||
projector_type proj = clip_get_projector_type(ctx_a);
|
||||
|
||||
if (clip_has_whisper_encoder(ctx_a)) {
|
||||
// TODO @ngxson : check if model n_mel is 128 or 80
|
||||
w_filters = whisper_precalc_filters::get_128_bins();
|
||||
}
|
||||
|
||||
// warning messages
|
||||
if (proj == PROJECTOR_TYPE_LLAMA4) {
|
||||
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
|
||||
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
|
||||
}
|
||||
if (has_audio) {
|
||||
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
|
||||
" https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
|
||||
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
|
||||
" https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
|
||||
|
||||
if (proj == PROJECTOR_TYPE_QWEN2A) {
|
||||
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
||||
aud_beg = "<|audio_bos|>";
|
||||
aud_end = "<|audio_eos|>";
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// get clip ctx based on chunk type
|
||||
clip_ctx * get_clip_ctx(const mtmd_input_chunk * chunk) const {
|
||||
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
return ctx_v;
|
||||
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
return ctx_a;
|
||||
}
|
||||
GGML_ABORT("unknown chunk type");
|
||||
}
|
||||
|
||||
projector_type proj_type_v() const {
|
||||
return ctx_v ? clip_get_projector_type(ctx_v) : PROJECTOR_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
projector_type proj_type_a() const {
|
||||
return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
~mtmd_context() {
|
||||
clip_free(ctx_clip);
|
||||
clip_free(ctx_a);
|
||||
clip_free(ctx_v);
|
||||
}
|
||||
|
||||
private:
|
||||
@@ -267,107 +359,315 @@ void mtmd_free(mtmd_context * ctx) {
|
||||
}
|
||||
}
|
||||
|
||||
// copied from common_tokenize
|
||||
static std::vector<llama_token> mtmd_tokenize_text_internal(
|
||||
const struct llama_vocab * vocab,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special) {
|
||||
// upper limit for the number of tokens
|
||||
int n_tokens = text.length() + 2 * add_special;
|
||||
std::vector<llama_token> result(n_tokens);
|
||||
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
struct mtmd_tokenizer {
|
||||
mtmd_context * ctx;
|
||||
std::vector<const mtmd_bitmap *> bitmaps;
|
||||
|
||||
int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
mtmd_input_chunks * output,
|
||||
std::string input_text;
|
||||
bool add_special;
|
||||
bool parse_special;
|
||||
const llama_vocab * vocab;
|
||||
|
||||
mtmd_input_chunks cur;
|
||||
|
||||
mtmd_tokenizer(mtmd_context * ctx,
|
||||
const mtmd_input_text * text,
|
||||
const mtmd_bitmap ** bitmaps,
|
||||
size_t n_bitmaps) {
|
||||
auto vocab = llama_model_get_vocab(ctx->text_model);
|
||||
|
||||
std::string prompt_modified(text->text);
|
||||
std::string marker_modified(ctx->media_marker);
|
||||
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
|
||||
|
||||
// for compatibility, we convert image marker to media marker
|
||||
string_replace_all(prompt_modified, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
|
||||
|
||||
// a bit hacky here, but works for now
|
||||
// for some models, we need to add prefix and suffix to the image embeddings
|
||||
if (clip_is_gemma3(ctx->ctx_clip)) {
|
||||
// gemma 3
|
||||
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
||||
marker_modified = "<start_of_image>" + ctx->media_marker + "<end_of_image>";
|
||||
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
||||
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
||||
marker_modified = "<fake_token_around_image><global-img>" + ctx->media_marker + "<fake_token_around_image>";
|
||||
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
|
||||
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
||||
marker_modified = ctx->media_marker + "[IMG_END]";
|
||||
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
|
||||
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
||||
marker_modified = "<|vision_start|>" + ctx->media_marker + "<|vision_end|>";
|
||||
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
|
||||
// (more details in mtmd_context constructor)
|
||||
marker_modified = "<|image_start|>" + ctx->media_marker + "<|image_end|>";
|
||||
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
|
||||
// <img> ... (image embeddings) ... </img>
|
||||
marker_modified = "<img>" + ctx->media_marker + "</img>";
|
||||
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
|
||||
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
||||
marker_modified = "<|audio_bos|>" + ctx->media_marker + "<|audio_eos|>";
|
||||
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
||||
size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) {
|
||||
add_special = text->add_special;
|
||||
parse_special = text->parse_special;
|
||||
input_text = text->text;
|
||||
vocab = llama_model_get_vocab(ctx->text_model);
|
||||
|
||||
// for compatibility, we convert image marker to media marker
|
||||
string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
|
||||
}
|
||||
|
||||
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
|
||||
// for glm-edge, BOI and EOI token's embeddings are not present in the text model
|
||||
int32_t tokenize(mtmd_input_chunks * output) {
|
||||
cur.entries.clear();
|
||||
std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
|
||||
size_t i_bm = 0; // index of the current bitmap
|
||||
for (auto & part : parts) {
|
||||
if (part == ctx->media_marker) {
|
||||
// this is a marker, we should add the next bitmap
|
||||
if (i_bm >= bitmaps.size()) {
|
||||
LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
|
||||
__func__, bitmaps.size(), parts.size() - 1);
|
||||
return 1;
|
||||
}
|
||||
const mtmd_bitmap * bitmap = bitmaps[i_bm++];
|
||||
int32_t res = add_media(bitmap);
|
||||
if (res != 0) {
|
||||
return res;
|
||||
}
|
||||
} else {
|
||||
// this is a text part, we should add it as text
|
||||
add_text(part, parse_special);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->media_marker);
|
||||
output->entries.clear();
|
||||
output->entries.reserve(parts.size());
|
||||
if (add_special && llama_vocab_get_add_bos(vocab)) {
|
||||
// if first chunk is text, we add BOS token to first text chunk
|
||||
// otherwise, create a new text chunk with BOS token
|
||||
if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
// add BOS token to the beginning of first text chunk
|
||||
cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
|
||||
} else {
|
||||
// create a new text chunk with BOS token at the beginning
|
||||
mtmd_input_chunk bos_chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
{llama_vocab_bos(vocab)},
|
||||
nullptr, // image tokens
|
||||
nullptr, // audio tokens
|
||||
};
|
||||
cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
|
||||
}
|
||||
}
|
||||
|
||||
size_t i_bm = 0;
|
||||
if (add_special && llama_vocab_get_add_eos(vocab)) {
|
||||
// if last chunk is text, we add EOS token to it
|
||||
add_text({llama_vocab_eos(vocab)});
|
||||
}
|
||||
|
||||
// utility for adding raw tokens
|
||||
auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
std::move(tokens),
|
||||
nullptr, // image tokens
|
||||
nullptr, // audio tokens
|
||||
};
|
||||
output->entries.emplace_back(std::move(chunk));
|
||||
};
|
||||
if (i_bm != bitmaps.size()) {
|
||||
LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
|
||||
__func__, bitmaps.size(), parts.size() - 1);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// utility for splitting batch of multiple images into chunks of batch having single images
|
||||
auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) {
|
||||
*output = std::move(cur);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void add_text(const std::string & txt, bool parse_special) {
|
||||
LOG_DBG("%s: %s\n", __func__, txt.c_str());
|
||||
auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
|
||||
add_text(tokens);
|
||||
}
|
||||
|
||||
void add_text(const std::vector<llama_token> & tokens) {
|
||||
if (tokens.empty()) {
|
||||
return;
|
||||
}
|
||||
// if last entry is also a text chunk, add tokens to it instead of creating new chunk
|
||||
if (!cur.entries.empty() && cur.entries.back().type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
cur.entries.back().tokens_text.insert(
|
||||
cur.entries.back().tokens_text.end(),
|
||||
tokens.begin(),
|
||||
tokens.end());
|
||||
} else {
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
tokens,
|
||||
nullptr, // image tokens
|
||||
nullptr, // audio tokens
|
||||
};
|
||||
cur.entries.emplace_back(std::move(chunk));
|
||||
}
|
||||
}
|
||||
|
||||
int32_t add_media(const mtmd_bitmap * bitmap) {
|
||||
if (!bitmap->is_audio) {
|
||||
// handle image
|
||||
|
||||
if (!ctx->ctx_v) {
|
||||
LOG_ERR("%s: error: model does not support vision input\n", __func__);
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (!ctx->img_beg.empty()) {
|
||||
add_text(ctx->img_beg, true); // add image begin token
|
||||
}
|
||||
|
||||
// convert mtmd_bitmap to clip_image_u8
|
||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
||||
img_u8->nx = bitmap->nx;
|
||||
img_u8->ny = bitmap->ny;
|
||||
img_u8->buf.resize(bitmap->data.size());
|
||||
std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
|
||||
|
||||
// preprocess image
|
||||
clip_image_f32_batch batch_f32;
|
||||
bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess image\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
// handle llava-uhd style preprocessing
|
||||
if (
|
||||
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|
||||
) {
|
||||
// split batch into chunks of single images
|
||||
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
|
||||
GGML_ASSERT(chunks.size() > 0);
|
||||
|
||||
auto ov_chunk = std::move(chunks.front());
|
||||
chunks.erase(chunks.begin());
|
||||
|
||||
// add overview image (first)
|
||||
if (ctx->ov_img_first) {
|
||||
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_ov_img_start});
|
||||
}
|
||||
cur.entries.emplace_back(std::move(ov_chunk));
|
||||
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_ov_img_end});
|
||||
}
|
||||
}
|
||||
|
||||
// add slices (or tiles)
|
||||
if (!chunks.empty()) {
|
||||
const int n_col = batch_f32.grid_x;
|
||||
const int n_row = batch_f32.grid_y;
|
||||
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_slices_start});
|
||||
}
|
||||
for (int y = 0; y < n_row; y++) {
|
||||
for (int x = 0; x < n_col; x++) {
|
||||
const bool is_last_in_row = (x == n_col - 1);
|
||||
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_sli_img_start});
|
||||
}
|
||||
cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
|
||||
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_sli_img_end});
|
||||
}
|
||||
if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_sli_img_mid});
|
||||
}
|
||||
}
|
||||
if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_row_end});
|
||||
}
|
||||
}
|
||||
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_slices_end});
|
||||
}
|
||||
}
|
||||
|
||||
// add overview image (last)
|
||||
if (!ctx->ov_img_first) {
|
||||
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_ov_img_start});
|
||||
}
|
||||
cur.entries.emplace_back(std::move(ov_chunk));
|
||||
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_ov_img_end});
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
size_t n_tokens = 0;
|
||||
for (const auto & entry : batch_f32.entries) {
|
||||
n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
|
||||
}
|
||||
|
||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
||||
if (ctx->use_mrope) {
|
||||
// for Qwen2VL, we need this information for M-RoPE decoding positions
|
||||
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
|
||||
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
|
||||
image_tokens->use_mrope_pos = true;
|
||||
} else {
|
||||
// other models, we only need the total number of tokens
|
||||
image_tokens->nx = n_tokens;
|
||||
image_tokens->ny = 1;
|
||||
}
|
||||
image_tokens->batch_f32 = std::move(batch_f32);
|
||||
image_tokens->id = bitmap->id; // optional
|
||||
|
||||
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
|
||||
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
|
||||
LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
|
||||
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
||||
{}, // text tokens
|
||||
std::move(image_tokens),
|
||||
nullptr, // audio tokens
|
||||
};
|
||||
cur.entries.emplace_back(std::move(chunk));
|
||||
}
|
||||
|
||||
if (!ctx->img_end.empty()) {
|
||||
add_text(ctx->img_end, true); // add image end token
|
||||
}
|
||||
|
||||
} else {
|
||||
// handle audio
|
||||
|
||||
if (!ctx->ctx_a) {
|
||||
LOG_ERR("%s: error: model does not support audio input\n", __func__);
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (bitmap->data.size() == 0) {
|
||||
LOG_ERR("%s: error: empty audio data\n", __func__);
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (!ctx->aud_beg.empty()) {
|
||||
add_text(ctx->aud_beg, true); // add audio begin token
|
||||
}
|
||||
|
||||
// preprocess audio
|
||||
GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
|
||||
std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
|
||||
const float * samples = (const float *)bitmap->data.data();
|
||||
size_t n_samples = bitmap->data.size() / sizeof(float);
|
||||
bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess audio\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
// consider each mel_spec as a separate audio chunk
|
||||
// TODO: maybe support batching, but this may come with memory cost
|
||||
for (auto & mel_spec : mel_spec_chunks) {
|
||||
clip_image_f32_ptr mel_f32(clip_image_f32_init());
|
||||
mel_f32->nx = mel_spec.n_len;
|
||||
mel_f32->ny = mel_spec.n_mel;
|
||||
mel_f32->buf = std::move(mel_spec.data);
|
||||
size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());
|
||||
|
||||
clip_image_f32_batch batch_f32;
|
||||
batch_f32.is_audio = true;
|
||||
batch_f32.entries.push_back(std::move(mel_f32));
|
||||
|
||||
mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
|
||||
audio_tokens->n_tokens = n_tokens;
|
||||
audio_tokens->batch_f32 = std::move(batch_f32);
|
||||
audio_tokens->id = bitmap->id; // optional
|
||||
|
||||
LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
|
||||
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_AUDIO,
|
||||
{}, // text tokens
|
||||
nullptr, // image tokens
|
||||
std::move(audio_tokens),
|
||||
};
|
||||
cur.entries.emplace_back(std::move(chunk));
|
||||
}
|
||||
|
||||
if (!ctx->aud_end.empty()) {
|
||||
add_text(ctx->aud_end, true); // add audio end token
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
|
||||
std::vector<mtmd_input_chunk> chunks;
|
||||
|
||||
for (auto & entry : batch_f32.entries) {
|
||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
||||
image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
|
||||
image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, entry.get());
|
||||
image_tokens->ny = 1;
|
||||
image_tokens->batch_f32.entries.push_back(std::move(entry));
|
||||
image_tokens->id = id;
|
||||
@@ -382,222 +682,57 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
}
|
||||
|
||||
return chunks;
|
||||
};
|
||||
|
||||
for (const auto & part : parts) {
|
||||
// printf("tokenizing part: %s\n", part.c_str());
|
||||
bool add_bos = &parts.front() == ∂
|
||||
auto tokens = mtmd_tokenize_text_internal(vocab, part, text->add_special && add_bos, text->parse_special);
|
||||
if (tokens.empty()) {
|
||||
continue;
|
||||
}
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
std::move(tokens),
|
||||
nullptr, // image tokens
|
||||
nullptr, // audio tokens
|
||||
};
|
||||
output->entries.emplace_back(std::move(chunk));
|
||||
|
||||
// only add image/audio tokens to middle of 2 parts
|
||||
// therefore, we skip handling image/audio if this is the last part
|
||||
if (&parts.back() == &part) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!bitmaps[i_bm]->is_audio) {
|
||||
// handle image
|
||||
|
||||
if (i_bm >= n_bitmaps) {
|
||||
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!ctx->has_vision) {
|
||||
LOG_ERR("%s: error: model does not support vision input\n", __func__);
|
||||
return 2;
|
||||
}
|
||||
|
||||
// convert mtmd_bitmap to clip_image_u8
|
||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
||||
img_u8->nx = bitmaps[i_bm]->nx;
|
||||
img_u8->ny = bitmaps[i_bm]->ny;
|
||||
img_u8->buf.resize(bitmaps[i_bm]->data.size());
|
||||
std::memcpy(img_u8->buf.data(), bitmaps[i_bm]->data.data(), img_u8->nx * img_u8->ny * 3);
|
||||
|
||||
// preprocess image
|
||||
clip_image_f32_batch batch_f32;
|
||||
bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess image\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
// handle llava-uhd style preprocessing
|
||||
if (
|
||||
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|
||||
) {
|
||||
// split batch into chunks of single images
|
||||
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_bm]->id);
|
||||
GGML_ASSERT(chunks.size() > 0);
|
||||
|
||||
auto ov_chunk = std::move(chunks.front());
|
||||
chunks.erase(chunks.begin());
|
||||
|
||||
// add overview image (first)
|
||||
if (ctx->ov_img_first) {
|
||||
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_ov_img_start});
|
||||
}
|
||||
output->entries.emplace_back(std::move(ov_chunk));
|
||||
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_ov_img_end});
|
||||
}
|
||||
}
|
||||
|
||||
// add slices (or tiles)
|
||||
if (!chunks.empty()) {
|
||||
const int n_col = batch_f32.grid_x;
|
||||
const int n_row = batch_f32.grid_y;
|
||||
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_slices_start});
|
||||
}
|
||||
for (int y = 0; y < n_row; y++) {
|
||||
for (int x = 0; x < n_col; x++) {
|
||||
const bool is_last_in_row = (x == n_col - 1);
|
||||
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_sli_img_start});
|
||||
}
|
||||
output->entries.emplace_back(std::move(chunks[y * n_col + x]));
|
||||
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_sli_img_end});
|
||||
}
|
||||
if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_sli_img_mid});
|
||||
}
|
||||
}
|
||||
if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_row_end});
|
||||
}
|
||||
}
|
||||
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_slices_end});
|
||||
}
|
||||
}
|
||||
|
||||
// add overview image (last)
|
||||
if (!ctx->ov_img_first) {
|
||||
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_ov_img_start});
|
||||
}
|
||||
output->entries.emplace_back(std::move(ov_chunk));
|
||||
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_ov_img_end});
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
size_t n_tokens = 0;
|
||||
for (const auto & entry : batch_f32.entries) {
|
||||
n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
|
||||
}
|
||||
|
||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
||||
if (ctx->use_mrope) {
|
||||
// for Qwen2VL, we need this information for M-RoPE decoding positions
|
||||
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
|
||||
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
|
||||
image_tokens->use_mrope_pos = true;
|
||||
} else {
|
||||
// other models, we only need the total number of tokens
|
||||
image_tokens->nx = n_tokens;
|
||||
image_tokens->ny = 1;
|
||||
}
|
||||
image_tokens->batch_f32 = std::move(batch_f32);
|
||||
image_tokens->id = bitmaps[i_bm]->id; // optional
|
||||
|
||||
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
|
||||
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
|
||||
LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
|
||||
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
||||
{}, // text tokens
|
||||
std::move(image_tokens),
|
||||
nullptr, // audio tokens
|
||||
};
|
||||
output->entries.emplace_back(std::move(chunk));
|
||||
}
|
||||
|
||||
i_bm++; // move to next image
|
||||
continue;
|
||||
|
||||
} else {
|
||||
// handle audio
|
||||
|
||||
if (i_bm >= n_bitmaps) {
|
||||
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!ctx->has_audio) {
|
||||
LOG_ERR("%s: error: model does not support audio input\n", __func__);
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (bitmaps[i_bm]->data.size() == 0) {
|
||||
LOG_ERR("%s: error: empty audio data\n", __func__);
|
||||
return 2;
|
||||
}
|
||||
|
||||
// preprocess audio
|
||||
GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
|
||||
std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
|
||||
const float * samples = (const float *)bitmaps[i_bm]->data.data();
|
||||
size_t n_samples = bitmaps[i_bm]->data.size() / sizeof(float);
|
||||
bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess audio\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
// consider each mel_spec as a separate audio chunk
|
||||
// TODO: maybe support batching, but this may come with memory cost
|
||||
for (auto & mel_spec : mel_spec_chunks) {
|
||||
clip_image_f32_ptr mel_f32(clip_image_f32_init());
|
||||
mel_f32->nx = mel_spec.n_len;
|
||||
mel_f32->ny = mel_spec.n_mel;
|
||||
mel_f32->buf = std::move(mel_spec.data);
|
||||
size_t n_tokens = clip_n_output_tokens(ctx->ctx_clip, mel_f32.get());
|
||||
|
||||
clip_image_f32_batch batch_f32;
|
||||
batch_f32.is_audio = true;
|
||||
batch_f32.entries.push_back(std::move(mel_f32));
|
||||
|
||||
mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
|
||||
audio_tokens->n_tokens = n_tokens;
|
||||
audio_tokens->batch_f32 = std::move(batch_f32);
|
||||
audio_tokens->id = bitmaps[i_bm]->id; // optional
|
||||
|
||||
LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
|
||||
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_AUDIO,
|
||||
{}, // text tokens
|
||||
nullptr, // image tokens
|
||||
std::move(audio_tokens),
|
||||
};
|
||||
output->entries.emplace_back(std::move(chunk));
|
||||
}
|
||||
|
||||
i_bm++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
// for example: "a <__media__> b <__media__> c" --> "a", "<__media__>", "b", "<__media__>", "c"
|
||||
static std::vector<std::string> split_text(const std::string & input, const std::string & delimiter) {
|
||||
std::vector<std::string> result;
|
||||
if (input.empty()) {
|
||||
return result;
|
||||
}
|
||||
size_t start = 0;
|
||||
size_t pos = 0;
|
||||
while ((pos = input.find(delimiter, start)) != std::string::npos) {
|
||||
if (pos > start) {
|
||||
result.push_back(input.substr(start, pos - start));
|
||||
}
|
||||
result.push_back(delimiter);
|
||||
start = pos + delimiter.length();
|
||||
}
|
||||
if (start < input.length()) {
|
||||
result.push_back(input.substr(start));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// copied from common_tokenize
|
||||
static std::vector<llama_token> mtmd_tokenize_text_internal(
|
||||
const struct llama_vocab * vocab,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special) {
|
||||
// upper limit for the number of tokens
|
||||
int n_tokens = text.length() + 2 * add_special;
|
||||
std::vector<llama_token> result(n_tokens);
|
||||
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
mtmd_input_chunks * output,
|
||||
const mtmd_input_text * text,
|
||||
const mtmd_bitmap ** bitmaps,
|
||||
size_t n_bitmaps) {
|
||||
mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
|
||||
return tokenizer.tokenize(output);
|
||||
}
|
||||
|
||||
int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
|
||||
@@ -605,41 +740,54 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
|
||||
LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
|
||||
return 0;
|
||||
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
if (!ctx->ctx_v) {
|
||||
LOG_ERR("%s: model does not support vision input\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
return mtmd_encode(ctx, chunk->tokens_image.get());
|
||||
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
|
||||
if (!ctx->ctx_a) {
|
||||
LOG_ERR("%s: model does not support audio input\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
int n_mmproj_embd = ctx->n_embd_text;
|
||||
ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
|
||||
bool ok = clip_image_batch_encode(
|
||||
ctx->ctx_clip,
|
||||
ctx->ctx_a,
|
||||
ctx->n_threads,
|
||||
&chunk->tokens_audio->batch_f32,
|
||||
ctx->image_embd_v.data());
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
LOG_ERR("mtmd_encode_chunk: unknown chunk type %d\n", (int)chunk->type);
|
||||
LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
|
||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
|
||||
clip_ctx * ctx_clip = ctx->ctx_v;
|
||||
if (!ctx_clip) {
|
||||
LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
|
||||
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
|
||||
bool ok = false;
|
||||
|
||||
if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
|
||||
if (clip_is_llava(ctx_clip) || clip_is_minicpmv(ctx_clip) || clip_is_glm(ctx_clip)) {
|
||||
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
||||
const auto & entries = image_tokens->batch_f32.entries;
|
||||
for (size_t i = 0; i < entries.size(); i++) {
|
||||
int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get());
|
||||
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
|
||||
ok = clip_image_encode(
|
||||
ctx->ctx_clip,
|
||||
ctx_clip,
|
||||
ctx->n_threads,
|
||||
entries[i].get(),
|
||||
ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
|
||||
}
|
||||
} else {
|
||||
ok = clip_image_batch_encode(
|
||||
ctx->ctx_clip,
|
||||
ctx_clip,
|
||||
ctx->n_threads,
|
||||
&image_tokens->batch_f32,
|
||||
ctx->image_embd_v.data());
|
||||
@@ -653,8 +801,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
|
||||
}
|
||||
|
||||
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
|
||||
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
|
||||
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
@@ -665,11 +812,11 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) {
|
||||
}
|
||||
|
||||
bool mtmd_support_vision(mtmd_context * ctx) {
|
||||
return ctx->has_vision;
|
||||
return ctx->ctx_v != nullptr;
|
||||
}
|
||||
|
||||
bool mtmd_support_audio(mtmd_context * ctx) {
|
||||
return ctx->has_audio;
|
||||
return ctx->ctx_a != nullptr;
|
||||
}
|
||||
|
||||
// these 2 helpers below use internal clip_image_u8_ptr,
|
||||
|
||||
BIN
tools/mtmd/test-2.mp3
Normal file
BIN
tools/mtmd/test-2.mp3
Normal file
Binary file not shown.
@@ -25,80 +25,99 @@ RUN_HUGE_TESTS=false
|
||||
if [ "${1:-}" = "huge" ]; then
|
||||
RUN_HUGE_TESTS=true
|
||||
RUN_BIG_TESTS=true
|
||||
echo "Include BIG models..."
|
||||
echo "Include BIG and HUGE models..."
|
||||
fi
|
||||
|
||||
###############
|
||||
|
||||
arr_bin=()
|
||||
arr_prefix=()
|
||||
arr_hf=()
|
||||
arr_tmpl=() # chat template
|
||||
arr_file=()
|
||||
|
||||
add_test() {
|
||||
local bin=$1
|
||||
local hf=$2
|
||||
local tmpl=${3:-""} # default to empty string if not provided
|
||||
arr_bin+=("$bin")
|
||||
add_test_vision() {
|
||||
local hf=$1
|
||||
local tmpl=${2:-""} # default to empty string if not provided
|
||||
arr_prefix+=("[vision]")
|
||||
arr_hf+=("$hf")
|
||||
arr_tmpl+=("$tmpl")
|
||||
arr_file+=("test-1.jpeg")
|
||||
}
|
||||
|
||||
add_test "llama-mtmd-cli" "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
|
||||
add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
|
||||
add_test "llama-mtmd-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
|
||||
add_test "llama-mtmd-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" "vicuna"
|
||||
add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
|
||||
add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
|
||||
add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
|
||||
add_test "llama-mtmd-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/InternVL2_5-1B-GGUF:Q8_0"
|
||||
add_test "llama-mtmd-cli" "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
|
||||
add_test_audio() {
|
||||
local hf=$1
|
||||
arr_prefix+=("[audio] ")
|
||||
arr_hf+=("$hf")
|
||||
arr_tmpl+=("") # no need for chat tmpl
|
||||
arr_file+=("test-2.mp3")
|
||||
}
|
||||
|
||||
add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
|
||||
add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
|
||||
add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
|
||||
add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
|
||||
add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
|
||||
add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" "vicuna"
|
||||
add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
|
||||
add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
|
||||
add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
|
||||
add_test_vision "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
|
||||
add_test_vision "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/InternVL2_5-1B-GGUF:Q8_0"
|
||||
add_test_vision "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
|
||||
add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
|
||||
|
||||
add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
|
||||
add_test_audio "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
|
||||
|
||||
# to test the big models, run: ./tests.sh big
|
||||
if [ "$RUN_BIG_TESTS" = true ]; then
|
||||
add_test "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
|
||||
# add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
|
||||
add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
|
||||
add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
|
||||
# add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
|
||||
|
||||
add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
|
||||
add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
|
||||
fi
|
||||
|
||||
# to test the huge models, run: ./tests.sh huge
|
||||
# this will run both the big and huge models
|
||||
# huge models are > 32B parameters
|
||||
if [ "$RUN_HUGE_TESTS" = true ]; then
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF:IQ1_S"
|
||||
add_test_vision "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF:IQ1_S"
|
||||
fi
|
||||
|
||||
# these models always give the wrong answer, not sure why
|
||||
# add_test "llama-mtmd-cli" "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M"
|
||||
# add_test "llama-mtmd-cli" "ggml-org/SmolVLM-256M-Instruct-GGUF:Q8_0"
|
||||
# add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0"
|
||||
# add_test_vision "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M"
|
||||
# add_test_vision "ggml-org/SmolVLM-256M-Instruct-GGUF:Q8_0"
|
||||
# add_test_vision "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0"
|
||||
|
||||
# this model has broken chat template, not usable
|
||||
# add_test "llama-mtmd-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
|
||||
# add_test "llama-mtmd-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"
|
||||
# add_test_vision "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
|
||||
# add_test_vision "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"
|
||||
|
||||
###############
|
||||
|
||||
cmake --build build -j --target "${arr_bin[@]}"
|
||||
cmake --build build -j --target llama-mtmd-cli
|
||||
|
||||
arr_res=()
|
||||
|
||||
for i in "${!arr_bin[@]}"; do
|
||||
bin="${arr_bin[$i]}"
|
||||
for i in "${!arr_hf[@]}"; do
|
||||
bin="llama-mtmd-cli"
|
||||
prefix="${arr_prefix[$i]}"
|
||||
hf="${arr_hf[$i]}"
|
||||
tmpl="${arr_tmpl[$i]}"
|
||||
inp_file="${arr_file[$i]}"
|
||||
|
||||
echo "Running test with binary: $bin and HF model: $hf"
|
||||
echo ""
|
||||
@@ -107,7 +126,7 @@ for i in "${!arr_bin[@]}"; do
|
||||
output=$(\
|
||||
"$PROJ_ROOT/build/bin/$bin" \
|
||||
-hf "$hf" \
|
||||
--image $SCRIPT_DIR/test-1.jpeg \
|
||||
--image $SCRIPT_DIR/$inp_file \
|
||||
-p "what is the publisher name of the newspaper?" \
|
||||
--temp 0 -n 128 \
|
||||
${tmpl:+--chat-template "$tmpl"} \
|
||||
@@ -116,9 +135,9 @@ for i in "${!arr_bin[@]}"; do
|
||||
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
|
||||
|
||||
if echo "$output" | grep -iq "new york"; then
|
||||
result="\033[32mOK\033[0m: $bin $hf"
|
||||
result="$prefix \033[32mOK\033[0m: $bin $hf"
|
||||
else
|
||||
result="\033[31mFAIL\033[0m: $bin $hf"
|
||||
result="$prefix \033[31mFAIL\033[0m: $bin $hf"
|
||||
fi
|
||||
echo -e "$result"
|
||||
arr_res+=("$result")
|
||||
|
||||
Reference in New Issue
Block a user