mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-09 02:24:17 +00:00
Compare commits
67 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ad590be98c | ||
|
|
7d6d91babf | ||
|
|
d3e64b9f49 | ||
|
|
3ba0d843c6 | ||
|
|
0bf49eb668 | ||
|
|
4ad243677b | ||
|
|
c89c2d1ab9 | ||
|
|
3555b3004b | ||
|
|
d7da8dc83a | ||
|
|
cd355eda7d | ||
|
|
30e5b01de2 | ||
|
|
e54b394082 | ||
|
|
2c2caa4443 | ||
|
|
5fce5f948d | ||
|
|
9ae4143bc6 | ||
|
|
c311ac664d | ||
|
|
b9912ac570 | ||
|
|
00ba772610 | ||
|
|
3cb203c89f | ||
|
|
2e42be42bd | ||
|
|
fb85a288d7 | ||
|
|
40643edb86 | ||
|
|
3cfbbdb44e | ||
|
|
80709b70a2 | ||
|
|
26ff3685bf | ||
|
|
60c666347b | ||
|
|
b7cc7745e3 | ||
|
|
cc8d081879 | ||
|
|
d714dadb57 | ||
|
|
ffad043973 | ||
|
|
0889eba570 | ||
|
|
c61285e739 | ||
|
|
09cf2c7c65 | ||
|
|
c33fe8b8c4 | ||
|
|
ed52f3668e | ||
|
|
a681b4ba83 | ||
|
|
7d516443dd | ||
|
|
f6e1a7aa87 | ||
|
|
c3ee46fab4 | ||
|
|
e2c0b6e46a | ||
|
|
9596506965 | ||
|
|
a20b2b05bc | ||
|
|
2e89f76b7a | ||
|
|
532802f938 | ||
|
|
d4e0d95cf5 | ||
|
|
cc66a7f78f | ||
|
|
bd248d4dc7 | ||
|
|
7781e5fe99 | ||
|
|
89a184fa71 | ||
|
|
2baf07727f | ||
|
|
7ae2932116 | ||
|
|
1f7d50b293 | ||
|
|
4c763c8d1b | ||
|
|
dad5c44398 | ||
|
|
55f6b9fa65 | ||
|
|
3678b838bb | ||
|
|
652b70e667 | ||
|
|
3a12db23b6 | ||
|
|
ae92c1855b | ||
|
|
b7ce1ad1e3 | ||
|
|
97340b4c99 | ||
|
|
2bb0467043 | ||
|
|
b8e2194efc | ||
|
|
1a3b5e80f7 | ||
|
|
1f63e75f3b | ||
|
|
40cbf571c9 | ||
|
|
7f4fbe5183 |
@@ -49,19 +49,23 @@ COPY --from=build /app/full /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
&& pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
python3-venv && \
|
||||
python3 -m venv /opt/venv && \
|
||||
. /opt/venv/bin/activate && \
|
||||
pip install --upgrade pip setuptools wheel && \
|
||||
pip install -r requirements.txt && \
|
||||
apt autoremove -y && \
|
||||
apt clean -y && \
|
||||
rm -rf /tmp/* /var/tmp/* && \
|
||||
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
|
||||
find /var/cache -type f -delete
|
||||
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
ENTRYPOINT ["/app/tools.sh"]
|
||||
|
||||
|
||||
5
.github/workflows/build.yml
vendored
5
.github/workflows/build.yml
vendored
@@ -306,6 +306,7 @@ jobs:
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
export GGML_VK_VISIBLE_DEVICES=0
|
||||
# This is using llvmpipe and runs slower than other backends
|
||||
ctest -L main --verbose --timeout 3600
|
||||
|
||||
@@ -687,8 +688,8 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'cpu-x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
|
||||
- build: 'cpu-x64 (static)'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
|
||||
- build: 'openblas-x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'vulkan-x64'
|
||||
|
||||
@@ -89,6 +89,14 @@ option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
|
||||
|
||||
if (NOT DEFINED LLAMA_BUILD_NUMBER)
|
||||
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
|
||||
endif()
|
||||
if (NOT DEFINED LLAMA_BUILD_COMMIT)
|
||||
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
||||
endif()
|
||||
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
||||
|
||||
# override ggml options
|
||||
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
|
||||
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
|
||||
@@ -155,6 +163,8 @@ if (LLAMA_USE_SYSTEM_GGML)
|
||||
endif()
|
||||
|
||||
if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
|
||||
set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER})
|
||||
set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
|
||||
add_subdirectory(ggml)
|
||||
# ... otherwise assume ggml is added by a parent CMakeLists.txt
|
||||
endif()
|
||||
@@ -204,10 +214,6 @@ endif()
|
||||
include(GNUInstallDirs)
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
|
||||
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
||||
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
||||
|
||||
set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
|
||||
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
|
||||
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
[](https://github.com/ggml-org/llama.cpp/releases)
|
||||
[](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
|
||||
|
||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
|
||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
|
||||
|
||||
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
||||
|
||||
@@ -18,7 +18,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||
## Hot topics
|
||||
|
||||
- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
|
||||
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
|
||||
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
|
||||
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
|
||||
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
|
||||
|
||||
@@ -7,8 +7,8 @@ llama_add_compile_flags()
|
||||
# Build info header
|
||||
#
|
||||
|
||||
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
|
||||
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
|
||||
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
|
||||
set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
|
||||
|
||||
# Is git submodule
|
||||
if(NOT IS_DIRECTORY "${GIT_DIR}")
|
||||
@@ -18,36 +18,26 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
|
||||
if (SLASH_POS EQUAL 0)
|
||||
set(GIT_DIR "${REAL_GIT_DIR}")
|
||||
else()
|
||||
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
|
||||
set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(EXISTS "${GIT_DIR}/index")
|
||||
set(GIT_INDEX "${GIT_DIR}/index")
|
||||
# For build-info.cpp below
|
||||
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
|
||||
else()
|
||||
message(WARNING "Git index not found in git repository.")
|
||||
set(GIT_INDEX "")
|
||||
endif()
|
||||
else()
|
||||
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
|
||||
set(GIT_INDEX "")
|
||||
endif()
|
||||
|
||||
# Add a custom command to rebuild build-info.cpp when .git/index changes
|
||||
add_custom_command(
|
||||
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
|
||||
COMMENT "Generating build details from Git"
|
||||
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
||||
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
||||
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
|
||||
-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
|
||||
-P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
|
||||
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
||||
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
||||
VERBATIM
|
||||
)
|
||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
|
||||
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
|
||||
set(TARGET build_info)
|
||||
add_library(${TARGET} OBJECT build-info.cpp)
|
||||
add_library(${TARGET} OBJECT ${OUTPUT_FILE})
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
|
||||
@@ -988,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
||||
}
|
||||
|
||||
if (params.reranking && params.embedding) {
|
||||
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
|
||||
}
|
||||
|
||||
if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
|
||||
throw std::runtime_error(string_format(
|
||||
"error: the supplied chat template is not supported: %s%s\n",
|
||||
@@ -2747,9 +2743,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
|
||||
add_opt(common_arg(
|
||||
{"--reranking", "--rerank"},
|
||||
string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
|
||||
string_format("enable reranking endpoint on server (default: %s)", "disabled"),
|
||||
[](common_params & params) {
|
||||
params.reranking = true;
|
||||
params.embedding = true;
|
||||
params.pooling_type = LLAMA_POOLING_TYPE_RANK;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
|
||||
add_opt(common_arg(
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
|
||||
char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
|
||||
int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
|
||||
char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
|
||||
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
|
||||
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
|
||||
|
||||
@@ -49,6 +49,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
|
||||
|
||||
// LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
|
||||
result_.tool_calls.emplace_back(tool_call);
|
||||
|
||||
return true;
|
||||
}
|
||||
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
|
||||
@@ -378,3 +379,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
|
||||
/* .is_partial = */ found_healing_marker,
|
||||
};
|
||||
}
|
||||
|
||||
void common_chat_msg_parser::clear_tools() {
|
||||
result_.tool_calls.clear();
|
||||
}
|
||||
|
||||
@@ -115,4 +115,6 @@ class common_chat_msg_parser {
|
||||
const std::vector<std::vector<std::string>> & args_paths = {},
|
||||
const std::vector<std::vector<std::string>> & content_paths = {}
|
||||
);
|
||||
|
||||
void clear_tools();
|
||||
};
|
||||
|
||||
@@ -1921,7 +1921,9 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
|
||||
} catch (const common_chat_msg_partial_exception & ex) {
|
||||
LOG_DBG("Partial parse: %s\n", ex.what());
|
||||
if (!is_partial) {
|
||||
throw std::runtime_error(ex.what());
|
||||
builder.clear_tools();
|
||||
builder.move_to(0);
|
||||
common_chat_parse_content_only(builder);
|
||||
}
|
||||
}
|
||||
auto msg = builder.result();
|
||||
|
||||
@@ -1,24 +0,0 @@
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||
|
||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
|
||||
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
|
||||
|
||||
# Only write the build info if it changed
|
||||
if(EXISTS ${OUTPUT_FILE})
|
||||
file(READ ${OUTPUT_FILE} CONTENTS)
|
||||
string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_COMMIT ${CMAKE_MATCH_1})
|
||||
string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_COMPILER ${CMAKE_MATCH_1})
|
||||
string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_TARGET ${CMAKE_MATCH_1})
|
||||
if (
|
||||
NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
|
||||
NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
|
||||
NOT OLD_TARGET STREQUAL BUILD_TARGET
|
||||
)
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
endif()
|
||||
else()
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
endif()
|
||||
@@ -466,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_
|
||||
|
||||
std::string regex_escape(const std::string & s) {
|
||||
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
|
||||
return std::regex_replace(s, special_chars, "\\$0");
|
||||
return std::regex_replace(s, special_chars, "\\$&");
|
||||
}
|
||||
|
||||
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
|
||||
@@ -897,34 +897,6 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
if (params.reranking) {
|
||||
bool ok = true;
|
||||
|
||||
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
||||
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
|
||||
|
||||
if (!has_eos && !has_sep) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
} else if (!has_eos) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
|
||||
} else if (!has_sep) {
|
||||
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
llama_model_free(model);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
}
|
||||
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
|
||||
llama_context * lctx = llama_init_from_model(model, cparams);
|
||||
@@ -966,6 +938,35 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
}
|
||||
}
|
||||
|
||||
if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
|
||||
bool ok = true;
|
||||
|
||||
if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
||||
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
|
||||
|
||||
if (!has_eos && !has_sep) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
} else if (!has_eos) {
|
||||
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
|
||||
} else if (!has_sep) {
|
||||
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
|
||||
ok = false;
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
llama_free(lctx);
|
||||
llama_model_free(model);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
}
|
||||
|
||||
// load and optionally apply lora adapters
|
||||
for (auto & la : params.lora_adapters) {
|
||||
llama_adapter_lora_ptr lora;
|
||||
@@ -1143,11 +1144,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
||||
cparams.op_offload = !params.no_op_offload;
|
||||
cparams.swa_full = params.swa_full;
|
||||
|
||||
if (params.reranking) {
|
||||
cparams.embeddings = true;
|
||||
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
|
||||
}
|
||||
|
||||
cparams.type_k = params.cache_type_k;
|
||||
cparams.type_v = params.cache_type_v;
|
||||
|
||||
|
||||
@@ -355,7 +355,6 @@ struct common_params {
|
||||
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
||||
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
||||
std::string embd_sep = "\n"; // separator of embeddings
|
||||
bool reranking = false; // enable reranking support on server
|
||||
|
||||
// server params
|
||||
int32_t port = 8080; // server listens on this network port
|
||||
|
||||
@@ -519,7 +519,7 @@ class TextModel(ModelBase):
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_block_count(self.block_count)
|
||||
|
||||
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
|
||||
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
|
||||
self.gguf_writer.add_context_length(n_ctx)
|
||||
logger.info(f"gguf: context length = {n_ctx}")
|
||||
|
||||
@@ -556,8 +556,11 @@ class TextModel(ModelBase):
|
||||
logger.info(f"gguf: experts used count = {n_experts_used}")
|
||||
|
||||
if (head_dim := self.hparams.get("head_dim")) is not None:
|
||||
self.gguf_writer.add_key_length(head_dim)
|
||||
self.gguf_writer.add_value_length(head_dim)
|
||||
# Workaround for incorrect AutoConfig value for DeepSeekV3 (is set correctly in DeepSeekV2Model class)
|
||||
# https://github.com/huggingface/transformers/blob/19224c3642705c5b6988c9f5f4251f83323d05ae/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py#L210
|
||||
if self.hparams.get("model_type") != "deepseek_v3":
|
||||
self.gguf_writer.add_key_length(head_dim)
|
||||
self.gguf_writer.add_value_length(head_dim)
|
||||
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
logger.info(f"gguf: file type = {self.ftype}")
|
||||
@@ -2017,6 +2020,20 @@ class LlamaModel(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("ArceeForCausalLM")
|
||||
class ArceeModel(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.ARCEE
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self._try_set_pooling_type()
|
||||
rope_scaling = self.hparams.get("rope_scaling") or {}
|
||||
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
|
||||
|
||||
|
||||
@ModelBase.register(
|
||||
"LlavaForConditionalGeneration", # pixtral
|
||||
"Mistral3ForConditionalGeneration", # mistral small 3.1
|
||||
@@ -4059,6 +4076,34 @@ class NomicBertModel(BertModel):
|
||||
raise ValueError(f"unknown tokenizer: {toktyp}")
|
||||
|
||||
|
||||
@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification")
|
||||
class NeoBert(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.NEO_BERT
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# NeoBERT uses 2/3 of the intermediate size as feed forward length
|
||||
self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3))
|
||||
self.gguf_writer.add_rope_freq_base(10000.0) # default value for NeoBERT
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
|
||||
f_rms_eps = self.hparams.get("norm_eps", 1e-6) # default value for NeoBERT
|
||||
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
|
||||
logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
|
||||
|
||||
self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use
|
||||
|
||||
def modify_tensors(self, data_torch, name, bid):
|
||||
if name.startswith("decoder."):
|
||||
return []
|
||||
|
||||
if name.startswith("model."):
|
||||
name = name[6:]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
|
||||
class XLMRobertaModel(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.BERT
|
||||
@@ -4798,25 +4843,6 @@ class OlmoeModel(TextModel):
|
||||
class JinaBertV2Model(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.intermediate_size = self.hparams["intermediate_size"]
|
||||
|
||||
def get_tensors(self):
|
||||
for name, data in super().get_tensors():
|
||||
if 'gated_layer' in name:
|
||||
d1 = data[:self.intermediate_size, :]
|
||||
name1 = name.replace('gated_layers', 'gated_layers_w')
|
||||
name1 = name1.replace('up_gated_layer', 'gated_layers_v')
|
||||
d2 = data[self.intermediate_size:, :]
|
||||
name2 = name.replace('gated_layers', 'gated_layers_v')
|
||||
name2 = name2.replace('up_gated_layer', 'gated_layers_w')
|
||||
yield name1, d1
|
||||
yield name2, d2
|
||||
continue
|
||||
|
||||
yield name, data
|
||||
|
||||
def set_vocab(self):
|
||||
tokenizer_class = 'BertTokenizer'
|
||||
with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
@@ -4832,14 +4858,6 @@ class JinaBertV2Model(BertModel):
|
||||
self.gguf_writer.add_add_bos_token(True)
|
||||
self.gguf_writer.add_add_eos_token(True)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# if name starts with "bert.", remove the prefix
|
||||
# e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
||||
if name.startswith("bert."):
|
||||
name = name[5:]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("OpenELMForCausalLM")
|
||||
class OpenELMModel(TextModel):
|
||||
@@ -5286,6 +5304,34 @@ class DeepseekV2Model(TextModel):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("Dots1ForCausalLM")
|
||||
class Dots1Model(Qwen2MoeModel):
|
||||
model_arch = gguf.MODEL_ARCH.DOTS1
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.hparams["num_experts"] = self.hparams["n_routed_experts"]
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
|
||||
self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
|
||||
self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
|
||||
self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
|
||||
|
||||
if self.hparams["scoring_func"] == "noaux_tc":
|
||||
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
|
||||
else:
|
||||
raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
||||
if name.endswith("e_score_correction_bias"):
|
||||
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
|
||||
if "shared_experts" in name:
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("PLMForCausalLM")
|
||||
class PLMModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.PLM
|
||||
|
||||
@@ -11,7 +11,7 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll
|
||||
- Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
|
||||
- Functionary v3.1 / v3.2
|
||||
- Hermes 2/3, Qwen 2.5
|
||||
- Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
|
||||
- Qwen 2.5 Coder
|
||||
- Mistral Nemo
|
||||
- Firefunction v2
|
||||
- Command R7B
|
||||
|
||||
@@ -107,3 +107,7 @@ NOTE: some models may require large context window, for example: `-c 8192`
|
||||
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
|
||||
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
|
||||
```
|
||||
|
||||
## Finding more models:
|
||||
|
||||
GGUF models on Huggingface with vision capabilities can be found here: https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending&search=gguf
|
||||
|
||||
@@ -41,12 +41,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
||||
|
||||
// add input to batch (this increments n_tokens)
|
||||
for (int32_t j = 0; j < n_toks; j++) {
|
||||
common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
|
||||
common_batch_add(batch, inputs[j], j, { 0 }, true);
|
||||
}
|
||||
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
llama_memory_clear(llama_get_memory(ctx), true);
|
||||
llama_set_embeddings(ctx, true);
|
||||
llama_set_causal_attn(ctx, false);
|
||||
|
||||
// run model
|
||||
@@ -103,7 +102,6 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
|
||||
llama_token eos_token = llama_vocab_eos(vocab);
|
||||
|
||||
llama_memory_clear(llama_get_memory(ctx), true);
|
||||
llama_set_embeddings(ctx, false);
|
||||
llama_set_causal_attn(ctx, true);
|
||||
|
||||
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
||||
@@ -166,6 +164,8 @@ int main(int argc, char * argv[]) {
|
||||
llama_model_params mparams = common_model_params_to_llama(params);
|
||||
llama_context_params cparams = common_context_params_to_llama(params);
|
||||
|
||||
cparams.embeddings = true;
|
||||
|
||||
llama_backend_init();
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
|
||||
@@ -213,6 +213,8 @@ int main(int argc, char * argv[]) {
|
||||
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
|
||||
}
|
||||
|
||||
llama_set_embeddings(ctx, false);
|
||||
|
||||
// ### Generation ###
|
||||
// GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
|
||||
{
|
||||
|
||||
@@ -172,6 +172,7 @@ option(GGML_HIP "ggml: use HIP"
|
||||
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
|
||||
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
||||
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
||||
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
|
||||
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
||||
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
||||
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
||||
|
||||
@@ -36,8 +36,7 @@ function(ggml_get_system_arch)
|
||||
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
||||
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
|
||||
set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
|
||||
elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR
|
||||
"${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|power")
|
||||
set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
||||
set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)
|
||||
|
||||
@@ -212,6 +212,7 @@ endif()
|
||||
|
||||
add_library(ggml
|
||||
ggml-backend-reg.cpp)
|
||||
add_library(ggml::ggml ALIAS ggml)
|
||||
|
||||
target_link_libraries(ggml PUBLIC ggml-base)
|
||||
|
||||
@@ -269,17 +270,23 @@ endfunction()
|
||||
function(ggml_add_cpu_backend_variant tag_name)
|
||||
set(GGML_CPU_TAG_NAME ${tag_name})
|
||||
# other: OPENMP LLAMAFILE CPU_HBM
|
||||
foreach (feat NATIVE
|
||||
SSE42
|
||||
AVX AVX2 BMI2 AVX_VNNI FMA F16C
|
||||
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
|
||||
AMX_TILE AMX_INT8 AMX_BF16)
|
||||
set(GGML_${feat} OFF)
|
||||
endforeach()
|
||||
if (GGML_SYSTEM_ARCH STREQUAL "x86")
|
||||
foreach (feat NATIVE
|
||||
SSE42
|
||||
AVX AVX2 BMI2 AVX_VNNI FMA F16C
|
||||
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
|
||||
AMX_TILE AMX_INT8 AMX_BF16)
|
||||
set(GGML_${feat} OFF)
|
||||
endforeach()
|
||||
|
||||
foreach (feat ${ARGN})
|
||||
set(GGML_${feat} ON)
|
||||
endforeach()
|
||||
foreach (feat ${ARGN})
|
||||
set(GGML_${feat} ON)
|
||||
endforeach()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "ARM")
|
||||
foreach (feat ${ARGN})
|
||||
set(GGML_INTERNAL_${feat} ON)
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
ggml_add_cpu_backend_variant_impl(${tag_name})
|
||||
endfunction()
|
||||
@@ -289,6 +296,8 @@ ggml_add_backend(CPU)
|
||||
if (GGML_CPU_ALL_VARIANTS)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
|
||||
elseif (GGML_CPU_ARM_ARCH)
|
||||
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
|
||||
endif()
|
||||
if (GGML_SYSTEM_ARCH STREQUAL "x86")
|
||||
ggml_add_cpu_backend_variant(x64)
|
||||
@@ -302,8 +311,30 @@ if (GGML_CPU_ALL_VARIANTS)
|
||||
# MSVC doesn't support AMX
|
||||
ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
|
||||
endif()
|
||||
elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
# Many of these features are optional so we build versions with popular
|
||||
# combinations and name the backends based on the version they were
|
||||
# first released with
|
||||
ggml_add_cpu_backend_variant(armv8.0_1)
|
||||
ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD)
|
||||
ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
|
||||
ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE)
|
||||
ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
|
||||
ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
|
||||
ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
|
||||
ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
|
||||
elseif (CMAKE_SYSTEM_NAME MATCHES "Android")
|
||||
# Android-specific backends with SoC-compatible feature sets
|
||||
ggml_add_cpu_backend_variant(android_armv8.0_1)
|
||||
ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
|
||||
ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
|
||||
ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
|
||||
else()
|
||||
message(FATAL_ERROR "Unsupported ARM target OS: ${CMAKE_SYSTEM_NAME}")
|
||||
endif()
|
||||
else()
|
||||
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}")
|
||||
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
|
||||
endif()
|
||||
elseif (GGML_CPU)
|
||||
ggml_add_cpu_backend_variant_impl("")
|
||||
|
||||
@@ -1,3 +1,17 @@
|
||||
function(ggml_add_cpu_backend_features cpu_name arch)
|
||||
# The feature detection code is compiled as a separate target so that
|
||||
# it can be built without the architecture flags
|
||||
# Since multiple variants of the CPU backend may be included in the same
|
||||
# build, using set_source_files_properties() to set the arch flags is not possible
|
||||
set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
|
||||
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
|
||||
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
|
||||
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
|
||||
endfunction()
|
||||
|
||||
function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
if (tag_name)
|
||||
set(GGML_CPU_NAME ggml-cpu-${tag_name})
|
||||
@@ -143,6 +157,46 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
else()
|
||||
if (GGML_CPU_ARM_ARCH)
|
||||
list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
|
||||
elseif(GGML_CPU_ALL_VARIANTS)
|
||||
# Begin with the lowest baseline
|
||||
set(ARM_MCPU "armv8-a")
|
||||
set(ARCH_TAGS "")
|
||||
set(ARCH_DEFINITIONS "")
|
||||
|
||||
# When a feature is selected, bump the MCPU to the first
|
||||
# version that supported it
|
||||
if (GGML_INTERNAL_DOTPROD)
|
||||
set(ARM_MCPU "armv8.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+dotprod")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD)
|
||||
endif()
|
||||
if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC)
|
||||
set(ARM_MCPU "armv8.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+fp16")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC)
|
||||
endif()
|
||||
if (GGML_INTERNAL_SVE)
|
||||
set(ARM_MCPU "armv8.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+sve")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE)
|
||||
endif()
|
||||
if (GGML_INTERNAL_MATMUL_INT8)
|
||||
set(ARM_MCPU "armv8.6-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+i8mm")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8)
|
||||
endif()
|
||||
if (GGML_INTERNAL_SVE2)
|
||||
set(ARM_MCPU "armv8.6-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+sve2")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2)
|
||||
endif()
|
||||
if (GGML_INTERNAL_SME)
|
||||
set(ARM_MCPU "armv9.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+sme")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_SME)
|
||||
endif()
|
||||
list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}")
|
||||
ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -306,18 +360,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
|
||||
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
|
||||
endif()
|
||||
|
||||
# The feature detection code is compiled as a separate target so that
|
||||
# it can be built without the architecture flags
|
||||
# Since multiple variants of the CPU backend may be included in the same
|
||||
# build, using set_source_files_properties() to set the arch flags is not possible
|
||||
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
|
||||
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/x86/cpu-feats.cpp)
|
||||
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
||||
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
|
||||
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
|
||||
ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS})
|
||||
endif()
|
||||
elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
|
||||
message(STATUS "PowerPC detected")
|
||||
|
||||
88
ggml/src/ggml-cpu/apple-fallback.h
Normal file
88
ggml/src/ggml-cpu/apple-fallback.h
Normal file
@@ -0,0 +1,88 @@
|
||||
#pragma once
|
||||
|
||||
// Solve alias issue for Apple targets (currently PowerPC, x86, and ARM64).
|
||||
// Mach-O has a weak alias equivalent but no practical compiler support can
|
||||
// be found, so we need to do it manually.
|
||||
// ref: https://stackoverflow.com/questions/42757744
|
||||
//
|
||||
// This file is a complement to native implementations in the `arch` folder.
|
||||
// A kernel in quants.c or repack.cpp is either:
|
||||
// - implemented in the `arch` folder, or
|
||||
// - defined in this file to remove the `_generic` suffix
|
||||
|
||||
#if defined(GGML_CPU_GENERIC)
|
||||
// quants.c
|
||||
#define quantize_row_q8_0_generic quantize_row_q8_0
|
||||
#define quantize_row_q8_1_generic quantize_row_q8_1
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_q4_0_q8_0_generic ggml_vec_dot_q4_0_q8_0
|
||||
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
|
||||
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
|
||||
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
|
||||
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
||||
#define ggml_vec_dot_q3_K_q8_K_generic ggml_vec_dot_q3_K_q8_K
|
||||
#define ggml_vec_dot_q4_K_q8_K_generic ggml_vec_dot_q4_K_q8_K
|
||||
#define ggml_vec_dot_q5_K_q8_K_generic ggml_vec_dot_q5_K_q8_K
|
||||
#define ggml_vec_dot_q6_K_q8_K_generic ggml_vec_dot_q6_K_q8_K
|
||||
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
|
||||
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#elif defined(__aarch64__) || defined(__arm__)
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#elif defined(__x86_64__) || defined(__i386__)
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#elif defined(__POWERPC__)
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
|
||||
// quants.c
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
// repack.cpp
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#endif
|
||||
94
ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp
Normal file
94
ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp
Normal file
@@ -0,0 +1,94 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#if defined(__aarch64__)
|
||||
|
||||
#if defined(__linux__)
|
||||
#include <sys/auxv.h>
|
||||
#elif defined(__APPLE__)
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
|
||||
#if !defined(HWCAP2_I8MM)
|
||||
#define HWCAP2_I8MM (1 << 13)
|
||||
#endif
|
||||
|
||||
#if !defined(HWCAP2_SME)
|
||||
#define HWCAP2_SME (1 << 23)
|
||||
#endif
|
||||
|
||||
struct aarch64_features {
|
||||
// has_neon not needed, aarch64 has NEON guaranteed
|
||||
bool has_dotprod = false;
|
||||
bool has_fp16_va = false;
|
||||
bool has_sve = false;
|
||||
bool has_sve2 = false;
|
||||
bool has_i8mm = false;
|
||||
bool has_sme = false;
|
||||
|
||||
aarch64_features() {
|
||||
#if defined(__linux__)
|
||||
uint32_t hwcap = getauxval(AT_HWCAP);
|
||||
uint32_t hwcap2 = getauxval(AT_HWCAP2);
|
||||
|
||||
has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
|
||||
has_fp16_va = !!(hwcap & HWCAP_FPHP);
|
||||
has_sve = !!(hwcap & HWCAP_SVE);
|
||||
has_sve2 = !!(hwcap2 & HWCAP2_SVE2);
|
||||
has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
|
||||
has_sme = !!(hwcap2 & HWCAP2_SME);
|
||||
#elif defined(__APPLE__)
|
||||
int oldp = 0;
|
||||
size_t size = sizeof(oldp);
|
||||
|
||||
if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) {
|
||||
has_dotprod = static_cast<bool>(oldp);
|
||||
}
|
||||
|
||||
if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) {
|
||||
has_i8mm = static_cast<bool>(oldp);
|
||||
}
|
||||
|
||||
if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) {
|
||||
has_sme = static_cast<bool>(oldp);
|
||||
}
|
||||
|
||||
// Apple apparently does not implement SVE yet
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
static int ggml_backend_cpu_aarch64_score() {
|
||||
int score = 1;
|
||||
aarch64_features af;
|
||||
|
||||
#ifdef GGML_USE_DOTPROD
|
||||
if (!af.has_dotprod) { return 0; }
|
||||
score += 1<<1;
|
||||
#endif
|
||||
#ifdef GGML_USE_FP16_VECTOR_ARITHMETIC
|
||||
if (!af.has_fp16_va) { return 0; }
|
||||
score += 1<<2;
|
||||
#endif
|
||||
#ifdef GGML_USE_SVE
|
||||
if (!af.has_sve) { return 0; }
|
||||
score += 1<<3;
|
||||
#endif
|
||||
#ifdef GGML_USE_MATMUL_INT8
|
||||
if (!af.has_i8mm) { return 0; }
|
||||
score += 1<<4;
|
||||
#endif
|
||||
#ifdef GGML_USE_SVE2
|
||||
if (!af.has_sve2) { return 0; }
|
||||
score += 1<<5;
|
||||
#endif
|
||||
#ifdef GGML_USE_SME
|
||||
if (!af.has_sme) { return 0; }
|
||||
score += 1<<6;
|
||||
#endif
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_aarch64_score)
|
||||
|
||||
# endif // defined(__aarch64__)
|
||||
@@ -509,7 +509,7 @@ void ggml_barrier(struct ggml_threadpool * tp);
|
||||
|
||||
#define GGML_DO_PRAGMA_(x) _Pragma (#x)
|
||||
#define GGML_DO_PRAGMA(x) GGML_DO_PRAGMA_(x)
|
||||
#if defined(GGML_CPU_GENERIC) || defined(__HIPCC__)
|
||||
#if defined(GGML_CPU_GENERIC) || defined(__HIPCC__) || defined(__APPLE__)
|
||||
// Note for Apple targets:
|
||||
// - clang: aliases are not supported on darwin
|
||||
// - all native kernels need to be implemented in both x86 and arm files
|
||||
@@ -518,11 +518,14 @@ void ggml_barrier(struct ggml_threadpool * tp);
|
||||
#elif defined(__GNUC__)
|
||||
// GCC/Clang on *nix
|
||||
# define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(weak name = alias) // NOLINT
|
||||
#elif defined(_MSC_VER) && defined (_WIN64)
|
||||
#elif defined(_MSC_VER) && defined(_WIN64)
|
||||
// MSVC
|
||||
// Note: C name mangling varies across different calling conventions
|
||||
// see https://learn.microsoft.com/en-us/cpp/build/reference/decorated-names?view=msvc-170
|
||||
# define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(comment(linker, "/alternatename:" #name "=" #alias))
|
||||
#elif defined(_MSC_VER) && defined(WIN32)
|
||||
// ref: https://github.com/ggml-org/whisper.cpp/pull/3239#issuecomment-2958224591
|
||||
# define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(comment(linker, "/alternatename:_" #name "=_" #alias))
|
||||
#else
|
||||
# error "Unsupported compiler for GGML_WEAK_ALIAS"
|
||||
#endif
|
||||
|
||||
@@ -5,6 +5,10 @@
|
||||
#include "ggml-quants.h"
|
||||
#include "quants.h"
|
||||
|
||||
#if defined(__APPLE__)
|
||||
#include "apple-fallback.h"
|
||||
#endif
|
||||
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <float.h>
|
||||
|
||||
@@ -84,33 +84,6 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
#if defined(GGML_CPU_GENERIC)
|
||||
#define quantize_row_q8_0_generic quantize_row_q8_0
|
||||
#define quantize_row_q8_1_generic quantize_row_q8_1
|
||||
#define quantize_row_q8_K_generic quantize_row_q8_K
|
||||
#define ggml_vec_dot_q4_0_q8_0_generic ggml_vec_dot_q4_0_q8_0
|
||||
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
|
||||
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
|
||||
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
|
||||
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
|
||||
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
|
||||
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
|
||||
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
|
||||
#define ggml_vec_dot_q3_K_q8_K_generic ggml_vec_dot_q3_K_q8_K
|
||||
#define ggml_vec_dot_q4_K_q8_K_generic ggml_vec_dot_q4_K_q8_K
|
||||
#define ggml_vec_dot_q5_K_q8_K_generic ggml_vec_dot_q5_K_q8_K
|
||||
#define ggml_vec_dot_q6_K_q8_K_generic ggml_vec_dot_q6_K_q8_K
|
||||
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
|
||||
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
|
||||
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
|
||||
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
|
||||
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
|
||||
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
|
||||
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
|
||||
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
|
||||
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -8,6 +8,10 @@
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "traits.h"
|
||||
|
||||
#if defined(__APPLE__)
|
||||
#include "apple-fallback.h"
|
||||
#endif
|
||||
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
|
||||
@@ -67,7 +67,7 @@ extern "C" {
|
||||
// Workaround for clang:
|
||||
// clang++ complains: ``error: call to 'ggml_gemm_q4_0_4x4_q8_0' is ambiguous''
|
||||
// repro: https://godbolt.org/z/oKdeWKonM (ICE), https://godbolt.org/z/1szq6P36v (ambiguous call)
|
||||
#if defined(GGML_CPU_CLANG_WORKAROUND) || !(defined(__GNUC__) && defined(__clang__)) || defined(__HIPCC__)
|
||||
#if defined(GGML_CPU_CLANG_WORKAROUND) || defined(__APPLE__) || !(defined(__GNUC__) && defined(__clang__)) || defined(__HIPCC__)
|
||||
void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
|
||||
@@ -98,22 +98,6 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
#if defined(GGML_CPU_GENERIC)
|
||||
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
|
||||
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
|
||||
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
|
||||
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
|
||||
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
|
||||
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
|
||||
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
|
||||
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
|
||||
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
|
||||
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
|
||||
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
|
||||
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus)
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
@@ -207,9 +207,9 @@ typedef float2 dfloat2;
|
||||
#define FP16_MMA_AVAILABLE
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
|
||||
#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
|
||||
#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
|
||||
#define FP16_MMA_AVAILABLE
|
||||
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
|
||||
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
#define NEW_MMA_AVAILABLE
|
||||
@@ -262,11 +262,11 @@ static bool cp_async_available(const int cc) {
|
||||
}
|
||||
|
||||
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
return __AMDGCN_WAVEFRONT_SIZE;
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
|
||||
return 64;
|
||||
#else
|
||||
return 32;
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
|
||||
}
|
||||
|
||||
[[noreturn]]
|
||||
|
||||
@@ -10,6 +10,8 @@ __global__ void __launch_bounds__(splitD, 2)
|
||||
float * __restrict__ dst, const int64_t L) {
|
||||
GGML_UNUSED(src1_nb0);
|
||||
GGML_UNUSED(src2_nb0);
|
||||
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
const int bidx = blockIdx.x; // split along B
|
||||
const int bidy = blockIdx.y; // split along D
|
||||
const int tid = threadIdx.x;
|
||||
@@ -44,16 +46,16 @@ __global__ void __launch_bounds__(splitD, 2)
|
||||
if (N == 16) {
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < splitD / 4; i += 2) {
|
||||
float value = A_block[(wid * warpSize + i) * stride_A + wtid];
|
||||
float value = A_block[(wid * warp_size + i) * stride_A + wtid];
|
||||
// todo: bank conflict
|
||||
// I am always confused with how to use the swizzling method to solve
|
||||
// bank conflit. Hoping somebody can tell me.
|
||||
smem_A[(wid * warpSize + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
||||
smem_A[(wid * warp_size + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
||||
}
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < splitD / 4; i += 2) {
|
||||
float value = s0_block[(wid * warpSize + i) * stride_s0 + wtid];
|
||||
smem_s0[(wid * warpSize + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
||||
float value = s0_block[(wid * warp_size + i) * stride_s0 + wtid];
|
||||
smem_s0[(wid * warp_size + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -113,6 +113,10 @@ if (GGML_HIP_ROCWMMA_FATTN)
|
||||
add_compile_definitions(GGML_HIP_ROCWMMA_FATTN)
|
||||
endif()
|
||||
|
||||
if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 OR ${hip_VERSION} VERSION_GREATER_EQUAL 7.0)
|
||||
add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12)
|
||||
endif()
|
||||
|
||||
if (NOT GGML_CUDA_FA)
|
||||
add_compile_definitions(GGML_CUDA_NO_FA)
|
||||
endif()
|
||||
|
||||
@@ -44,21 +44,22 @@ if (GGML_METAL_EMBED_LIBRARY)
|
||||
set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${METALLIB_EMBED_ASM}
|
||||
OUTPUT "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo "Embedding Metal library"
|
||||
COMMAND sed -e '/__embed_ggml-common.h__/r ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED_TMP}
|
||||
COMMAND sed -e '/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}' -e '/\#include \"ggml-metal-impl.h\"/d' < ${METALLIB_SOURCE_EMBED_TMP} > ${METALLIB_SOURCE_EMBED}
|
||||
COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM}
|
||||
COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM}
|
||||
COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM}
|
||||
COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
|
||||
COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM}
|
||||
COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM}
|
||||
COMMAND sed -e "/__embed_ggml-common.h__/r ${METALLIB_COMMON}" -e "/__embed_ggml-common.h__/d" < "${METALLIB_SOURCE}" > "${METALLIB_SOURCE_EMBED_TMP}"
|
||||
COMMAND sed -e "/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}" -e "/\#include \"ggml-metal-impl.h\"/d" < "${METALLIB_SOURCE_EMBED_TMP}" > "${METALLIB_SOURCE_EMBED}"
|
||||
COMMAND echo ".section __DATA,__ggml_metallib" > "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo ".globl _ggml_metallib_start" >> "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo "_ggml_metallib_start:" >> "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo .incbin "\"${METALLIB_SOURCE_EMBED}\"" >> "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo ".globl _ggml_metallib_end" >> "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo "_ggml_metallib_end:" >> "${METALLIB_EMBED_ASM}"
|
||||
DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h
|
||||
COMMENT "Generate assembly for embedded Metal library"
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
target_sources(ggml-metal PRIVATE ${METALLIB_EMBED_ASM})
|
||||
target_sources(ggml-metal PRIVATE "${METALLIB_EMBED_ASM}")
|
||||
else()
|
||||
if (GGML_METAL_SHADER_DEBUG)
|
||||
# custom command to do the following:
|
||||
|
||||
@@ -3333,8 +3333,6 @@ kernel void kernel_flash_attn_ext(
|
||||
|
||||
threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*DK); // holds the query data
|
||||
threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*DK); // same as above but in q4_t
|
||||
threadgroup o_t * so = (threadgroup o_t *) (shmem_f16 + 0*DK); // reuse query data for accumulation
|
||||
threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 0*DK); // same as above but in o4_t
|
||||
threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + 2*sgitg*SH + 2*Q*DK); // scratch buffer for attention, mask and diagonal matrix
|
||||
|
||||
threadgroup k_t * sk = (threadgroup k_t *) (shmem_f16 + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K in shared memory
|
||||
@@ -3548,20 +3546,20 @@ kernel void kernel_flash_attn_ext(
|
||||
|
||||
// O = diag(ms)*O
|
||||
{
|
||||
s8x8_t mm;
|
||||
simdgroup_load(mm, ss + 2*C, TS, 0, false);
|
||||
s8x8_t ms;
|
||||
simdgroup_load(ms, ss + 2*C, TS, 0, false);
|
||||
|
||||
#pragma unroll(DV8)
|
||||
for (short i = 0; i < DV8; ++i) {
|
||||
simdgroup_multiply(lo[i], mm, lo[i]);
|
||||
simdgroup_multiply(lo[i], ms, lo[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// O = O + (Q*K^T)*V
|
||||
{
|
||||
for (short cc = 0; cc < C/8; ++cc) {
|
||||
s8x8_t ms;
|
||||
simdgroup_load(ms, ss + 8*cc, TS, 0, false);
|
||||
s8x8_t vs;
|
||||
simdgroup_load(vs, ss + 8*cc, TS, 0, false);
|
||||
|
||||
if (is_same<vd4x4_t, v4x4_t>::value) {
|
||||
// we can read directly from global memory
|
||||
@@ -3572,7 +3570,7 @@ kernel void kernel_flash_attn_ext(
|
||||
v8x8_t mv;
|
||||
simdgroup_load(mv, pv + i*8, args.nb21/sizeof(v_t), 0, false); // TODO: use ne20
|
||||
|
||||
simdgroup_multiply_accumulate(lo[i], ms, mv, lo[i]);
|
||||
simdgroup_multiply_accumulate(lo[i], vs, mv, lo[i]);
|
||||
}
|
||||
} else {
|
||||
for (short ii = 0; ii < DV16; ii += 4) {
|
||||
@@ -3593,10 +3591,10 @@ kernel void kernel_flash_attn_ext(
|
||||
v8x8_t mv;
|
||||
|
||||
simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], vs, mv, lo[2*(ii + k) + 0]);
|
||||
|
||||
simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false);
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], vs, mv, lo[2*(ii + k) + 1]);
|
||||
}
|
||||
} else {
|
||||
if (ii + tx < DV16) {
|
||||
@@ -3611,10 +3609,10 @@ kernel void kernel_flash_attn_ext(
|
||||
v8x8_t mv;
|
||||
|
||||
simdgroup_load(mv, sv + 16*k + 0*8, 4*16, 0, false);
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], ms, mv, lo[2*(ii + k) + 0]);
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 0], vs, mv, lo[2*(ii + k) + 0]);
|
||||
|
||||
simdgroup_load(mv, sv + 16*k + 1*8, 4*16, 0, false);
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], ms, mv, lo[2*(ii + k) + 1]);
|
||||
simdgroup_multiply_accumulate(lo[2*(ii + k) + 1], vs, mv, lo[2*(ii + k) + 1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3624,83 +3622,80 @@ kernel void kernel_flash_attn_ext(
|
||||
}
|
||||
|
||||
// these are needed for reducing the results from the simdgroups (reuse the ss buffer)
|
||||
for (short j = 0; j < Q; ++j) {
|
||||
if (tiisg == 0) {
|
||||
ss[j*TS + 0] = S[j];
|
||||
ss[j*TS + 1] = M[j];
|
||||
}
|
||||
for (short j = tiisg; j < Q; j += NW) {
|
||||
ss[j*TS + 0] = S[j];
|
||||
ss[j*TS + 1] = M[j];
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
threadgroup float * so = (threadgroup float *) (shmem_f16 + 0*DK); // reuse query data for accumulation
|
||||
threadgroup float4 * so4 = (threadgroup float4 *) (shmem_f16 + 0*DK);
|
||||
|
||||
// store result to shared memory in F32
|
||||
if (sgitg == 0) {
|
||||
for (short i = 0; i < DV8; ++i) {
|
||||
//simdgroup_store(lo[i], so + i*8, DV, 0, false);
|
||||
simdgroup_float8x8 t(1.0f);
|
||||
simdgroup_multiply(t, lo[i], t);
|
||||
simdgroup_store(t, so + i*8, DV, 0, false);
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// reduce the warps sequentially
|
||||
for (ushort sg = 1; sg < nsg; ++sg) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// each simdgroup stores its output to shared memory, reusing sq
|
||||
if (sgitg == sg) {
|
||||
for (short i = 0; i < DV8; ++i) {
|
||||
simdgroup_store(lo[i], so + i*8, DV, 0, false);
|
||||
}
|
||||
}
|
||||
for (short j = tiisg; j < Q; j += NW) {
|
||||
const float S0 = ss[j*TS - 1*SH + 0];
|
||||
const float S1 = ss[j*TS + 0];
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// the first simdgroup accumulates the results from the other simdgroups
|
||||
if (sgitg == 0) {
|
||||
for (short j = 0; j < Q; ++j) {
|
||||
const float S0 = ss[j*TS + 0];
|
||||
const float S1 = ss[j*TS + sg*SH + 0];
|
||||
|
||||
const float M0 = ss[j*TS + 1];
|
||||
const float M1 = ss[j*TS + sg*SH + 1];
|
||||
const float M0 = ss[j*TS - 1*SH + 1];
|
||||
const float M1 = ss[j*TS + 1];
|
||||
|
||||
const float M = max(M0, M1);
|
||||
|
||||
const float ms0 = exp(M0 - M);
|
||||
const float ms1 = exp(M1 - M);
|
||||
float ms0 = exp(M0 - M);
|
||||
float ms1 = exp(M1 - M);
|
||||
|
||||
const float S = S0*ms0 + S1*ms1;
|
||||
|
||||
if (tiisg == 0) {
|
||||
ss[j*TS + 0] = S;
|
||||
ss[j*TS + 1] = M;
|
||||
ss[j*TS + 0] = S;
|
||||
ss[j*TS + 1] = M;
|
||||
|
||||
ss[j*TS + 2*C + j ] = ms0;
|
||||
ss[j*TS + 2*C + j + sg*SH] = ms1;
|
||||
}
|
||||
ss[j*TS + 2*C + j - 1*SH] = ms0;
|
||||
ss[j*TS + 2*C + j ] = ms1;
|
||||
}
|
||||
|
||||
//simdgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
|
||||
{
|
||||
s8x8_t ms0;
|
||||
s8x8_t ms1;
|
||||
|
||||
simdgroup_load(ms0, ss + 2*C, TS, 0, false);
|
||||
simdgroup_load(ms1, ss + 2*C + sg*SH, TS, 0, false);
|
||||
simdgroup_load(ms0, ss + 2*C - 1*SH, TS, 0, false);
|
||||
simdgroup_load(ms1, ss + 2*C, TS, 0, false);
|
||||
|
||||
#pragma unroll(DV8)
|
||||
for (short i = 0; i < DV8; ++i) {
|
||||
o8x8_t t;
|
||||
simdgroup_float8x8 t;
|
||||
|
||||
simdgroup_load (t, so + i*8, DV, 0, false);
|
||||
simdgroup_multiply(t, ms1, t);
|
||||
simdgroup_multiply(t, ms0, t);
|
||||
|
||||
simdgroup_multiply_accumulate(lo[i], ms0, lo[i], t);
|
||||
simdgroup_multiply_accumulate(t, ms1, lo[i], t);
|
||||
simdgroup_store(t, so + i*8, DV, 0, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
|
||||
// store result to shared memory (reuse sq)
|
||||
if (sgitg == 0) {
|
||||
for (short i = 0; i < DV8; ++i) {
|
||||
simdgroup_store(lo[i], so + i*8, DV, 0, false);
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
threadgroup s_t * sf = (threadgroup s_t *) (shmem_f16 + 2*Q*DK);
|
||||
threadgroup s_t * sf = (threadgroup s_t *) (shmem_f16 + 2*(nsg-1)*SH + 2*Q*DK);
|
||||
|
||||
// final rescale with 1/S and store to global memory
|
||||
for (short j = sgitg; j < Q && iq1 + j < args.ne01; j += nsg) {
|
||||
@@ -3723,8 +3718,8 @@ kernel void kernel_flash_attn_ext(
|
||||
half, half4x4, simdgroup_half8x8, \
|
||||
float, simdgroup_float8x8, \
|
||||
float, simdgroup_float8x8, \
|
||||
float, float4, simdgroup_float8x8
|
||||
//half, half4, simdgroup_half8x8
|
||||
half, half4, simdgroup_half8x8
|
||||
//float, float4, simdgroup_float8x8
|
||||
|
||||
#define FA_TYPES_BF \
|
||||
bfloat, bfloat4, simdgroup_bfloat8x8, \
|
||||
@@ -3732,8 +3727,8 @@ kernel void kernel_flash_attn_ext(
|
||||
bfloat, bfloat4x4, simdgroup_bfloat8x8, \
|
||||
float, simdgroup_float8x8, \
|
||||
float, simdgroup_float8x8, \
|
||||
float, float4, simdgroup_float8x8
|
||||
//half, half4, simdgroup_half8x8
|
||||
half, half4, simdgroup_half8x8
|
||||
//float, float4, simdgroup_float8x8
|
||||
|
||||
typedef decltype(kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64, 64>) flash_attn_ext_t;
|
||||
|
||||
|
||||
@@ -80,6 +80,7 @@ set(GGML_OPENCL_KERNELS
|
||||
mul_mv_q4_0_f32_1d_8x_flat
|
||||
mul_mv_q4_0_f32_1d_16x_flat
|
||||
mul_mv_q6_k
|
||||
mul_mv_id_q4_0_f32_8x_flat
|
||||
mul
|
||||
norm
|
||||
relu
|
||||
|
||||
@@ -321,6 +321,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_program program_upscale;
|
||||
cl_program program_concat;
|
||||
cl_program program_tsembd;
|
||||
cl_program program_mul_mv_id_q4_0_f32_8x_flat;
|
||||
|
||||
cl_kernel kernel_add, kernel_add_row;
|
||||
cl_kernel kernel_mul, kernel_mul_row;
|
||||
@@ -366,6 +367,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_concat_f32_contiguous;
|
||||
cl_kernel kernel_concat_f32_non_contiguous;
|
||||
cl_kernel kernel_timestep_embedding;
|
||||
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// Transpose kernels
|
||||
@@ -1112,7 +1114,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// repeat
|
||||
// repeat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
@@ -1256,6 +1258,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
}
|
||||
}
|
||||
|
||||
// mul_mv_id_q4_0_f32_8x_flat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
|
||||
#endif
|
||||
backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// Adreno kernels
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// transpose
|
||||
@@ -2178,6 +2196,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
||||
}
|
||||
return false;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
if (op->src[0]->type == GGML_TYPE_Q4_0) {
|
||||
if (op->src[1]->type == GGML_TYPE_F32) {
|
||||
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
@@ -5536,6 +5561,136 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
const ggml_tensor * src2 = dst->src[2];
|
||||
GGML_ASSERT(src2);
|
||||
GGML_ASSERT(src2->extra);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offset2 = extra2->offset + src2->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
#ifdef GGML_OPENCL_SOA_Q
|
||||
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
|
||||
#endif
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const cl_ulong nb00 = src0->nb[0];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
|
||||
const int ne10 = src1->ne[0];
|
||||
const int ne11 = src1->ne[1];
|
||||
const int ne12 = src1->ne[2];
|
||||
const int ne13 = src1->ne[3];
|
||||
|
||||
const cl_ulong nb11 = src1->nb[1];
|
||||
const cl_ulong nb12 = src1->nb[2];
|
||||
|
||||
const int ne20 = src2->ne[0];
|
||||
const int ne21 = src2->ne[1];
|
||||
|
||||
const cl_ulong nb21 = src2->nb[1];
|
||||
|
||||
const int ne0 = dst->ne[0];
|
||||
const int ne1 = dst->ne[1];
|
||||
|
||||
const int r2 = ne12/ne02;
|
||||
const int r3 = ne13/ne03;
|
||||
const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows
|
||||
|
||||
GGML_ASSERT(ne00 == ne10);
|
||||
|
||||
int sgs = 32; // subgroup size
|
||||
int nsg = 1; // number of subgroups
|
||||
int nrows = 1; // number of row in src1
|
||||
int ndst = 4; // number of values produced by each subgroup
|
||||
|
||||
cl_kernel kernel;
|
||||
|
||||
// subgroup mat vec
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0: {
|
||||
kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;
|
||||
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
sgs = 16;
|
||||
nsg = 1;
|
||||
ndst = 8;
|
||||
} else if (backend_ctx->gpu_family == ADRENO) {
|
||||
sgs = 64;
|
||||
nsg = 1;
|
||||
ndst = 8;
|
||||
} else {
|
||||
GGML_ASSERT(false && "TODO: Unknown GPU");
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3));
|
||||
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ASSERT(false && "not implemented");;
|
||||
}
|
||||
|
||||
int _ne1 = 1;
|
||||
int ne123 = dst_rows;
|
||||
|
||||
size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
|
||||
size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -6444,6 +6599,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_mul_mat;
|
||||
break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_mul_mat_id;
|
||||
break;
|
||||
case GGML_OP_SCALE:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
|
||||
283
ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl
Normal file
283
ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl
Normal file
@@ -0,0 +1,283 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#define QK4_0 32
|
||||
|
||||
typedef char int8_t;
|
||||
typedef uchar uint8_t;
|
||||
typedef short int16_t;
|
||||
typedef ushort uint16_t;
|
||||
typedef int int32_t;
|
||||
typedef uint uint32_t;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_q4_0
|
||||
//------------------------------------------------------------------------------
|
||||
struct block_q4_0
|
||||
{
|
||||
half d;
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
};
|
||||
|
||||
// This function requires the original shuffled weights.
|
||||
// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
|
||||
// packed together in a byte, so are (q[1], q[17]) and so on.
|
||||
inline float block_q_4_0_dot_y_flat(
|
||||
global uchar * x,
|
||||
global half * dh,
|
||||
float sumy,
|
||||
float16 yl,
|
||||
int il
|
||||
) {
|
||||
float d = *dh;
|
||||
global ushort * qs = ((global ushort *)x + il/2);
|
||||
float acc = 0.f;
|
||||
|
||||
acc += yl.s0 * (qs[0] & 0x000F);
|
||||
acc += yl.s1 * (qs[0] & 0x0F00);
|
||||
acc += yl.s8 * (qs[0] & 0x00F0);
|
||||
acc += yl.s9 * (qs[0] & 0xF000);
|
||||
|
||||
acc += yl.s2 * (qs[1] & 0x000F);
|
||||
acc += yl.s3 * (qs[1] & 0x0F00);
|
||||
acc += yl.sa * (qs[1] & 0x00F0);
|
||||
acc += yl.sb * (qs[1] & 0xF000);
|
||||
|
||||
acc += yl.s4 * (qs[2] & 0x000F);
|
||||
acc += yl.s5 * (qs[2] & 0x0F00);
|
||||
acc += yl.sc * (qs[2] & 0x00F0);
|
||||
acc += yl.sd * (qs[2] & 0xF000);
|
||||
|
||||
acc += yl.s6 * (qs[3] & 0x000F);
|
||||
acc += yl.s7 * (qs[3] & 0x0F00);
|
||||
acc += yl.se * (qs[3] & 0x00F0);
|
||||
acc += yl.sf * (qs[3] & 0xF000);
|
||||
|
||||
return d * (sumy * -8.f + acc);
|
||||
}
|
||||
|
||||
//
|
||||
// This variant outputs 8 values.
|
||||
//
|
||||
#undef N_DST
|
||||
#undef N_SIMDGROUP
|
||||
#undef N_SIMDWIDTH
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
#define N_DST 8 // each SIMD group works on 8 rows
|
||||
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
|
||||
#define N_SIMDWIDTH 16 // subgroup size
|
||||
#elif defined (ADRENO_GPU)
|
||||
#define N_DST 8
|
||||
#define N_SIMDGROUP 1
|
||||
#define N_SIMDWIDTH 64
|
||||
#endif
|
||||
|
||||
inline void mul_vec_q_n_f32_8x_flat(
|
||||
global char * src0_q,
|
||||
global half * src0_d,
|
||||
global float * src1,
|
||||
global float * dst,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne10,
|
||||
int ne12,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
const ulong nb = ne00/QK4_0;
|
||||
|
||||
int r0 = get_group_id(0);
|
||||
int r1 = get_group_id(1);
|
||||
int im = 0;
|
||||
|
||||
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
|
||||
|
||||
int i12 = im%ne12;
|
||||
int i13 = im/ne12;
|
||||
|
||||
// The number of scales is the same as the number of blocks.
|
||||
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
|
||||
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
|
||||
|
||||
global uchar * x = (global uchar *) src0_q + offset0_q;
|
||||
global half * d = (global half *) src0_d + offset0_d;
|
||||
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float16 yl;
|
||||
float8 sumf = 0.f;
|
||||
|
||||
int ix = get_sub_group_local_id()/2;
|
||||
int il = 8*(get_sub_group_local_id()%2);
|
||||
|
||||
global float * yb = y + ix*QK4_0 + il;
|
||||
|
||||
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
|
||||
float sumy = 0.f;
|
||||
|
||||
sumy += yb[0];
|
||||
sumy += yb[1];
|
||||
sumy += yb[2];
|
||||
sumy += yb[3];
|
||||
sumy += yb[4];
|
||||
sumy += yb[5];
|
||||
sumy += yb[6];
|
||||
sumy += yb[7];
|
||||
|
||||
sumy += yb[16];
|
||||
sumy += yb[17];
|
||||
sumy += yb[18];
|
||||
sumy += yb[19];
|
||||
sumy += yb[20];
|
||||
sumy += yb[21];
|
||||
sumy += yb[22];
|
||||
sumy += yb[23];
|
||||
|
||||
yl.s0 = yb[0];
|
||||
yl.s1 = yb[1]/256.f;
|
||||
|
||||
yl.s2 = yb[2];
|
||||
yl.s3 = yb[3]/256.f;
|
||||
|
||||
yl.s4 = yb[4];
|
||||
yl.s5 = yb[5]/256.f;
|
||||
|
||||
yl.s6 = yb[6];
|
||||
yl.s7 = yb[7]/256.f;
|
||||
|
||||
yl.s8 = yb[16]/16.f;
|
||||
yl.s9 = yb[17]/4096.f;
|
||||
|
||||
yl.sa = yb[18]/16.f;
|
||||
yl.sb = yb[19]/4096.f;
|
||||
|
||||
yl.sc = yb[20]/16.f;
|
||||
yl.sd = yb[21]/4096.f;
|
||||
|
||||
yl.se = yb[22]/16.f;
|
||||
yl.sf = yb[23]/4096.f;
|
||||
|
||||
sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
|
||||
sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
|
||||
sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
|
||||
sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
|
||||
|
||||
sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
|
||||
sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
|
||||
sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
|
||||
sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
|
||||
|
||||
yb += QK4_0 * (N_SIMDWIDTH/2);
|
||||
}
|
||||
|
||||
float8 tot = (float8)(
|
||||
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
|
||||
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
|
||||
sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
|
||||
sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
|
||||
);
|
||||
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
if (first_row + 0 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
|
||||
}
|
||||
if (first_row + 1 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
|
||||
}
|
||||
if (first_row + 2 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
|
||||
}
|
||||
if (first_row + 3 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
|
||||
}
|
||||
|
||||
if (first_row + 4 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
|
||||
}
|
||||
if (first_row + 5 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
|
||||
}
|
||||
if (first_row + 6 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
|
||||
}
|
||||
if (first_row + 7 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
REQD_SUBGROUP_SIZE_16
|
||||
#elif defined (ADRENO_GPU)
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_mul_mv_id_q4_0_f32_8x_flat(
|
||||
global char * src0_q,
|
||||
global half * src0_d,
|
||||
global float * src1,
|
||||
ulong offset1,
|
||||
global char * src2,
|
||||
ulong offset2,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
ulong nb00,
|
||||
ulong nb02,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
int ne20,
|
||||
int ne21,
|
||||
ulong nb21,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src1 = (global float *)((global char *)src1 + offset1);
|
||||
src2 = (global char *)((global char *)src2 + offset2);
|
||||
dst = (global float *)((global char *)dst + offsetd);
|
||||
|
||||
const int iid1 = get_group_id(2)/ne20;
|
||||
const int idx = get_group_id(2)%ne20;
|
||||
|
||||
const int i02 = ((global int *)(src2 + iid1*nb21))[idx];
|
||||
|
||||
const int i11 = idx%ne11;
|
||||
const int i12 = iid1;
|
||||
|
||||
const int i1 = idx;
|
||||
const int i2 = i12;
|
||||
|
||||
global char * src0_q_cur = src0_q + (i02*nb02/nb00)*(QK4_0/2);
|
||||
global half * src0_d_cur = src0_d + (i02*nb02/nb00);
|
||||
global float * src1_cur = (global float *)((global char *) src1 + i11*nb11 + i12*nb12);
|
||||
global float * dst_cur = dst + i1*ne0 + i2*ne1*ne0;
|
||||
|
||||
mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
|
||||
}
|
||||
@@ -53,6 +53,9 @@ struct socket_t {
|
||||
}
|
||||
};
|
||||
|
||||
// macro for nicer error messages on server crash
|
||||
#define RPC_STATUS_ASSERT(x) if (!(x)) GGML_ABORT("Remote RPC server crashed or returned malformed response")
|
||||
|
||||
// all RPC structures must be packed
|
||||
#pragma pack(push, 1)
|
||||
// ggml_tensor is serialized into rpc_tensor
|
||||
@@ -425,7 +428,7 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
|
||||
static bool check_server_version(const std::shared_ptr<socket_t> & sock) {
|
||||
rpc_msg_hello_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_HELLO, nullptr, 0, &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
if (response.major != RPC_PROTO_MAJOR_VERSION || response.minor > RPC_PROTO_MINOR_VERSION) {
|
||||
fprintf(stderr, "RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
||||
return false;
|
||||
@@ -481,7 +484,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
rpc_msg_free_buffer_req request = {ctx->remote_ptr};
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
@@ -493,7 +496,7 @@ static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
rpc_msg_buffer_get_base_req request = {ctx->remote_ptr};
|
||||
rpc_msg_buffer_get_base_rsp response;
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, &request, sizeof(request), &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
ctx->base_ptr = reinterpret_cast<void *>(response.base_ptr);
|
||||
return ctx->base_ptr;
|
||||
}
|
||||
@@ -545,7 +548,7 @@ static enum ggml_status ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_
|
||||
request.tensor = serialize_tensor(tensor);
|
||||
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
}
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -560,7 +563,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
|
||||
request.hash = fnv_hash((const uint8_t*)data, size);
|
||||
rpc_msg_set_tensor_hash_rsp response;
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, &request, sizeof(request), &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
if (response.result) {
|
||||
// the server has the same data, no need to send it
|
||||
return;
|
||||
@@ -573,7 +576,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
|
||||
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
|
||||
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size());
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
}
|
||||
|
||||
static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
@@ -583,7 +586,7 @@ static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, con
|
||||
request.offset = offset;
|
||||
request.size = size;
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_GET_TENSOR, &request, sizeof(request), data, size);
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
}
|
||||
|
||||
static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
@@ -601,7 +604,7 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
|
||||
request.dst = serialize_tensor(dst);
|
||||
rpc_msg_copy_tensor_rsp response;
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, &request, sizeof(request), &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
return response.result;
|
||||
}
|
||||
|
||||
@@ -609,7 +612,7 @@ static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value};
|
||||
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_CLEAR, &request, sizeof(request), nullptr, 0);
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
|
||||
@@ -635,7 +638,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
|
||||
rpc_msg_alloc_buffer_rsp response;
|
||||
auto sock = get_socket(buft_ctx->endpoint);
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_ALLOC_BUFFER, &request, sizeof(request), &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
if (response.remote_ptr != 0) {
|
||||
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
||||
ggml_backend_rpc_buffer_interface,
|
||||
@@ -650,7 +653,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
|
||||
static size_t get_alignment(const std::shared_ptr<socket_t> & sock) {
|
||||
rpc_msg_get_alignment_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, nullptr, 0, &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
return response.alignment;
|
||||
}
|
||||
|
||||
@@ -662,7 +665,7 @@ static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_typ
|
||||
static size_t get_max_size(const std::shared_ptr<socket_t> & sock) {
|
||||
rpc_msg_get_max_size_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, nullptr, 0, &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
return response.max_size;
|
||||
}
|
||||
|
||||
@@ -683,7 +686,7 @@ static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_ty
|
||||
|
||||
rpc_msg_get_alloc_size_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
|
||||
return response.alloc_size;
|
||||
} else {
|
||||
@@ -761,7 +764,7 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
|
||||
rpc_msg_graph_compute_rsp response;
|
||||
auto sock = get_socket(rpc_ctx->endpoint);
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input.data(), input.size(), &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
return (enum ggml_status)response.result;
|
||||
}
|
||||
|
||||
@@ -835,7 +838,7 @@ bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
||||
static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * free, size_t * total) {
|
||||
rpc_msg_get_device_memory_rsp response;
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, nullptr, 0, &response, sizeof(response));
|
||||
GGML_ASSERT(status);
|
||||
RPC_STATUS_ASSERT(status);
|
||||
*free = response.free_mem;
|
||||
*total = response.total_mem;
|
||||
}
|
||||
|
||||
@@ -142,7 +142,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
ONEMATH
|
||||
GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
|
||||
GIT_TAG c255b1b4c41e2ee3059455c1f96a965d6a62568a
|
||||
GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142
|
||||
)
|
||||
FetchContent_MakeAvailable(ONEMATH)
|
||||
# Create alias to match with find_package targets name
|
||||
|
||||
@@ -513,9 +513,9 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {
|
||||
|
||||
bool gpu_has_xmx(sycl::device &dev);
|
||||
|
||||
template <int N, class T> void debug_print_array(const std::string & prefix, const T array[N]) {
|
||||
template <int N, class T> std::string debug_get_array_str(const std::string & prefix, const T array[N]) {
|
||||
if (LIKELY(!g_ggml_sycl_debug)) {
|
||||
return;
|
||||
return "";
|
||||
}
|
||||
std::stringstream ss;
|
||||
ss << prefix << "=[";
|
||||
@@ -526,29 +526,26 @@ template <int N, class T> void debug_print_array(const std::string & prefix, con
|
||||
ss << array[N - 1];
|
||||
}
|
||||
ss << "]";
|
||||
GGML_SYCL_DEBUG("%s", ss.str().c_str());
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor,
|
||||
const std::string & suffix = "") {
|
||||
if (LIKELY(!g_ggml_sycl_debug)) {
|
||||
return;
|
||||
}
|
||||
GGML_SYCL_DEBUG("%s=", prefix.c_str());
|
||||
inline std::string debug_get_tensor_str(const std::string &prefix,
|
||||
const ggml_tensor *tensor, const std::string &suffix = "") {
|
||||
std::stringstream ss;
|
||||
if (LIKELY(!g_ggml_sycl_debug)) { return ss.str(); }
|
||||
ss << prefix.c_str() << "=";
|
||||
if (tensor) {
|
||||
GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type));
|
||||
debug_print_array<GGML_MAX_DIMS>(";ne", tensor->ne);
|
||||
debug_print_array<GGML_MAX_DIMS>(";nb", tensor->nb);
|
||||
if (!ggml_is_contiguous(tensor)) {
|
||||
GGML_SYCL_DEBUG(";strided");
|
||||
}
|
||||
if (ggml_is_permuted(tensor)) {
|
||||
GGML_SYCL_DEBUG(";permuted");
|
||||
}
|
||||
ss << "'" << tensor->name << "':type=" << ggml_type_name(tensor->type);
|
||||
ss << debug_get_array_str<GGML_MAX_DIMS>(";ne", tensor->ne);
|
||||
ss << debug_get_array_str<GGML_MAX_DIMS>(";nb", tensor->nb);
|
||||
|
||||
if (!ggml_is_contiguous(tensor)) { ss << ";strided"; }
|
||||
if (ggml_is_permuted(tensor)) { ss << ";permuted"; }
|
||||
} else {
|
||||
GGML_SYCL_DEBUG("nullptr");
|
||||
ss << "nullptr";
|
||||
}
|
||||
GGML_SYCL_DEBUG("%s", suffix.c_str());
|
||||
ss << suffix;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
// Use scope_op_debug_print to log operations coming from running a model
|
||||
@@ -564,10 +561,10 @@ struct scope_op_debug_print {
|
||||
return;
|
||||
}
|
||||
GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
|
||||
debug_print_tensor(" dst", dst);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" dst", dst).c_str());
|
||||
if (dst) {
|
||||
for (std::size_t i = 0; i < num_src; ++i) {
|
||||
debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str("\tsrc" + std::to_string(i), dst->src[i]).c_str());
|
||||
}
|
||||
}
|
||||
GGML_SYCL_DEBUG("%s\n", suffix.data());
|
||||
|
||||
@@ -723,8 +723,7 @@ static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const
|
||||
|
||||
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
|
||||
// Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and rely on its src field
|
||||
scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0,
|
||||
std::string(" src0 type=") + ggml_type_name(src0->type));
|
||||
scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0, debug_get_tensor_str("\tsrc0", src0));
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
GGML_ASSERT(ne == ggml_nelements(src1));
|
||||
|
||||
|
||||
@@ -65,6 +65,9 @@ public:
|
||||
|
||||
dnnl::primitive_attr primitive_attr;
|
||||
primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
|
||||
#ifdef GGML_SYCL_F16
|
||||
primitive_attr.set_fpmath_mode(dnnl::fpmath_mode::f16);
|
||||
#endif
|
||||
|
||||
auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
|
||||
auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
|
||||
|
||||
@@ -347,7 +347,7 @@ static enum ggml_status
|
||||
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor, "\n");
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
|
||||
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
|
||||
if (tensor->view_src != NULL) {
|
||||
@@ -385,7 +385,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
const void *data, size_t offset,
|
||||
size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
ggml_sycl_set_device(ctx->device);
|
||||
@@ -413,7 +413,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
void *data, size_t offset,
|
||||
size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
|
||||
@@ -444,8 +444,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *dst) try {
|
||||
bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer);
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": dst=", dst);
|
||||
debug_print_tensor(" src=", src);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
|
||||
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
|
||||
if (is_cpy_supported) {
|
||||
ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
|
||||
@@ -525,7 +525,7 @@ catch (sycl::exception const &exc) {
|
||||
static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
|
||||
size_t offset, size_t size) {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value);
|
||||
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
|
||||
SYCL_CHECK(ggml_sycl_set_device(ctx->device));
|
||||
@@ -805,7 +805,7 @@ static enum ggml_status
|
||||
ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor, "\n");
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
|
||||
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
||||
|
||||
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
||||
@@ -891,7 +891,7 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor, const void *data,
|
||||
size_t offset, size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
@@ -947,7 +947,7 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
const ggml_tensor *tensor, void *data,
|
||||
size_t offset, size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
@@ -2127,21 +2127,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
|
||||
? (const sycl::half *)src1->data + src1_padded_row_size
|
||||
: src1_as_f16.get();
|
||||
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
|
||||
|
||||
#if GGML_SYCL_DNNL
|
||||
if (!g_ggml_sycl_disable_dnn) {
|
||||
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
|
||||
DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
|
||||
dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
|
||||
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
|
||||
" : converting dst to fp32");
|
||||
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
|
||||
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
|
||||
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
|
||||
|
||||
const sycl::half alpha_f16 = 1.0f;
|
||||
const sycl::half beta_f16 = 0.0f;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
|
||||
@@ -3866,7 +3863,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
||||
const void *data, size_t offset,
|
||||
size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
@@ -3887,7 +3884,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
||||
void *data, size_t offset,
|
||||
size_t size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": tensor=", tensor);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
|
||||
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
@@ -3910,8 +3907,8 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
|
||||
bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) &&
|
||||
ggml_backend_buffer_is_sycl(src->buffer);
|
||||
GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
|
||||
debug_print_tensor(": dst=", dst);
|
||||
debug_print_tensor(" src=", src);
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
|
||||
GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
|
||||
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
|
||||
if (is_cpy_supported) {
|
||||
/*
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -291,6 +291,7 @@ class MODEL_ARCH(IntEnum):
|
||||
BERT = auto()
|
||||
NOMIC_BERT = auto()
|
||||
NOMIC_BERT_MOE = auto()
|
||||
NEO_BERT = auto()
|
||||
JINA_BERT_V2 = auto()
|
||||
BLOOM = auto()
|
||||
STABLELM = auto()
|
||||
@@ -343,6 +344,8 @@ class MODEL_ARCH(IntEnum):
|
||||
WAVTOKENIZER_DEC = auto()
|
||||
PLM = auto()
|
||||
BAILINGMOE = auto()
|
||||
DOTS1 = auto()
|
||||
ARCEE = auto()
|
||||
|
||||
|
||||
class VISION_PROJECTOR_TYPE(IntEnum):
|
||||
@@ -571,6 +574,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.BERT: "bert",
|
||||
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
|
||||
MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
|
||||
MODEL_ARCH.NEO_BERT: "neo-bert",
|
||||
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
|
||||
MODEL_ARCH.BLOOM: "bloom",
|
||||
MODEL_ARCH.STABLELM: "stablelm",
|
||||
@@ -623,6 +627,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
|
||||
MODEL_ARCH.PLM: "plm",
|
||||
MODEL_ARCH.BAILINGMOE: "bailingmoe",
|
||||
MODEL_ARCH.DOTS1: "dots1",
|
||||
MODEL_ARCH.ARCEE: "arcee",
|
||||
}
|
||||
|
||||
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
|
||||
@@ -1077,6 +1083,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.LAYER_OUT_NORM,
|
||||
],
|
||||
MODEL_ARCH.NEO_BERT: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.ENC_OUTPUT_NORM,
|
||||
MODEL_TENSOR.CLS,
|
||||
MODEL_TENSOR.CLS_OUT,
|
||||
],
|
||||
MODEL_ARCH.JINA_BERT_V2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||
@@ -2044,6 +2062,45 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||
],
|
||||
MODEL_ARCH.DOTS1: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_EXP_PROBS_B,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||
],
|
||||
MODEL_ARCH.ARCEE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
||||
@@ -271,7 +271,7 @@ class GGUFWriter:
|
||||
|
||||
def add_key_value(self, key: str, val: Any, vtype: GGUFValueType, sub_type: GGUFValueType | None = None) -> None:
|
||||
if any(key in kv_data for kv_data in self.kv_data):
|
||||
raise ValueError(f'Duplicated key name {key!r}')
|
||||
logger.warning(f'Duplicated key name {key!r}, overwriting it with new value {val!r} of type {vtype.name}')
|
||||
|
||||
self.kv_data[0][key] = GGUFValue(value=val, type=vtype, sub_type=sub_type)
|
||||
|
||||
|
||||
@@ -31,6 +31,7 @@ class TensorNameMap:
|
||||
"model.embeddings", # rwkv7
|
||||
"model.word_embeddings", # bailingmoe
|
||||
"language_model.model.embed_tokens", # llama4
|
||||
"encoder", # neobert
|
||||
),
|
||||
|
||||
# Token type embeddings
|
||||
@@ -134,6 +135,7 @@ class TensorNameMap:
|
||||
"rwkv.blocks.{bid}.ln1", # rwkv6
|
||||
"model.layers.{bid}.ln1", # rwkv7
|
||||
"model.layers.{bid}.input_layernorm", # llama4
|
||||
"transformer_encoder.{bid}.attention_norm", # neobert
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
@@ -161,6 +163,7 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.self_attn.qkv_proj", # phi3
|
||||
"encoder.layers.{bid}.self_attention.query_key_value", # chatglm
|
||||
"transformer.layers.{bid}.attn.qkv_proj", # openelm
|
||||
"transformer_encoder.{bid}.qkv", # neobert
|
||||
),
|
||||
|
||||
# Attention query
|
||||
@@ -236,6 +239,7 @@ class TensorNameMap:
|
||||
"transformer.layers.{bid}.attn.out_proj", # openelm
|
||||
"transformer.h.{bid}.attn.attention.out_proj", # exaone
|
||||
"model.layers.{bid}.self_attn.o_proj", # llama4
|
||||
"transformer_encoder.{bid}.wo", # neobert
|
||||
),
|
||||
|
||||
# Attention output norm
|
||||
@@ -276,6 +280,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.post_attention_layernorm", # chatglm
|
||||
"transformer.layers.{bid}.ffn_norm", # openelm
|
||||
"model.layers.{bid}.post_attention_layernorm", # llama4
|
||||
"transformer_encoder.{bid}.ffn_norm", # neobert
|
||||
),
|
||||
|
||||
# Post feed-forward norm
|
||||
@@ -305,7 +310,7 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_EXP_PROBS_B: (
|
||||
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
|
||||
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1
|
||||
),
|
||||
|
||||
# Feed-forward up
|
||||
@@ -333,11 +338,14 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
||||
"encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe
|
||||
"model.layers.{bid}.mlp.c_fc", # starcoder2
|
||||
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
|
||||
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 (split up/gate, no longer used)
|
||||
"encoder.layer.{bid}.mlp.gated_layers", # jina-bert-v2 (GEGLU)
|
||||
"encoder.layer.{bid}.mlp.up_gated_layer", # jina-v2-code (GEGLU)
|
||||
"model.layers.{bid}.residual_mlp.w3", # arctic
|
||||
"encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
|
||||
"transformer.h.{bid}.mlp.c_fc_1", # exaone
|
||||
"model.layers.{bid}.feed_forward.up_proj", # llama4
|
||||
"transformer_encoder.{bid}.ffn.w12", # neobert
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_EXP: (
|
||||
@@ -370,7 +378,7 @@ class TensorNameMap:
|
||||
"model.layers.layers.{bid}.mlp.gate_proj", # plamo
|
||||
"model.layers.{bid}.feed_forward.w1", # internlm2
|
||||
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
|
||||
"encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
|
||||
"encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 (split up/gate, no longer used)
|
||||
"transformer.h.{bid}.mlp.linear_1", # refact
|
||||
"model.layers.{bid}.residual_mlp.w1", # arctic
|
||||
"transformer.h.{bid}.mlp.c_fc_0", # exaone
|
||||
@@ -420,6 +428,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
|
||||
"model.layers.h.{bid}.mlp.c_proj", # exaone
|
||||
"model.layers.{bid}.feed_forward.down_proj", # llama4
|
||||
"transformer_encoder.{bid}.ffn.w3", # neobert
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||
@@ -830,12 +839,14 @@ class TensorNameMap:
|
||||
# TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg
|
||||
MODEL_TENSOR.ENC_OUTPUT_NORM: (
|
||||
"encoder.final_layer_norm", # t5
|
||||
"layer_norm", # neobert
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CLS: (
|
||||
"classifier", # jina
|
||||
"classifier.dense", # roberta
|
||||
"pre_classifier", # distillbert
|
||||
"dense", # neobert
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CLS_OUT: (
|
||||
|
||||
@@ -243,18 +243,21 @@ extern "C" {
|
||||
|
||||
typedef bool (*llama_progress_callback)(float progress, void * user_data);
|
||||
|
||||
// Input data for llama_decode
|
||||
// Input data for llama_encode/llama_decode
|
||||
// A llama_batch object can contain input about one or many sequences
|
||||
// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
|
||||
//
|
||||
// - token : the token ids of the input (used when embd is NULL)
|
||||
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
|
||||
// - pos : the positions of the respective token in the sequence
|
||||
// (if set to NULL, the token position will be tracked automatically by llama_decode)
|
||||
// (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
|
||||
// - seq_id : the sequence to which the respective token belongs
|
||||
// (if set to NULL, the sequence ID will be assumed to be 0)
|
||||
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
|
||||
// (if set to NULL, only the logits for last token will be returned)
|
||||
// (if set to NULL:
|
||||
// - if embeddings: all tokens are output
|
||||
// - if not: only the last token is output
|
||||
// )
|
||||
//
|
||||
typedef struct llama_batch {
|
||||
int32_t n_tokens;
|
||||
@@ -262,8 +265,8 @@ extern "C" {
|
||||
llama_token * token;
|
||||
float * embd;
|
||||
llama_pos * pos;
|
||||
int32_t * n_seq_id; // TODO: remove, should belong to only 1 sequence
|
||||
llama_seq_id ** seq_id; // TODO: become llama_seq_id * seq_id;
|
||||
int32_t * n_seq_id;
|
||||
llama_seq_id ** seq_id;
|
||||
int8_t * logits; // TODO: rename this to "output"
|
||||
} llama_batch;
|
||||
|
||||
@@ -961,8 +964,7 @@ extern "C" {
|
||||
// Get the number of threads used for prompt and batch processing (multiple token).
|
||||
LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
|
||||
|
||||
// Set whether the model is in embeddings mode or not
|
||||
// If true, embeddings will be returned but logits will not
|
||||
// Set whether the context outputs embeddings or not
|
||||
LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
|
||||
|
||||
// Set whether to use causal attention or not
|
||||
|
||||
@@ -1,2 +1,3 @@
|
||||
tabulate~=0.9.0
|
||||
GitPython~=3.1.43
|
||||
matplotlib~=3.10.0
|
||||
|
||||
@@ -19,6 +19,7 @@ except ImportError as e:
|
||||
print("the following Python libraries are required: GitPython, tabulate.") # noqa: NP100
|
||||
raise e
|
||||
|
||||
|
||||
logger = logging.getLogger("compare-llama-bench")
|
||||
|
||||
# All llama-bench SQL fields
|
||||
@@ -122,11 +123,15 @@ help_s = (
|
||||
parser.add_argument("--check", action="store_true", help="check if all required Python libraries are installed")
|
||||
parser.add_argument("-s", "--show", help=help_s)
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
parser.add_argument("--plot", help="generate a performance comparison plot and save to specified file (e.g., plot.png)")
|
||||
parser.add_argument("--plot_x", help="parameter to use as x axis for plotting (default: n_depth)", default="n_depth")
|
||||
parser.add_argument("--plot_log_scale", action="store_true", help="use log scale for x axis in plots (off by default)")
|
||||
|
||||
known_args, unknown_args = parser.parse_known_args()
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO)
|
||||
|
||||
|
||||
if known_args.check:
|
||||
# Check if all required Python libraries are installed. Would have failed earlier if not.
|
||||
sys.exit(0)
|
||||
@@ -499,7 +504,6 @@ else:
|
||||
|
||||
name_compare = bench_data.get_commit_name(hexsha8_compare)
|
||||
|
||||
|
||||
# If the user provided columns to group the results by, use them:
|
||||
if known_args.show is not None:
|
||||
show = known_args.show.split(",")
|
||||
@@ -544,6 +548,14 @@ else:
|
||||
show.remove(prop)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Add plot_x parameter to parameters to show if it's not already present:
|
||||
if known_args.plot:
|
||||
for k, v in PRETTY_NAMES.items():
|
||||
if v == known_args.plot_x and k not in show:
|
||||
show.append(k)
|
||||
break
|
||||
|
||||
rows_show = bench_data.get_rows(show, hexsha8_baseline, hexsha8_compare)
|
||||
|
||||
if not rows_show:
|
||||
@@ -600,6 +612,161 @@ if "gpu_info" in show:
|
||||
headers = [PRETTY_NAMES[p] for p in show]
|
||||
headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
|
||||
|
||||
if known_args.plot:
|
||||
def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False):
|
||||
try:
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib
|
||||
matplotlib.use('Agg')
|
||||
except ImportError as e:
|
||||
logger.error("matplotlib is required for --plot.")
|
||||
raise e
|
||||
|
||||
data_headers = headers[:-4] # Exclude the last 4 columns (Test, baseline t/s, compare t/s, Speedup)
|
||||
plot_x_index = None
|
||||
plot_x_label = plot_x_param
|
||||
|
||||
if plot_x_param not in ["n_prompt", "n_gen", "n_depth"]:
|
||||
pretty_name = PRETTY_NAMES.get(plot_x_param, plot_x_param)
|
||||
if pretty_name in data_headers:
|
||||
plot_x_index = data_headers.index(pretty_name)
|
||||
plot_x_label = pretty_name
|
||||
elif plot_x_param in data_headers:
|
||||
plot_x_index = data_headers.index(plot_x_param)
|
||||
plot_x_label = plot_x_param
|
||||
else:
|
||||
logger.error(f"Parameter '{plot_x_param}' not found in current table columns. Available columns: {', '.join(data_headers)}")
|
||||
return
|
||||
|
||||
grouped_data = {}
|
||||
|
||||
for i, row in enumerate(table_data):
|
||||
group_key_parts = []
|
||||
test_name = row[-4]
|
||||
|
||||
base_test = ""
|
||||
x_value = None
|
||||
|
||||
if plot_x_param in ["n_prompt", "n_gen", "n_depth"]:
|
||||
for j, val in enumerate(row[:-4]):
|
||||
header_name = data_headers[j]
|
||||
if val is not None and str(val).strip():
|
||||
group_key_parts.append(f"{header_name}={val}")
|
||||
|
||||
if plot_x_param == "n_prompt" and "pp" in test_name:
|
||||
base_test = test_name.split("@")[0]
|
||||
x_value = base_test
|
||||
elif plot_x_param == "n_gen" and "tg" in test_name:
|
||||
x_value = test_name.split("@")[0]
|
||||
elif plot_x_param == "n_depth" and "@d" in test_name:
|
||||
base_test = test_name.split("@d")[0]
|
||||
x_value = int(test_name.split("@d")[1])
|
||||
else:
|
||||
base_test = test_name
|
||||
|
||||
if base_test.strip():
|
||||
group_key_parts.append(f"Test={base_test}")
|
||||
else:
|
||||
for j, val in enumerate(row[:-4]):
|
||||
if j != plot_x_index:
|
||||
header_name = data_headers[j]
|
||||
if val is not None and str(val).strip():
|
||||
group_key_parts.append(f"{header_name}={val}")
|
||||
else:
|
||||
x_value = val
|
||||
|
||||
group_key_parts.append(f"Test={test_name}")
|
||||
|
||||
group_key = tuple(group_key_parts)
|
||||
|
||||
if group_key not in grouped_data:
|
||||
grouped_data[group_key] = []
|
||||
|
||||
grouped_data[group_key].append({
|
||||
'x_value': x_value,
|
||||
'baseline': float(row[-3]),
|
||||
'compare': float(row[-2]),
|
||||
'speedup': float(row[-1])
|
||||
})
|
||||
|
||||
if not grouped_data:
|
||||
logger.error("No data available for plotting")
|
||||
return
|
||||
|
||||
def make_axes(num_groups, max_cols=2, base_size=(8, 4)):
|
||||
from math import ceil
|
||||
cols = 1 if num_groups == 1 else min(max_cols, num_groups)
|
||||
rows = ceil(num_groups / cols)
|
||||
|
||||
# Scale figure size by grid dimensions
|
||||
w, h = base_size
|
||||
fig, ax_arr = plt.subplots(rows, cols,
|
||||
figsize=(w * cols, h * rows),
|
||||
squeeze=False)
|
||||
|
||||
axes = ax_arr.flatten()[:num_groups]
|
||||
return fig, axes
|
||||
|
||||
num_groups = len(grouped_data)
|
||||
fig, axes = make_axes(num_groups)
|
||||
|
||||
plot_idx = 0
|
||||
|
||||
for group_key, points in grouped_data.items():
|
||||
if plot_idx >= len(axes):
|
||||
break
|
||||
ax = axes[plot_idx]
|
||||
|
||||
try:
|
||||
points_sorted = sorted(points, key=lambda p: float(p['x_value']) if p['x_value'] is not None else 0)
|
||||
x_values = [float(p['x_value']) if p['x_value'] is not None else 0 for p in points_sorted]
|
||||
except ValueError:
|
||||
points_sorted = sorted(points, key=lambda p: group_key)
|
||||
x_values = [p['x_value'] for p in points_sorted]
|
||||
|
||||
baseline_vals = [p['baseline'] for p in points_sorted]
|
||||
compare_vals = [p['compare'] for p in points_sorted]
|
||||
|
||||
ax.plot(x_values, baseline_vals, 'o-', color='skyblue',
|
||||
label=f'{baseline_name}', linewidth=2, markersize=6)
|
||||
ax.plot(x_values, compare_vals, 's--', color='lightcoral', alpha=0.8,
|
||||
label=f'{compare_name}', linewidth=2, markersize=6)
|
||||
|
||||
if log_scale:
|
||||
ax.set_xscale('log', base=2)
|
||||
unique_x = sorted(set(x_values))
|
||||
ax.set_xticks(unique_x)
|
||||
ax.set_xticklabels([str(int(x)) for x in unique_x])
|
||||
|
||||
title_parts = []
|
||||
for part in group_key:
|
||||
if '=' in part:
|
||||
key, value = part.split('=', 1)
|
||||
title_parts.append(f"{key}: {value}")
|
||||
|
||||
title = ', '.join(title_parts) if title_parts else "Performance comparison"
|
||||
|
||||
ax.set_xlabel(plot_x_label, fontsize=12, fontweight='bold')
|
||||
ax.set_ylabel('Tokens per second (t/s)', fontsize=12, fontweight='bold')
|
||||
ax.set_title(title, fontsize=12, fontweight='bold')
|
||||
ax.legend(loc='best', fontsize=10)
|
||||
ax.grid(True, alpha=0.3)
|
||||
|
||||
plot_idx += 1
|
||||
|
||||
for i in range(plot_idx, len(axes)):
|
||||
axes[i].set_visible(False)
|
||||
|
||||
fig.suptitle(f'Performance comparison: {compare_name} vs. {baseline_name}',
|
||||
fontsize=14, fontweight='bold')
|
||||
fig.subplots_adjust(top=1)
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
||||
plt.close()
|
||||
|
||||
create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale)
|
||||
|
||||
print(tabulate( # noqa: NP100
|
||||
table,
|
||||
headers=headers,
|
||||
|
||||
@@ -1 +1 @@
|
||||
94a83ba5a725ae2aee79df75dd99b2119d0478cc
|
||||
6a7d170c04789f6ebcf320ed03c1b16973f93bd7
|
||||
|
||||
@@ -20,6 +20,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_BERT, "bert" },
|
||||
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
|
||||
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
|
||||
{ LLM_ARCH_NEO_BERT, "neo-bert" },
|
||||
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
|
||||
{ LLM_ARCH_BLOOM, "bloom" },
|
||||
{ LLM_ARCH_STABLELM, "stablelm" },
|
||||
@@ -72,6 +73,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
|
||||
{ LLM_ARCH_PLM, "plm" },
|
||||
{ LLM_ARCH_BAILINGMOE, "bailingmoe" },
|
||||
{ LLM_ARCH_DOTS1, "dots1" },
|
||||
{ LLM_ARCH_ARCEE, "arcee" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
@@ -243,6 +246,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_ARCEE,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_LLAMA4,
|
||||
{
|
||||
@@ -494,6 +515,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_NEO_BERT,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
|
||||
{ LLM_TENSOR_CLS, "cls" },
|
||||
{ LLM_TENSOR_CLS_OUT, "cls.output" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_JINA_BERT_V2,
|
||||
{
|
||||
@@ -1555,6 +1591,34 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_DOTS1,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
|
||||
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
|
||||
}
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
|
||||
@@ -24,6 +24,7 @@ enum llm_arch {
|
||||
LLM_ARCH_BERT,
|
||||
LLM_ARCH_NOMIC_BERT,
|
||||
LLM_ARCH_NOMIC_BERT_MOE,
|
||||
LLM_ARCH_NEO_BERT,
|
||||
LLM_ARCH_JINA_BERT_V2,
|
||||
LLM_ARCH_BLOOM,
|
||||
LLM_ARCH_STABLELM,
|
||||
@@ -76,6 +77,8 @@ enum llm_arch {
|
||||
LLM_ARCH_WAVTOKENIZER_DEC,
|
||||
LLM_ARCH_PLM,
|
||||
LLM_ARCH_BAILINGMOE,
|
||||
LLM_ARCH_DOTS1,
|
||||
LLM_ARCH_ARCEE,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
|
||||
@@ -1,8 +1,14 @@
|
||||
#include "llama-batch.h"
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-cparams.h"
|
||||
#include "llama-vocab.h"
|
||||
#include "llama-memory.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
|
||||
llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
|
||||
// clear empty sequences
|
||||
@@ -105,12 +111,7 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s
|
||||
ubatch.seq_id = batch->seq_id + seq.offset;
|
||||
}
|
||||
}
|
||||
if (logits_all) {
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
ubatch.output[ubatch.n_tokens + i] = 1;
|
||||
out_ids.push_back(ids[seq.offset + i]);
|
||||
}
|
||||
} else if (batch->logits) {
|
||||
if (batch->logits) {
|
||||
if (ubatch.equal_seqs) {
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
size_t id = ids[seq.offset + i];
|
||||
@@ -197,11 +198,10 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
|
||||
return ubatch;
|
||||
}
|
||||
|
||||
llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
|
||||
llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split) {
|
||||
GGML_ASSERT(batch.n_tokens >= 0);
|
||||
this->batch = &batch;
|
||||
this->n_embd = n_embd;
|
||||
this->logits_all = logits_all;
|
||||
|
||||
n_tokens = batch.n_tokens;
|
||||
ids.resize(n_tokens);
|
||||
@@ -285,17 +285,56 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
|
||||
);
|
||||
}
|
||||
|
||||
llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0) {
|
||||
batch = in_batch;
|
||||
GGML_ASSERT(batch.n_tokens > 0);
|
||||
if (!batch.pos) {
|
||||
assert(p0 >= 0);
|
||||
pos.resize(batch.n_tokens);
|
||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||
pos[i] = p0 + i;
|
||||
}
|
||||
batch.pos = pos.data();
|
||||
llama_batch_allocr::llama_batch_allocr() {
|
||||
const char * LLAMA_BATCH_DEBUG = getenv("LLAMA_BATCH_DEBUG");
|
||||
debug = LLAMA_BATCH_DEBUG ? atoi(LLAMA_BATCH_DEBUG) : 0;
|
||||
|
||||
seq_pos.resize(LLAMA_MAX_SEQ);
|
||||
seq_cpl.resize(LLAMA_MAX_SEQ);
|
||||
for (auto & cur : seq_cpl) {
|
||||
cur.resize(LLAMA_MAX_SEQ);
|
||||
}
|
||||
}
|
||||
|
||||
bool llama_batch_allocr::init(
|
||||
const llama_batch & batch_inp,
|
||||
const llama_vocab & vocab,
|
||||
const llama_memory_i * memory,
|
||||
bool embd_all) {
|
||||
clear();
|
||||
|
||||
batch = batch_inp;
|
||||
|
||||
GGML_ASSERT(batch.n_tokens > 0);
|
||||
|
||||
//
|
||||
// validate input batch
|
||||
//
|
||||
|
||||
if (batch.token) {
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (batch.seq_id) {
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||
for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||
if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= LLAMA_MAX_SEQ)) {
|
||||
LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], LLAMA_MAX_SEQ);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// auto-generate missing fields
|
||||
//
|
||||
|
||||
if (!batch.n_seq_id) {
|
||||
n_seq_id.resize(batch.n_tokens);
|
||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||
@@ -303,6 +342,7 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
|
||||
}
|
||||
batch.n_seq_id = n_seq_id.data();
|
||||
}
|
||||
|
||||
if (!batch.seq_id) {
|
||||
seq_id.resize(batch.n_tokens + 1);
|
||||
seq_id[batch.n_tokens] = NULL;
|
||||
@@ -311,10 +351,221 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
|
||||
}
|
||||
batch.seq_id = seq_id.data();
|
||||
}
|
||||
|
||||
if (!batch.pos) {
|
||||
pos.resize(batch.n_tokens);
|
||||
|
||||
// initialize the starting position for each sequence based on the positions in the memory
|
||||
llama_pos p0[LLAMA_MAX_SEQ];
|
||||
for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
if (!memory) {
|
||||
p0[s] = 0;
|
||||
} else {
|
||||
p0[s] = memory->seq_pos_max(s) + 1;
|
||||
}
|
||||
}
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||
const llama_seq_id seq_id = batch.seq_id[i][0];
|
||||
|
||||
pos[i] = p0[seq_id];
|
||||
|
||||
for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||
p0[batch.seq_id[i][s]] = pos[i] + 1;
|
||||
}
|
||||
}
|
||||
|
||||
batch.pos = pos.data();
|
||||
}
|
||||
|
||||
if (!batch.logits) {
|
||||
logits.resize(batch.n_tokens);
|
||||
logits[logits.size() - 1] = true;
|
||||
batch.logits = logits.data();
|
||||
if (embd_all) {
|
||||
// return the output for all tokens
|
||||
output.resize(batch.n_tokens, true);
|
||||
} else {
|
||||
// return the output only for the last token
|
||||
output.resize(batch.n_tokens, false);
|
||||
output[output.size() - 1] = true;
|
||||
}
|
||||
|
||||
batch.logits = output.data();
|
||||
} else if (embd_all) {
|
||||
bool warn = false;
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||
if (batch.logits[i] == 0) {
|
||||
warn = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (warn) {
|
||||
LLAMA_LOG_WARN("%s: embeddings required but some input tokens were not marked as outputs -> overriding\n", __func__);
|
||||
|
||||
output.resize(batch.n_tokens, true);
|
||||
batch.logits = output.data();
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// compute stats
|
||||
//
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||
n_outputs += batch.logits[i] != 0;
|
||||
}
|
||||
|
||||
// determine coupled sequences
|
||||
// these are pairs of sequences that have at least one token in the input batch that is assigned to both of them
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||
for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||
seq_pos[batch.seq_id[i][s]].insert(batch.pos[i]);
|
||||
|
||||
if (s > 0) {
|
||||
const llama_seq_id s0 = batch.seq_id[i][0];
|
||||
const llama_seq_id s1 = batch.seq_id[i][s];
|
||||
|
||||
// mark that sequence s1 is coupled to s0
|
||||
seq_cpl[s1][s0] = true;
|
||||
|
||||
// note: the other way around is not necessary for now
|
||||
//seq_cpl[s0][s1] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (debug > 0) {
|
||||
LLAMA_LOG_DEBUG("%s: input batch info:\n", __func__);
|
||||
LLAMA_LOG_DEBUG("%s: n_tokens = %d\n", __func__, batch.n_tokens);
|
||||
LLAMA_LOG_DEBUG("%s: token = %p\n", __func__, (void *) batch.token);
|
||||
LLAMA_LOG_DEBUG("%s: embd = %p\n", __func__, (void *) batch.embd);
|
||||
LLAMA_LOG_DEBUG("%s: pos = %p\n", __func__, (void *) batch.pos);
|
||||
LLAMA_LOG_DEBUG("%s: n_seq_id = %p\n", __func__, (void *) batch.n_seq_id);
|
||||
LLAMA_LOG_DEBUG("%s: seq_id = %p\n", __func__, (void *) batch.seq_id);
|
||||
LLAMA_LOG_DEBUG("%s: logits = %p\n", __func__, (void *) batch.logits);
|
||||
LLAMA_LOG_DEBUG("%s: n_outputs = %d\n", __func__, n_outputs);
|
||||
|
||||
if (debug > 1) {
|
||||
int seq_id_max = 0;
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||
seq_id_max = std::max(seq_id_max, batch.seq_id[i][s]);
|
||||
}
|
||||
}
|
||||
}
|
||||
++seq_id_max;
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: token = [\n", __func__);
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i) {
|
||||
std::vector<int8_t> seq_id(seq_id_max);
|
||||
|
||||
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||
seq_id[batch.seq_id[i][s]] = 1;
|
||||
}
|
||||
|
||||
std::stringstream ss;
|
||||
for (int s = 0; s < seq_id_max; ++s) {
|
||||
if (seq_id[s]) {
|
||||
ss << s%10;
|
||||
} else {
|
||||
ss << ".";
|
||||
}
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: %4d: id = %6d (%16s), pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n",
|
||||
__func__, i, batch.token[i], vocab.token_to_piece(batch.token[i]).c_str(),
|
||||
batch.pos[i], batch.n_seq_id[i], ss.str().c_str(), batch.logits[i]);
|
||||
}
|
||||
LLAMA_LOG_DEBUG("%s: ]\n", __func__);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: seq = [\n", __func__);
|
||||
for (int s0 = 0; s0 < (int) seq_pos.size(); ++s0) {
|
||||
if (seq_pos[s0].empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::stringstream ss;
|
||||
for (int s1 = 0; s1 < (int) seq_cpl[s0].size(); ++s1) {
|
||||
if (seq_cpl[s0][s1]) {
|
||||
ss << s1 << " ";
|
||||
}
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: %4d: pos = [%4d, %4d], cpl = %s\n",
|
||||
__func__, s0, seq_pos_min(s0), seq_pos_max(s0), ss.str().empty() ? "-" : ss.str().c_str());
|
||||
}
|
||||
LLAMA_LOG_DEBUG("%s: ]\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// consistency checks
|
||||
//
|
||||
|
||||
for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
if (seq_pos[s].empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (memory && seq_pos_min(s) != memory->seq_pos_max(s) + 1) {
|
||||
LLAMA_LOG_ERROR("%s: sequence %d does not start from the last position stored in the memory\n", __func__, s);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (seq_pos_max(s) - seq_pos_min(s) + 1 > (int) seq_pos[s].size()) {
|
||||
LLAMA_LOG_ERROR("%s: sequence %d positions are not continuous\n", __func__, s);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (memory) {
|
||||
for (int32_t s0 = 0; s0 < LLAMA_MAX_SEQ; ++s0) {
|
||||
for (int32_t s1 = 0; s1 < LLAMA_MAX_SEQ; ++s1) {
|
||||
if (seq_cpl[s0][s1]) {
|
||||
if (memory->seq_pos_min(s0) != memory->seq_pos_min(s1) ||
|
||||
memory->seq_pos_max(s0) != memory->seq_pos_max(s1)) {
|
||||
LLAMA_LOG_ERROR("%s: sequence %d is coupled to %d in the input batch, but have divereged\n", __func__, s0, s1);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
const llama_batch & llama_batch_allocr::get_batch() const {
|
||||
return batch;
|
||||
}
|
||||
|
||||
uint32_t llama_batch_allocr::get_n_outputs() const {
|
||||
return n_outputs;
|
||||
}
|
||||
|
||||
llama_pos llama_batch_allocr::seq_pos_min(llama_seq_id seq_id) const {
|
||||
return seq_pos[seq_id].empty() ? -1 : *seq_pos[seq_id].begin();
|
||||
}
|
||||
|
||||
llama_pos llama_batch_allocr::seq_pos_max(llama_seq_id seq_id) const {
|
||||
return seq_pos[seq_id].empty() ? -1 : *seq_pos[seq_id].rbegin();
|
||||
}
|
||||
|
||||
void llama_batch_allocr::clear() {
|
||||
n_outputs = 0;
|
||||
|
||||
batch = {};
|
||||
pos.clear();
|
||||
n_seq_id.clear();
|
||||
seq_id.clear();
|
||||
output.clear();
|
||||
|
||||
for (auto & cur : seq_pos) {
|
||||
cur.clear();
|
||||
}
|
||||
|
||||
for (auto & cur : seq_cpl) {
|
||||
std::fill(cur.begin(), cur.end(), false);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
#include <array>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
|
||||
// very similar to llama_batch,
|
||||
// but has more metadata about sequences
|
||||
@@ -18,8 +19,8 @@ struct llama_ubatch {
|
||||
llama_token * token; // [n_tokens]
|
||||
float * embd; // [n_embd, n_tokens]
|
||||
llama_pos * pos; // [n_tokens]
|
||||
int32_t * n_seq_id; // [n_seqs] // TODO: remove, should belong to only 1 sequence
|
||||
llama_seq_id ** seq_id; // [n_seqs] // TODO: become llama_seq_id * seq_id;
|
||||
int32_t * n_seq_id; // [n_seqs]
|
||||
llama_seq_id ** seq_id; // [n_seqs]
|
||||
int8_t * output; // [n_tokens]
|
||||
};
|
||||
|
||||
@@ -39,8 +40,6 @@ struct llama_sbatch {
|
||||
|
||||
size_t n_embd;
|
||||
|
||||
bool logits_all; // TODO: remove once lctx.logits_all is removed too
|
||||
|
||||
// sorted indices into the batch
|
||||
std::vector<int64_t> ids;
|
||||
// batch indices of the output
|
||||
@@ -76,19 +75,45 @@ struct llama_sbatch {
|
||||
llama_ubatch split_seq(size_t n_ubatch);
|
||||
|
||||
llama_sbatch() = default;
|
||||
llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
|
||||
llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false);
|
||||
};
|
||||
|
||||
// temporary allocate memory for the input batch if needed
|
||||
struct llama_batch_allocr {
|
||||
struct llama_batch batch;
|
||||
// a helper for sanitizing and fulfilling a batch
|
||||
class llama_batch_allocr {
|
||||
public:
|
||||
llama_batch_allocr();
|
||||
|
||||
// sanitize and auto-gen missing data in the input batch
|
||||
// memory is optional. if provided will be used to check for sequence continuity and to determine the positions
|
||||
bool init(
|
||||
const llama_batch & batch_inp,
|
||||
const llama_vocab & vocab,
|
||||
const llama_memory_i * memory,
|
||||
bool embd_all);
|
||||
|
||||
const llama_batch & get_batch() const;
|
||||
|
||||
uint32_t get_n_outputs() const;
|
||||
|
||||
llama_pos seq_pos_min(llama_seq_id seq_id) const;
|
||||
llama_pos seq_pos_max(llama_seq_id seq_id) const;
|
||||
|
||||
private:
|
||||
void clear();
|
||||
|
||||
llama_batch batch;
|
||||
|
||||
uint32_t n_outputs;
|
||||
|
||||
std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
|
||||
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id *> seq_id;
|
||||
std::vector<int8_t> logits;
|
||||
std::vector<int8_t> output;
|
||||
|
||||
// optionally fulfill the batch returned by llama_batch_get_one
|
||||
llama_batch_allocr(struct llama_batch in_batch, llama_pos p0);
|
||||
std::vector<std::set<llama_pos>> seq_pos; // seq_pos[s]: the set of positions in sequence s
|
||||
std::vector<std::vector<bool>> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1
|
||||
|
||||
int debug;
|
||||
};
|
||||
|
||||
@@ -183,6 +183,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
return LLM_CHAT_TEMPLATE_BAILING;
|
||||
} else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
|
||||
return LLM_CHAT_TEMPLATE_LLAMA4;
|
||||
} else if (tmpl_contains("<|endofuserprompt|>")) {
|
||||
return LLM_CHAT_TEMPLATE_DOTS1;
|
||||
}
|
||||
return LLM_CHAT_TEMPLATE_UNKNOWN;
|
||||
}
|
||||
@@ -643,6 +645,21 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "Assistant:";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_DOTS1) {
|
||||
// dots.llm1.inst (DOTS1)
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
if (role == "system") {
|
||||
ss << "<|system|>" << message->content << "<|endofsystem|>";
|
||||
} else if (role == "user") {
|
||||
ss << "<|userprompt|>" << message->content << "<|endofuserprompt|>";
|
||||
} else {
|
||||
ss << "<|response|>" << message->content << "<|endofresponse|>";
|
||||
}
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|response|>";
|
||||
}
|
||||
} else {
|
||||
// template not supported
|
||||
return -1;
|
||||
|
||||
@@ -43,6 +43,7 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_BAILING,
|
||||
LLM_CHAT_TEMPLATE_LLAMA4,
|
||||
LLM_CHAT_TEMPLATE_SMOLVLM,
|
||||
LLM_CHAT_TEMPLATE_DOTS1,
|
||||
LLM_CHAT_TEMPLATE_UNKNOWN,
|
||||
};
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "llama-context.h"
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-batch.h"
|
||||
#include "llama-io.h"
|
||||
#include "llama-memory.h"
|
||||
#include "llama-mmap.h"
|
||||
@@ -18,7 +19,8 @@
|
||||
llama_context::llama_context(
|
||||
const llama_model & model,
|
||||
llama_context_params params) :
|
||||
model(model) {
|
||||
model(model),
|
||||
batch_allocr(std::make_unique<llama_batch_allocr>()) {
|
||||
LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);
|
||||
|
||||
t_start_us = model.t_start_us;
|
||||
@@ -27,8 +29,8 @@ llama_context::llama_context(
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
cparams.n_seq_max = std::max(1u, params.n_seq_max);
|
||||
if (cparams.n_seq_max > LLAMA_MAX_PARALLEL_SEQUENCES) {
|
||||
throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_PARALLEL_SEQUENCES));
|
||||
if (cparams.n_seq_max > LLAMA_MAX_SEQ) {
|
||||
throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_SEQ));
|
||||
}
|
||||
|
||||
cparams.n_threads = params.n_threads;
|
||||
@@ -494,7 +496,7 @@ float * llama_context::get_logits() {
|
||||
}
|
||||
|
||||
float * llama_context::get_logits_ith(int32_t i) {
|
||||
int32_t j = -1;
|
||||
int64_t j = -1;
|
||||
|
||||
try {
|
||||
if (logits == nullptr) {
|
||||
@@ -517,7 +519,7 @@ float * llama_context::get_logits_ith(int32_t i) {
|
||||
}
|
||||
if (j >= n_outputs) {
|
||||
// This should not happen
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
|
||||
}
|
||||
|
||||
return logits + j*model.vocab.n_tokens();
|
||||
@@ -536,7 +538,7 @@ float * llama_context::get_embeddings() {
|
||||
}
|
||||
|
||||
float * llama_context::get_embeddings_ith(int32_t i) {
|
||||
int32_t j = -1;
|
||||
int64_t j = -1;
|
||||
|
||||
try {
|
||||
if (embd == nullptr) {
|
||||
@@ -559,7 +561,7 @@ float * llama_context::get_embeddings_ith(int32_t i) {
|
||||
}
|
||||
if (j >= n_outputs) {
|
||||
// This should not happen
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
|
||||
throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
|
||||
}
|
||||
|
||||
return embd + j*model.hparams.n_embd;
|
||||
@@ -719,52 +721,41 @@ llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch,
|
||||
return res;
|
||||
}
|
||||
|
||||
int llama_context::encode(llama_batch & inp_batch) {
|
||||
if (inp_batch.n_tokens == 0) {
|
||||
int llama_context::encode(const llama_batch & batch_inp) {
|
||||
if (batch_inp.n_tokens == 0) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// temporary allocate memory for the input batch if needed
|
||||
// note: during encode, we always pass the full sequence starting from pos = 0
|
||||
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : 0);
|
||||
if (!batch_allocr->init(batch_inp, model.vocab, nullptr, true)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
const llama_batch & batch = batch_allocr.batch;
|
||||
const int32_t n_tokens = batch.n_tokens;
|
||||
const llama_batch & batch = batch_allocr->get_batch();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
const uint32_t n_tokens = batch.n_tokens;
|
||||
|
||||
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
||||
|
||||
// TODO: move the validation to the llama_batch_allocr
|
||||
if (batch.token) {
|
||||
for (int32_t i = 0; i < n_tokens; ++i) {
|
||||
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
|
||||
LLAMA_LOG_ERROR("%s: invalid seq_id[%d] = %d > %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES);
|
||||
throw -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
|
||||
GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens");
|
||||
GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
|
||||
|
||||
if (t_compute_start_us == 0) {
|
||||
t_compute_start_us = ggml_time_us();
|
||||
}
|
||||
|
||||
// TODO: this clear of the buffer can easily be forgotten - need something better
|
||||
embd_seq.clear();
|
||||
|
||||
n_queued_tokens += n_tokens;
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
|
||||
llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true);
|
||||
|
||||
const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
|
||||
|
||||
@@ -774,7 +765,7 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
return -2;
|
||||
};
|
||||
|
||||
for (int32_t i = 0; i < n_tokens; ++i) {
|
||||
for (uint32_t i = 0; i < n_tokens; ++i) {
|
||||
output_ids[i] = i;
|
||||
}
|
||||
|
||||
@@ -830,7 +821,8 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
|
||||
GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits
|
||||
|
||||
for (int32_t i = 0; i < n_tokens; i++) {
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
const llama_seq_id seq_id = ubatch.seq_id[i][0];
|
||||
if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
|
||||
continue;
|
||||
@@ -845,6 +837,7 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
auto & embd_seq_out = embd_seq;
|
||||
const uint32_t n_cls_out = hparams.n_cls_out;
|
||||
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = ubatch.seq_id[s][0];
|
||||
if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
|
||||
@@ -878,10 +871,10 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
|
||||
// remember the sequence ids used during the encoding - needed for cross attention later
|
||||
cross.seq_ids_enc.resize(n_tokens);
|
||||
for (int32_t i = 0; i < n_tokens; i++) {
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
cross.seq_ids_enc[i].clear();
|
||||
for (int s = 0; s < ubatch.n_seq_id[i]; s++) {
|
||||
llama_seq_id seq_id = ubatch.seq_id[i][s];
|
||||
for (int s = 0; s < batch.n_seq_id[i]; s++) {
|
||||
llama_seq_id seq_id = batch.seq_id[i][s];
|
||||
cross.seq_ids_enc[i].insert(seq_id);
|
||||
}
|
||||
}
|
||||
@@ -890,51 +883,45 @@ int llama_context::encode(llama_batch & inp_batch) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int llama_context::decode(llama_batch & inp_batch) {
|
||||
int llama_context::decode(const llama_batch & batch_inp) {
|
||||
if (!memory) {
|
||||
LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
|
||||
return encode(inp_batch);
|
||||
return encode(batch_inp);
|
||||
}
|
||||
|
||||
if (inp_batch.n_tokens == 0) {
|
||||
if (batch_inp.n_tokens == 0) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (!inp_batch.pos) {
|
||||
if (inp_batch.seq_id) {
|
||||
LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
// when computing embeddings, all tokens are output
|
||||
const bool embd_all = cparams.embeddings;
|
||||
|
||||
if (!batch_allocr->init(batch_inp, model.vocab, memory.get(), embd_all)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// temporary allocate memory for the input batch if needed
|
||||
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : memory->seq_pos_max(0) + 1);
|
||||
|
||||
const llama_batch & batch = batch_allocr.batch;
|
||||
const llama_batch & batch = batch_allocr->get_batch();
|
||||
|
||||
const auto & vocab = model.vocab;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int32_t n_vocab = vocab.n_tokens();
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
const int64_t n_tokens_all = batch.n_tokens;
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const uint32_t n_tokens_all = batch.n_tokens;
|
||||
|
||||
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
||||
|
||||
// TODO: move the validation to the llama_batch_allocr
|
||||
if (batch.token) {
|
||||
for (int64_t i = 0; i < n_tokens_all; ++i) {
|
||||
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
|
||||
LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]);
|
||||
return -1;
|
||||
}
|
||||
const uint32_t n_outputs_all = batch_allocr->get_n_outputs();
|
||||
|
||||
if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
|
||||
LLAMA_LOG_ERROR("%s: invalid seq_id[%" PRId64 "] = %d >= %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES);
|
||||
return -1;
|
||||
}
|
||||
if (embd_all) {
|
||||
// require that all tokens are output
|
||||
if (n_outputs_all != n_tokens_all) {
|
||||
LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %d, n_tokens_all = %d)\n",
|
||||
__func__, n_outputs_all, n_tokens_all);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -947,25 +934,9 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
}
|
||||
n_queued_tokens += n_tokens_all;
|
||||
|
||||
// this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
|
||||
const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
|
||||
|
||||
// TODO: this clear of the buffer can easily be forgotten - need something better
|
||||
embd_seq.clear();
|
||||
|
||||
int64_t n_outputs_all = 0;
|
||||
|
||||
// count outputs
|
||||
if (batch.logits && !embd_pooled) {
|
||||
for (uint32_t i = 0; i < n_tokens_all; ++i) {
|
||||
n_outputs_all += batch.logits[i] != 0;
|
||||
}
|
||||
} else if (embd_pooled) {
|
||||
n_outputs_all = n_tokens_all;
|
||||
} else {
|
||||
// keep last output only
|
||||
n_outputs_all = 1;
|
||||
}
|
||||
|
||||
bool did_optimize = false;
|
||||
|
||||
// handle any pending defrags/shifts
|
||||
@@ -974,7 +945,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
llama_memory_state_ptr mstate;
|
||||
|
||||
while (true) {
|
||||
mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
|
||||
mstate = memory->init_batch(batch, cparams.n_ubatch, embd_all);
|
||||
if (!mstate) {
|
||||
return -2;
|
||||
}
|
||||
@@ -1018,7 +989,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
|
||||
// reserve output buffer
|
||||
if (output_reserve(n_outputs_all) < n_outputs_all) {
|
||||
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all);
|
||||
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
|
||||
return -2;
|
||||
};
|
||||
|
||||
@@ -1027,7 +998,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
do {
|
||||
const auto & ubatch = mstate->get_ubatch();
|
||||
|
||||
// count the outputs in this u_batch
|
||||
// count the outputs in this ubatch
|
||||
{
|
||||
int32_t n_outputs_new = 0;
|
||||
|
||||
@@ -1052,18 +1023,19 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
|
||||
if (!res) {
|
||||
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
|
||||
llama_pos pos_min[LLAMA_MAX_PARALLEL_SEQUENCES];
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
llama_pos pos_min[LLAMA_MAX_SEQ];
|
||||
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
pos_min[s] = std::numeric_limits<llama_pos>::max();
|
||||
}
|
||||
|
||||
// TODO: fix sequence indexing
|
||||
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
|
||||
const auto & seq_id = ubatch.seq_id[i][0];
|
||||
|
||||
pos_min[seq_id] = std::min(pos_min[seq_id], ubatch.pos[i]);
|
||||
}
|
||||
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
if (pos_min[s] == std::numeric_limits<llama_pos>::max()) {
|
||||
continue;
|
||||
}
|
||||
@@ -1086,7 +1058,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
||||
//}
|
||||
|
||||
auto * t_logits = cparams.embeddings ? nullptr : res->get_logits();
|
||||
auto * t_logits = res->get_logits();
|
||||
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
|
||||
|
||||
if (t_embd && res->get_embd_pooled()) {
|
||||
@@ -1170,14 +1142,14 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
n_outputs = n_outputs_all;
|
||||
|
||||
// set output mappings
|
||||
{
|
||||
if (n_outputs > 0) {
|
||||
bool sorted_output = true;
|
||||
|
||||
auto & out_ids = mstate->out_ids();
|
||||
|
||||
GGML_ASSERT(out_ids.size() == (size_t) n_outputs_all);
|
||||
GGML_ASSERT(out_ids.size() == (size_t) n_outputs);
|
||||
|
||||
for (int64_t i = 0; i < n_outputs_all; ++i) {
|
||||
for (int64_t i = 0; i < n_outputs; ++i) {
|
||||
int64_t out_id = out_ids[i];
|
||||
output_ids[out_id] = i;
|
||||
if (out_id != i) {
|
||||
@@ -1189,20 +1161,22 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
// note: this is mostly relevant for recurrent models atm
|
||||
if (!sorted_output) {
|
||||
const uint32_t n_vocab = model.vocab.n_tokens();
|
||||
const uint32_t n_embd = model.hparams.n_embd;
|
||||
const uint64_t n_embd = model.hparams.n_embd;
|
||||
|
||||
GGML_ASSERT((size_t) n_outputs == out_ids.size());
|
||||
|
||||
// TODO: is there something more efficient which also minimizes swaps?
|
||||
// selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
|
||||
for (int32_t i = 0; i < n_outputs - 1; ++i) {
|
||||
int32_t j_min = i;
|
||||
for (int32_t j = i + 1; j < n_outputs; ++j) {
|
||||
for (uint32_t i = 0; i < n_outputs - 1; ++i) {
|
||||
uint32_t j_min = i;
|
||||
for (uint32_t j = i + 1; j < n_outputs; ++j) {
|
||||
if (out_ids[j] < out_ids[j_min]) {
|
||||
j_min = j;
|
||||
}
|
||||
}
|
||||
if (j_min == i) { continue; }
|
||||
if (j_min == i) {
|
||||
continue;
|
||||
}
|
||||
std::swap(out_ids[i], out_ids[j_min]);
|
||||
if (logits_size > 0) {
|
||||
for (uint32_t k = 0; k < n_vocab; k++) {
|
||||
@@ -1215,8 +1189,10 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::fill(output_ids.begin(), output_ids.end(), -1);
|
||||
for (int32_t i = 0; i < n_outputs; ++i) {
|
||||
|
||||
for (uint32_t i = 0; i < n_outputs; ++i) {
|
||||
output_ids[out_ids[i]] = i;
|
||||
}
|
||||
}
|
||||
@@ -1236,7 +1212,7 @@ int llama_context::decode(llama_batch & inp_batch) {
|
||||
// output
|
||||
//
|
||||
|
||||
int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
const auto & hparams = model.hparams;
|
||||
const auto & vocab = model.vocab;
|
||||
|
||||
@@ -1246,9 +1222,8 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
const auto n_vocab = vocab.n_tokens();
|
||||
const auto n_embd = hparams.n_embd;
|
||||
|
||||
// TODO: use a per-batch flag for logits presence instead
|
||||
bool has_logits = !cparams.embeddings;
|
||||
bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
|
||||
bool has_logits = true;
|
||||
bool has_embd = cparams.embeddings;
|
||||
|
||||
// TODO: hacky enc-dec support
|
||||
if (model.arch == LLM_ARCH_T5) {
|
||||
@@ -1302,8 +1277,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
|
||||
// set all ids as invalid (negative)
|
||||
std::fill(output_ids.begin(), output_ids.end(), -1);
|
||||
|
||||
this->n_outputs = 0;
|
||||
this->n_outputs_max = n_outputs_max;
|
||||
this->n_outputs = 0;
|
||||
|
||||
return n_outputs_max;
|
||||
}
|
||||
@@ -1332,7 +1306,7 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
|
||||
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||
|
||||
if (n_tokens % n_seqs != 0) {
|
||||
n_tokens = (n_tokens / n_seqs) * n_seqs;
|
||||
n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
|
||||
n_outputs = std::min(n_outputs, n_tokens);
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
|
||||
@@ -1794,14 +1768,12 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
|
||||
|
||||
std::vector<int32_t> w_output_pos;
|
||||
|
||||
GGML_ASSERT(n_outputs <= n_outputs_max);
|
||||
|
||||
w_output_pos.resize(n_outputs);
|
||||
|
||||
// build a more compact representation of the output ids
|
||||
for (size_t i = 0; i < n_batch(); ++i) {
|
||||
// map an output id to a position in the batch
|
||||
int32_t pos = output_ids[i];
|
||||
int64_t pos = output_ids[i];
|
||||
if (pos >= 0) {
|
||||
GGML_ASSERT(pos < n_outputs);
|
||||
w_output_pos[pos] = i;
|
||||
@@ -2071,14 +2043,11 @@ void llama_context::opt_epoch_iter(
|
||||
|
||||
n_queued_tokens += n_tokens_all;
|
||||
|
||||
// this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
|
||||
const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
|
||||
|
||||
embd_seq.clear();
|
||||
|
||||
int64_t n_outputs_all = n_tokens_all;
|
||||
uint32_t n_outputs_all = n_tokens_all;
|
||||
|
||||
auto mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ true);
|
||||
auto mstate = memory->init_batch(batch, cparams.n_ubatch, true);
|
||||
if (!mstate || mstate->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
|
||||
LLAMA_LOG_ERROR("%s: could not initialize batch\n", __func__);
|
||||
break;
|
||||
@@ -2086,7 +2055,7 @@ void llama_context::opt_epoch_iter(
|
||||
|
||||
// reserve output buffer
|
||||
if (output_reserve(n_outputs_all) < n_outputs_all) {
|
||||
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all);
|
||||
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
|
||||
GGML_ABORT("TODO: handle this error");
|
||||
};
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-batch.h"
|
||||
#include "llama-cparams.h"
|
||||
#include "llama-graph.h"
|
||||
#include "llama-adapter.h"
|
||||
@@ -13,6 +12,7 @@
|
||||
#include <vector>
|
||||
|
||||
struct llama_model;
|
||||
class llama_batch_allocr;
|
||||
|
||||
class llama_io_read_i;
|
||||
class llama_io_write_i;
|
||||
@@ -102,8 +102,8 @@ struct llama_context {
|
||||
llama_memory_state_i * mstate,
|
||||
ggml_status & ret);
|
||||
|
||||
int encode(llama_batch & inp_batch);
|
||||
int decode(llama_batch & inp_batch);
|
||||
int encode(const llama_batch & batch_inp);
|
||||
int decode(const llama_batch & batch_inp);
|
||||
|
||||
//
|
||||
// state save/load
|
||||
@@ -181,7 +181,7 @@ private:
|
||||
|
||||
// Make sure enough space is available for outputs.
|
||||
// Returns max number of outputs for which space was reserved.
|
||||
int32_t output_reserve(int32_t n_outputs);
|
||||
uint32_t output_reserve(int32_t n_outputs);
|
||||
|
||||
//
|
||||
// graph
|
||||
@@ -246,8 +246,10 @@ private:
|
||||
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
|
||||
std::map<llama_seq_id, std::vector<float>> embd_seq;
|
||||
|
||||
int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
|
||||
int32_t n_outputs_max = 0; // capacity (of tokens positions) for the output buffers
|
||||
// reuse the batch_allocr to avoid unnecessary memory allocations
|
||||
std::unique_ptr<llama_batch_allocr> batch_allocr;
|
||||
|
||||
uint32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
|
||||
|
||||
std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#include "llama-cparams.h"
|
||||
|
||||
size_t llama_max_parallel_sequences(void) {
|
||||
return LLAMA_MAX_PARALLEL_SEQUENCES;
|
||||
return LLAMA_MAX_SEQ;
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#define LLAMA_MAX_PARALLEL_SEQUENCES 64
|
||||
#define LLAMA_MAX_SEQ 64
|
||||
|
||||
struct llama_cparams {
|
||||
uint32_t n_ctx; // context size used during inference
|
||||
|
||||
@@ -139,6 +139,7 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
|
||||
|
||||
std::vector<uint64_t> sum(n_tokens, 0);
|
||||
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (int s = 0; s < n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = ubatch->seq_id[s][0];
|
||||
|
||||
@@ -156,6 +157,7 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (int s = 0; s < n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = ubatch->seq_id[s][0];
|
||||
|
||||
@@ -180,6 +182,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
|
||||
uint32_t * data = (uint32_t *) cls->data;
|
||||
memset(cls->data, 0, n_tokens * ggml_element_size(cls));
|
||||
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (int s = 0; s < n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = ubatch->seq_id[s][0];
|
||||
|
||||
@@ -210,6 +213,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
|
||||
std::vector<int> last_pos(n_tokens, -1);
|
||||
std::vector<int> last_row(n_tokens, -1);
|
||||
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (int s = 0; s < n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = ubatch->seq_id[s][0];
|
||||
|
||||
@@ -250,22 +254,6 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
|
||||
GGML_UNUSED(ubatch);
|
||||
|
||||
const int64_t n_kv = kv_state->get_n_kv();
|
||||
|
||||
if (s_mask) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(s_mask->buffer));
|
||||
float * data = (float *) s_mask->data;
|
||||
|
||||
// clear unused states
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
data[i] = kv_state->s_mask(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
|
||||
GGML_UNUSED(ubatch);
|
||||
|
||||
@@ -299,6 +287,7 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
|
||||
const int32_t ti = s0*n_seq_tokens + i;
|
||||
float f = -INFINITY;
|
||||
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
|
||||
if (ubatch->seq_id[s0][s] == seq_id && ubatch->pos[ti] <= ubatch->pos[tj]) {
|
||||
if (hparams.use_alibi) {
|
||||
@@ -338,6 +327,7 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
|
||||
const int32_t ti = s0*n_seq_tokens + i;
|
||||
float f = -INFINITY;
|
||||
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
|
||||
if (ubatch->seq_id[s0][s] == seq_id) {
|
||||
if (hparams.use_alibi) {
|
||||
@@ -393,6 +383,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
||||
for (int j = 0; j < n_tokens; ++j) {
|
||||
for (int i = 0; i < n_enc; ++i) {
|
||||
float f = -INFINITY;
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (int s = 0; s < ubatch->n_seq_id[j]; ++s) {
|
||||
const llama_seq_id seq_id = ubatch->seq_id[j][s];
|
||||
if (cross->seq_ids_enc[i].find(seq_id) != cross->seq_ids_enc[i].end()) {
|
||||
@@ -650,6 +641,7 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||
{
|
||||
// Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
|
||||
int64_t split_point = cur->ne[0] / 2;
|
||||
// TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
|
||||
ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
|
||||
ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
|
||||
|
||||
@@ -663,7 +655,7 @@ ggml_tensor * llm_graph_context::build_ffn(
|
||||
{
|
||||
// Split into two equal parts
|
||||
int64_t split_point = cur->ne[0] / 2;
|
||||
// TODO: these conts should not be needed
|
||||
// TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
|
||||
ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
|
||||
ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
|
||||
|
||||
@@ -986,23 +978,6 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
|
||||
return cur;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_s_mask() const {
|
||||
const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_s_mask>(kv_state);
|
||||
|
||||
const auto n_kv = kv_state->get_n_kv();
|
||||
|
||||
auto & cur = inp->s_mask;
|
||||
|
||||
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
|
||||
ggml_set_input(cur);
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
|
||||
auto inp = std::make_unique<llm_graph_input_cross_embd>(cross);
|
||||
|
||||
@@ -1455,43 +1430,53 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
return cur;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_copy_mask_state(
|
||||
ggml_tensor * llm_graph_context::build_recurrent_state(
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * s,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
int32_t n_state,
|
||||
int32_t n_seqs) const {
|
||||
int32_t state_size,
|
||||
int32_t n_seqs,
|
||||
bool avoid_copies) const {
|
||||
const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
|
||||
|
||||
const auto n_kv = kv_state->get_n_kv();
|
||||
const auto kv_head = kv_state->get_head();
|
||||
const auto rs_zero = kv_state->get_rs_z();
|
||||
|
||||
ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_state->get_size());
|
||||
ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, kv_state->get_size());
|
||||
|
||||
// copy states
|
||||
// NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
|
||||
// this shrinks the tensors's ne[1] to n_kv
|
||||
states = ggml_get_rows(ctx0, states, state_copy);
|
||||
// Clear a single state which will then be copied to the other cleared states.
|
||||
// Note that this is a no-op when the view is zero-sized.
|
||||
ggml_tensor * state_zero = ggml_view_1d(ctx0, states, state_size*(rs_zero >= 0), rs_zero*states->nb[1]*(rs_zero >= 0));
|
||||
ggml_build_forward_expand(gf, ggml_scale_inplace(ctx0, state_zero, 0));
|
||||
|
||||
// clear states of sequences which are starting at the beginning of this batch
|
||||
// FIXME: zero-out NANs?
|
||||
states = ggml_mul(ctx0, states, state_mask);
|
||||
ggml_tensor * output_states;
|
||||
|
||||
// copy states which won't be changed further (between n_seqs and n_kv)
|
||||
if (!avoid_copies) {
|
||||
// copy states
|
||||
// NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
|
||||
// {state_size, kv_size} -> {state_size, n_seqs}
|
||||
output_states = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_seqs, 0));
|
||||
ggml_build_forward_expand(gf, output_states);
|
||||
} else {
|
||||
// FIXME: make the gathering operation happen before the copy below
|
||||
// (maybe with an optional lambda function passed as a parameter instead of `avoid_copies`?)
|
||||
output_states = states;
|
||||
}
|
||||
|
||||
// copy extra states which won't be changed further (between n_seqs and n_kv)
|
||||
ggml_tensor * states_extra = ggml_get_rows(ctx0, states, ggml_view_1d(ctx0, state_copy, n_kv - n_seqs, n_seqs*state_copy->nb[0]));
|
||||
ggml_build_forward_expand(gf,
|
||||
ggml_cpy(ctx0,
|
||||
ggml_view_1d(ctx0, states, n_state*(n_kv - n_seqs), (n_seqs )*n_state*ggml_element_size(states)),
|
||||
ggml_view_1d(ctx0, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s))));
|
||||
states_extra,
|
||||
ggml_view_1d(ctx0, s, state_size*(n_kv - n_seqs), (kv_head + n_seqs)*state_size*ggml_element_size(s))));
|
||||
|
||||
// the part of the states that will be used and modified
|
||||
return ggml_view_2d(ctx0, states, n_state, n_seqs, states->nb[1], 0);
|
||||
return output_states;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const {
|
||||
const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
|
||||
@@ -1502,8 +1487,8 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
|
||||
|
||||
ggml_tensor * token_shift_all = kv_state->get_k_l(il);
|
||||
|
||||
ggml_tensor * token_shift = build_copy_mask_state(
|
||||
gf, token_shift_all, state_copy, state_mask,
|
||||
ggml_tensor * token_shift = build_recurrent_state(
|
||||
gf, token_shift_all, state_copy,
|
||||
hparams.n_embd_k_s(), n_seqs);
|
||||
|
||||
token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs);
|
||||
@@ -1578,23 +1563,30 @@ void llm_graph_context::build_pooling(
|
||||
ggml_tensor * inp_cls = build_inp_cls();
|
||||
inp = ggml_get_rows(ctx0, inp, inp_cls);
|
||||
|
||||
if (cls != nullptr && cls_b != nullptr) {
|
||||
if (cls) {
|
||||
// classification head
|
||||
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
|
||||
cur = ggml_mul_mat(ctx0, cls, inp);
|
||||
if (cls_b) {
|
||||
cur = ggml_add(ctx0, cur, cls_b);
|
||||
}
|
||||
cur = ggml_tanh(ctx0, cur);
|
||||
|
||||
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
||||
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
|
||||
if (cls_out) {
|
||||
GGML_ASSERT(cls_out_b != nullptr);
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
|
||||
cur = ggml_mul_mat(ctx0, cls_out, cur);
|
||||
if (cls_out_b) {
|
||||
cur = ggml_add(ctx0, cur, cls_out_b);
|
||||
}
|
||||
}
|
||||
} else if (cls_out) {
|
||||
// Single layer classification head (direct projection)
|
||||
// https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
|
||||
GGML_ASSERT(cls_out_b != nullptr);
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b);
|
||||
cur = ggml_mul_mat(ctx0, cls_out, inp);
|
||||
if (cls_out_b) {
|
||||
cur = ggml_add(ctx0, cur, cls_out_b);
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
|
||||
}
|
||||
|
||||
@@ -200,18 +200,6 @@ public:
|
||||
const llama_kv_cache_recurrent_state * kv_state;
|
||||
};
|
||||
|
||||
class llm_graph_input_s_mask : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_s_mask(const llama_kv_cache_recurrent_state * kv_state) : kv_state(kv_state) {}
|
||||
virtual ~llm_graph_input_s_mask() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
ggml_tensor * s_mask; // F32 [1, n_kv]
|
||||
|
||||
const llama_kv_cache_recurrent_state * kv_state;
|
||||
};
|
||||
|
||||
class llm_graph_input_cross_embd : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_cross_embd(
|
||||
@@ -390,7 +378,7 @@ struct llm_graph_params {
|
||||
const llama_memory_state_i * mstate;
|
||||
const llama_cross * cross;
|
||||
|
||||
int32_t n_outputs;
|
||||
uint32_t n_outputs;
|
||||
|
||||
const llm_graph_cb & cb;
|
||||
};
|
||||
@@ -424,8 +412,8 @@ struct llm_graph_context {
|
||||
const float norm_eps;
|
||||
const float norm_rms_eps;
|
||||
|
||||
const int32_t n_tokens;
|
||||
const int32_t n_outputs;
|
||||
const int64_t n_tokens;
|
||||
const int64_t n_outputs;
|
||||
const int32_t n_ctx_orig; // yarn
|
||||
|
||||
const enum llama_pooling_type pooling_type;
|
||||
@@ -521,7 +509,6 @@ struct llm_graph_context {
|
||||
ggml_tensor * build_inp_mean() const;
|
||||
ggml_tensor * build_inp_cls() const;
|
||||
ggml_tensor * build_inp_s_copy() const;
|
||||
ggml_tensor * build_inp_s_mask() const;
|
||||
|
||||
ggml_tensor * build_inp_cross_embd() const;
|
||||
ggml_tensor * build_inp_pos_bucket_enc() const;
|
||||
@@ -606,18 +593,17 @@ struct llm_graph_context {
|
||||
// recurrent
|
||||
//
|
||||
|
||||
ggml_tensor * build_copy_mask_state(
|
||||
ggml_tensor * build_recurrent_state(
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * s,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
int32_t n_state,
|
||||
int32_t n_seqs) const;
|
||||
int32_t state_size,
|
||||
int32_t n_seqs,
|
||||
bool avoid_copies = false) const;
|
||||
|
||||
ggml_tensor * build_rwkv_token_shift_load(
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const;
|
||||
|
||||
|
||||
@@ -359,18 +359,16 @@ llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const {
|
||||
return result;
|
||||
}
|
||||
|
||||
llama_memory_state_ptr llama_kv_cache_recurrent::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) {
|
||||
GGML_UNUSED(embd_pooled);
|
||||
|
||||
auto sbatch = llama_sbatch(batch, hparams.n_embd, false, logits_all);
|
||||
llama_memory_state_ptr llama_kv_cache_recurrent::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_all) {
|
||||
auto sbatch = llama_sbatch(batch, hparams.n_embd, false);
|
||||
|
||||
std::vector<llama_ubatch> ubatches;
|
||||
|
||||
while (sbatch.n_tokens > 0) {
|
||||
llama_ubatch ubatch;
|
||||
|
||||
if (embd_pooled) {
|
||||
// Pooled embeddings cannot be split across ubatches (yet)
|
||||
if (embd_all) {
|
||||
// if all tokens are output, split by sequence
|
||||
ubatch = sbatch.split_seq(n_ubatch);
|
||||
} else {
|
||||
ubatch = sbatch.split_equal(n_ubatch);
|
||||
@@ -406,21 +404,12 @@ bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatche
|
||||
|
||||
bool success = true;
|
||||
|
||||
// TODO: here we have to verify that all ubatches can fit in the cells
|
||||
// however, the current implementation is broken because it relies on s_copy() and s_mask() to update the cells
|
||||
// during the compute of each ubatch. to reproduce, uncomment the following loop and run:
|
||||
//
|
||||
// $ llama-parallel -m ./mamba-130m/ggml-model-f16.gguf -np 5 -ns 8
|
||||
//
|
||||
// recovery from failures when the batch does not fit in the KV cache will not work correctly until this is fixed
|
||||
//
|
||||
GGML_UNUSED(ubatches);
|
||||
//for (const auto & ubatch : ubatches) {
|
||||
// if (!find_slot(ubatch)) {
|
||||
// success = false;
|
||||
// break;
|
||||
// }
|
||||
//}
|
||||
for (const auto & ubatch : ubatches) {
|
||||
if (!find_slot(ubatch)) {
|
||||
success = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// restore the original state
|
||||
cells = std::move(org_cells);
|
||||
@@ -431,14 +420,13 @@ bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatche
|
||||
}
|
||||
|
||||
bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||
const uint32_t n_tokens = ubatch.n_tokens;
|
||||
const uint32_t n_seqs = ubatch.n_seqs;
|
||||
const uint32_t n_seqs = ubatch.n_seqs;
|
||||
|
||||
const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
|
||||
// if we have enough unused cells before the current head ->
|
||||
// better to start searching from the beginning of the cache, hoping to fill it
|
||||
if (head > used + 2*n_tokens) {
|
||||
if (head > used + 2*n_seqs) {
|
||||
head = 0;
|
||||
}
|
||||
|
||||
@@ -534,16 +522,16 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||
empty_cell.src = orig_cell.src;
|
||||
orig_cell.seq_id.erase(seq_id);
|
||||
empty_cell.seq_id.insert(seq_id); // will be overwritten
|
||||
GGML_ASSERT(!orig_cell.is_empty()); // has at least one remaining seq_id
|
||||
}
|
||||
seq_meta.tail = next_empty_cell;
|
||||
// find next empty cell
|
||||
if (s + 1 < n_seqs) {
|
||||
next_empty_cell += 1;
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
next_empty_cell += 1;
|
||||
if (next_empty_cell >= size) { next_empty_cell -= size; }
|
||||
kv_cell & cell = cells[next_empty_cell];
|
||||
if (cell.is_empty()) { break; }
|
||||
next_empty_cell += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -553,8 +541,8 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||
|
||||
// gather and re-order
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
int32_t dst_id = s + min;
|
||||
int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
|
||||
const int32_t dst_id = s + min;
|
||||
const int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
|
||||
if (dst_id != src_id) {
|
||||
kv_cell & dst_cell = cells[dst_id];
|
||||
kv_cell & src_cell = cells[src_id];
|
||||
@@ -563,12 +551,14 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||
std::swap(dst_cell.src, src_cell.src);
|
||||
std::swap(dst_cell.seq_id, src_cell.seq_id);
|
||||
|
||||
// swap tails (assuming they NEVER overlap)
|
||||
for (const llama_seq_id seq_id : src_cell.seq_id) {
|
||||
cells[seq_id].tail = src_id;
|
||||
}
|
||||
for (const llama_seq_id seq_id : dst_cell.seq_id) {
|
||||
cells[seq_id].tail = dst_id;
|
||||
// swap tails
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
int32_t & tail = cells[i].tail;
|
||||
if (tail == src_id) {
|
||||
tail = dst_id;
|
||||
} else if (tail == dst_id) {
|
||||
tail = src_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -576,7 +566,7 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||
// update the pos of the used seqs
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
|
||||
int32_t cell_id = s + min;
|
||||
const int32_t cell_id = s + min;
|
||||
kv_cell & cell = cells[cell_id];
|
||||
|
||||
if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
|
||||
@@ -594,6 +584,38 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||
}
|
||||
}
|
||||
|
||||
// Find first cell without src refs, to use as the zero-ed state
|
||||
{
|
||||
// TODO: bake-in src refcounts in the cell metadata
|
||||
std::vector<int32_t> refcounts(size, 0);
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
const int32_t src = cells[i].src;
|
||||
if (src >= 0) {
|
||||
refcounts[src] += 1;
|
||||
}
|
||||
}
|
||||
|
||||
rs_z = -1;
|
||||
for (int i = min; i <= max; ++i) {
|
||||
if (refcounts[i] == 0) {
|
||||
rs_z = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = min; i <= max; ++i) {
|
||||
if (cells[i].src < 0) {
|
||||
GGML_ASSERT(rs_z >= 0);
|
||||
cells[i].src0 = rs_z;
|
||||
} else {
|
||||
// Stage the source ids for all used cells to allow correct seq_* behavior
|
||||
// and still make these values available when setting the inputs
|
||||
cells[i].src0 = cells[i].src;
|
||||
}
|
||||
cells[i].src = i; // avoid moving or clearing twice
|
||||
}
|
||||
}
|
||||
|
||||
// allow getting the range of used cells, from head to head + n
|
||||
head = min;
|
||||
n = max - min + 1;
|
||||
@@ -605,47 +627,8 @@ bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||
}
|
||||
|
||||
bool llama_kv_cache_recurrent::get_can_shift() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
int32_t llama_kv_cache_recurrent::s_copy(int i) const {
|
||||
const uint32_t cell_id = i + head;
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// TODO: this should not mutate the KV cache !
|
||||
kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);
|
||||
|
||||
// prevent out-of-bound sources
|
||||
if (cell.src < 0 || (uint32_t) cell.src >= size) {
|
||||
cell.src = cell_id;
|
||||
}
|
||||
|
||||
int32_t res = cell.src;
|
||||
|
||||
// TODO: do not mutate the KV cache
|
||||
// ensure copy only happens once
|
||||
if (cell.src != (int32_t) cell_id) {
|
||||
cell.src = cell_id;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
float llama_kv_cache_recurrent::s_mask(int i) const {
|
||||
const uint32_t cell_id = i + head;
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// TODO: this should not mutate the KV cache !
|
||||
kv_cell & cell = const_cast<kv_cell &>(cells[cell_id]);
|
||||
|
||||
float res = (float) (cell.src >= 0);
|
||||
|
||||
// only clear once
|
||||
if (cell.src < 0) {
|
||||
cell.src = cell_id;
|
||||
}
|
||||
|
||||
return res;
|
||||
// shifting the pos is trivial for recurrent models
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t llama_kv_cache_recurrent::total_size() const {
|
||||
@@ -1111,6 +1094,10 @@ uint32_t llama_kv_cache_recurrent_state::get_head() const {
|
||||
return is_full ? 0 : kv->head;
|
||||
}
|
||||
|
||||
int32_t llama_kv_cache_recurrent_state::get_rs_z() const {
|
||||
return is_full ? 0 : kv->rs_z;
|
||||
}
|
||||
|
||||
uint32_t llama_kv_cache_recurrent_state::get_size() const {
|
||||
return kv->size;
|
||||
}
|
||||
@@ -1124,9 +1111,5 @@ ggml_tensor * llama_kv_cache_recurrent_state::get_v_l(int32_t il) const {
|
||||
}
|
||||
|
||||
int32_t llama_kv_cache_recurrent_state::s_copy(int i) const {
|
||||
return kv->s_copy(i);
|
||||
}
|
||||
|
||||
float llama_kv_cache_recurrent_state::s_mask(int i) const {
|
||||
return kv->s_mask(i);
|
||||
return kv->cells[i + kv->head].src0;
|
||||
}
|
||||
|
||||
@@ -32,8 +32,7 @@ public:
|
||||
llama_memory_state_ptr init_batch(
|
||||
const llama_batch & batch,
|
||||
uint32_t n_ubatch,
|
||||
bool embd_pooled,
|
||||
bool logits_all) override;
|
||||
bool embd_all) override;
|
||||
|
||||
llama_memory_state_ptr init_full() override;
|
||||
|
||||
@@ -57,10 +56,6 @@ public:
|
||||
|
||||
bool get_can_shift() const override;
|
||||
|
||||
// TODO: temporary methods - they are not really const as they do const_cast<>, fix this
|
||||
int32_t s_copy(int i) const;
|
||||
float s_mask(int i) const;
|
||||
|
||||
// state write/load
|
||||
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
|
||||
@@ -73,10 +68,14 @@ public:
|
||||
// computed before each graph build
|
||||
uint32_t n = 0;
|
||||
|
||||
// first zero-ed state
|
||||
int32_t rs_z = -1;
|
||||
|
||||
// TODO: optimize for recurrent state needs
|
||||
struct kv_cell {
|
||||
llama_pos pos = -1;
|
||||
int32_t src = -1; // used to copy states
|
||||
int32_t src = -1; // used to know where states should be copied from
|
||||
int32_t src0 = -1; // like src, but only used when setting the inputs (allowing to copy once)
|
||||
int32_t tail = -1;
|
||||
|
||||
std::set<llama_seq_id> seq_id;
|
||||
@@ -157,13 +156,13 @@ public:
|
||||
|
||||
uint32_t get_n_kv() const;
|
||||
uint32_t get_head() const;
|
||||
int32_t get_rs_z() const;
|
||||
uint32_t get_size() const;
|
||||
|
||||
ggml_tensor * get_k_l(int32_t il) const;
|
||||
ggml_tensor * get_v_l(int32_t il) const;
|
||||
|
||||
int32_t s_copy(int i) const;
|
||||
float s_mask(int i) const;
|
||||
|
||||
private:
|
||||
const llama_memory_status status;
|
||||
|
||||
@@ -95,36 +95,69 @@ llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
|
||||
return kv_swa->seq_pos_max(seq_id);
|
||||
}
|
||||
|
||||
llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) {
|
||||
GGML_UNUSED(embd_pooled);
|
||||
llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_all) {
|
||||
GGML_UNUSED(embd_all);
|
||||
|
||||
// TODO: if we fail with split_simple, we should attempt different splitting strategies
|
||||
// first try simple split
|
||||
do {
|
||||
auto sbatch = llama_sbatch(batch, hparams.n_embd, true);
|
||||
|
||||
std::vector<llama_ubatch> ubatches;
|
||||
|
||||
while (sbatch.n_tokens > 0) {
|
||||
auto ubatch = sbatch.split_simple(n_ubatch);
|
||||
|
||||
ubatches.push_back(ubatch);
|
||||
}
|
||||
|
||||
auto heads_base = kv_base->prepare(ubatches);
|
||||
if (heads_base.empty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
auto heads_swa = kv_swa->prepare(ubatches);
|
||||
if (heads_swa.empty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
assert(heads_base.size() == heads_swa.size());
|
||||
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(
|
||||
this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches));
|
||||
} while (false);
|
||||
|
||||
// if it fails, try equal split
|
||||
do {
|
||||
auto sbatch = llama_sbatch(batch, hparams.n_embd, false);
|
||||
|
||||
std::vector<llama_ubatch> ubatches;
|
||||
|
||||
while (sbatch.n_tokens > 0) {
|
||||
auto ubatch = sbatch.split_equal(n_ubatch);
|
||||
|
||||
ubatches.push_back(ubatch);
|
||||
}
|
||||
|
||||
auto heads_base = kv_base->prepare(ubatches);
|
||||
if (heads_base.empty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
auto heads_swa = kv_swa->prepare(ubatches);
|
||||
if (heads_swa.empty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
assert(heads_base.size() == heads_swa.size());
|
||||
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(
|
||||
this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches));
|
||||
} while (false);
|
||||
|
||||
// TODO: if we fail again, we should attempt different splitting strategies
|
||||
// but to do that properly, we first have to refactor the batches to be more flexible
|
||||
|
||||
auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all);
|
||||
|
||||
std::vector<llama_ubatch> ubatches;
|
||||
|
||||
while (sbatch.n_tokens > 0) {
|
||||
auto ubatch = sbatch.split_simple(n_ubatch);
|
||||
|
||||
ubatches.push_back(ubatch);
|
||||
}
|
||||
|
||||
auto heads_base = kv_base->prepare(ubatches);
|
||||
if (heads_base.empty()) {
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
||||
}
|
||||
|
||||
auto heads_swa = kv_swa->prepare(ubatches);
|
||||
if (heads_swa.empty()) {
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
||||
}
|
||||
|
||||
assert(heads_base.size() == heads_swa.size());
|
||||
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(
|
||||
this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches));
|
||||
return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
||||
}
|
||||
|
||||
llama_memory_state_ptr llama_kv_cache_unified_iswa::init_full() {
|
||||
|
||||
@@ -34,8 +34,7 @@ public:
|
||||
llama_memory_state_ptr init_batch(
|
||||
const llama_batch & batch,
|
||||
uint32_t n_ubatch,
|
||||
bool embd_pooled,
|
||||
bool logits_all) override;
|
||||
bool embd_all) override;
|
||||
|
||||
llama_memory_state_ptr init_full() override;
|
||||
|
||||
|
||||
@@ -127,6 +127,9 @@ llama_kv_cache_unified::llama_kv_cache_unified(
|
||||
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
|
||||
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
|
||||
}
|
||||
|
||||
const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
|
||||
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::clear(bool data) {
|
||||
@@ -307,24 +310,27 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
|
||||
llama_memory_state_ptr llama_kv_cache_unified::init_batch(
|
||||
const llama_batch & batch,
|
||||
uint32_t n_ubatch,
|
||||
bool embd_pooled,
|
||||
bool logits_all) {
|
||||
GGML_UNUSED(embd_pooled);
|
||||
bool embd_all) {
|
||||
GGML_UNUSED(embd_all);
|
||||
|
||||
auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all);
|
||||
do {
|
||||
auto sbatch = llama_sbatch(batch, hparams.n_embd, true);
|
||||
|
||||
std::vector<llama_ubatch> ubatches;
|
||||
while (sbatch.n_tokens > 0) {
|
||||
ubatches.push_back(sbatch.split_simple(n_ubatch));
|
||||
}
|
||||
std::vector<llama_ubatch> ubatches;
|
||||
while (sbatch.n_tokens > 0) {
|
||||
ubatches.push_back(sbatch.split_simple(n_ubatch));
|
||||
}
|
||||
|
||||
auto heads = prepare(ubatches);
|
||||
if (heads.empty()) {
|
||||
return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
||||
}
|
||||
auto heads = prepare(ubatches);
|
||||
if (heads.empty()) {
|
||||
break;
|
||||
}
|
||||
|
||||
return std::make_unique<llama_kv_cache_unified_state>(
|
||||
this, std::move(sbatch), std::move(heads), std::move(ubatches));
|
||||
return std::make_unique<llama_kv_cache_unified_state>(
|
||||
this, std::move(sbatch), std::move(heads), std::move(ubatches));
|
||||
} while (false);
|
||||
|
||||
return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
|
||||
}
|
||||
|
||||
llama_memory_state_ptr llama_kv_cache_unified::init_full() {
|
||||
@@ -462,7 +468,7 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
|
||||
for (uint32_t i = 0; i < n_kv; ++i) {
|
||||
assert(dinfo.ids[i] <= n_kv);
|
||||
|
||||
if (dinfo.ids[i] == n_kv) {
|
||||
if (dinfo.ids[i] == n_kv || dinfo.ids[i] == i) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -512,43 +518,68 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
|
||||
head_cur = 0;
|
||||
}
|
||||
|
||||
// otherwise, one cell per token.
|
||||
|
||||
if (n_tokens > cells.size()) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size());
|
||||
return -1;
|
||||
}
|
||||
|
||||
//#define FIND_SLOT_DEBUG 1
|
||||
#if FIND_SLOT_DEBUG
|
||||
LLAMA_LOG_WARN("begin: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", cells.used_max_p1(), cells.get_used(), head, n_swa);
|
||||
if (debug > 0) {
|
||||
LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", __func__, cells.used_max_p1(), cells.get_used(), head, get_size(), n_swa);
|
||||
|
||||
// for debugging
|
||||
{
|
||||
std::string ss;
|
||||
if (n_swa > 0) {
|
||||
if ((debug == 2 && n_swa > 0) || debug > 2) {
|
||||
std::string ss;
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (cells.is_empty(i)) {
|
||||
ss += '.';
|
||||
} else {
|
||||
ss += std::to_string(cells.seq_get(i));
|
||||
assert(cells.seq_count(i) >= 1);
|
||||
|
||||
if (cells.seq_count(i) == 1) {
|
||||
ss += std::to_string(cells.seq_get(i));
|
||||
} else {
|
||||
ss += 'M';
|
||||
}
|
||||
}
|
||||
if (i%256 == 255) {
|
||||
ss += " *";
|
||||
ss += '\n';
|
||||
}
|
||||
}
|
||||
}
|
||||
LLAMA_LOG_WARN("\n%s\n", ss.c_str());
|
||||
}
|
||||
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
if (cells.seq_pos_min(s) < 0) {
|
||||
continue;
|
||||
LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
|
||||
}
|
||||
|
||||
LLAMA_LOG_WARN("kv_cells: n_swa = %4d, min[%d] = %5d, max[%d] = %5d\n", n_swa, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
|
||||
if ((debug == 2 && n_swa > 0) || debug > 2) {
|
||||
std::string ss;
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
std::string cur;
|
||||
if (cells.is_empty(i)) {
|
||||
cur = '.';
|
||||
} else {
|
||||
cur = std::to_string(cells.pos_get(i));
|
||||
}
|
||||
const int n = cur.size();
|
||||
for (int j = 0; j < 5 - n; ++j) {
|
||||
cur += ' ';
|
||||
}
|
||||
ss += cur;
|
||||
if (i%256 == 255) {
|
||||
ss += " *";
|
||||
}
|
||||
if (i%64 == 63) {
|
||||
ss += '\n';
|
||||
}
|
||||
}
|
||||
LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
|
||||
}
|
||||
|
||||
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
if (cells.seq_pos_min(s) < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
uint32_t n_tested = 0;
|
||||
|
||||
@@ -559,21 +590,15 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
|
||||
continue;
|
||||
}
|
||||
|
||||
// keep track of what the minimum sequence positions would be if we accept the ubatch
|
||||
llama_seq_id seq_pos_min[LLAMA_MAX_PARALLEL_SEQUENCES];
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
seq_pos_min[s] = cells.seq_pos_min(s);
|
||||
}
|
||||
|
||||
bool found = true;
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
const llama_pos pos = ubatch.pos[i];
|
||||
const llama_seq_id seq_id = ubatch.seq_id[i][0];
|
||||
//const llama_pos pos = ubatch.pos[i];
|
||||
//const llama_seq_id seq_id = ubatch.seq_id[i][0];
|
||||
|
||||
// can we use this cell? either:
|
||||
// - the cell is empty
|
||||
// - the cell is occupied only by one sequence:
|
||||
// - mask causally, if the sequence is the same as the one we are inserting
|
||||
// - (disabled) mask causally, if the sequence is the same as the one we are inserting
|
||||
// - mask SWA, using current max pos for that sequence in the cache
|
||||
// always insert in the cell with minimum pos
|
||||
bool can_use = cells.is_empty(head_cur + i);
|
||||
@@ -581,21 +606,17 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
|
||||
if (!can_use && cells.seq_count(head_cur + i) == 1) {
|
||||
const llama_pos pos_cell = cells.pos_get(head_cur + i);
|
||||
|
||||
// causal mask
|
||||
if (cells.seq_has(head_cur + i, seq_id)) {
|
||||
can_use = pos_cell >= pos;
|
||||
}
|
||||
// (disabled) causal mask
|
||||
// note: it's better to purge any "future" tokens beforehand
|
||||
//if (cells.seq_has(head_cur + i, seq_id)) {
|
||||
// can_use = pos_cell >= pos;
|
||||
//}
|
||||
|
||||
if (!can_use) {
|
||||
const llama_seq_id seq_id_cell = cells.seq_get(head_cur + i);
|
||||
|
||||
// SWA mask
|
||||
// note: we insert only in the cell with minimum pos in order to preserve the invariant that
|
||||
// all positions between [pos_min, pos_max] for each sequence will be present in the cache
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
|
||||
if (pos_cell == seq_pos_min[seq_id_cell] &&
|
||||
is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
|
||||
seq_pos_min[seq_id_cell]++;
|
||||
if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
|
||||
can_use = true;
|
||||
}
|
||||
}
|
||||
@@ -623,18 +644,58 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch) {
|
||||
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
|
||||
if (!cells.is_empty(head_cur + i)) {
|
||||
cells.rm(head_cur + i);
|
||||
}
|
||||
if (debug > 0) {
|
||||
LLAMA_LOG_DEBUG("%s: ubatch info:\n", __func__);
|
||||
LLAMA_LOG_DEBUG("%s: n_tokens = %d, equal_seqs = %d\n", __func__, ubatch.n_tokens, ubatch.equal_seqs);
|
||||
LLAMA_LOG_DEBUG("%s: n_seq_tokens = %d, n_seqs = %d\n", __func__, ubatch.n_seq_tokens, ubatch.n_seqs);
|
||||
}
|
||||
|
||||
cells.pos_set(head_cur + i, ubatch.pos[i]);
|
||||
// keep track of the max sequence position that we would overwrite with this ubatch
|
||||
// for non-SWA cache, this would be always empty
|
||||
llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ];
|
||||
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
seq_pos_max_rm[s] = -1;
|
||||
}
|
||||
|
||||
for (int32_t j = 0; j < ubatch.n_seq_id[i]; j++) {
|
||||
cells.seq_add(head_cur + i, ubatch.seq_id[i][j]);
|
||||
for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
|
||||
for (uint32_t j = 0; j < ubatch.n_seq_tokens; ++j) {
|
||||
const uint32_t idx = s*ubatch.n_seq_tokens + j;
|
||||
|
||||
if (!cells.is_empty(head_cur + idx)) {
|
||||
assert(cells.seq_count(head_cur + idx) == 1);
|
||||
|
||||
const llama_seq_id seq_id = cells.seq_get(head_cur + idx);
|
||||
const llama_pos pos = cells.pos_get(head_cur + idx);
|
||||
|
||||
seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos);
|
||||
|
||||
cells.rm(head_cur + idx);
|
||||
}
|
||||
|
||||
cells.pos_set(head_cur + idx, ubatch.pos[idx]);
|
||||
|
||||
// TODO: fix indexing [UBATCH_IDX]
|
||||
for (int32_t i = 0; i < ubatch.n_seq_id[s]; i++) {
|
||||
cells.seq_add(head_cur + idx, ubatch.seq_id[s][i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// note: we want to preserve the invariant that all positions between [pos_min, pos_max] for each sequence
|
||||
// will be present in the cache. so we have to purge any position which is less than those we would overwrite
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
|
||||
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
if (seq_pos_max_rm[s] == -1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (cells.seq_pos_min(s) <= seq_pos_max_rm[s]) {
|
||||
LLAMA_LOG_DEBUG("%s: purging positions [%d, %d] of sequence %d from KV cache\n",
|
||||
__func__, cells.seq_pos_min(s), seq_pos_max_rm[s], s);
|
||||
|
||||
seq_rm(s, cells.seq_pos_min(s), seq_pos_max_rm[s] + 1);
|
||||
}
|
||||
}
|
||||
// move the head at the end of the slot
|
||||
head = head_cur + ubatch.n_tokens;
|
||||
}
|
||||
@@ -731,14 +792,14 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
|
||||
const int64_t n_tokens = ubatch->n_tokens;
|
||||
const int64_t n_seq_tokens = ubatch->n_seq_tokens;
|
||||
const int64_t n_seqs = ubatch->n_seqs;
|
||||
const uint32_t n_tokens = ubatch->n_tokens;
|
||||
const uint32_t n_seq_tokens = ubatch->n_seq_tokens;
|
||||
const uint32_t n_seqs = ubatch->n_seqs;
|
||||
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
|
||||
float * data = (float *) dst->data;
|
||||
|
||||
const auto n_kv = dst->ne[0];
|
||||
const int64_t n_kv = dst->ne[0];
|
||||
|
||||
// Use only the previous KV cells of the correct sequence for each token of the ubatch.
|
||||
// It's assumed that if a token in the batch has multiple sequences, they are equivalent.
|
||||
@@ -752,12 +813,14 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
|
||||
// xxxxx-----
|
||||
// xxxxx-----
|
||||
// To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
|
||||
for (int h = 0; h < 1; ++h) {
|
||||
for (int s = 0; s < n_seqs; ++s) {
|
||||
for (uint32_t h = 0; h < 1; ++h) {
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = ubatch->seq_id[s][0];
|
||||
|
||||
for (int j = 0; j < n_seq_tokens; ++j) {
|
||||
const llama_pos p1 = ubatch->pos[s*n_seq_tokens + j];
|
||||
for (uint32_t j = 0; j < n_seq_tokens; ++j) {
|
||||
const uint32_t idx = s*n_seq_tokens + j;
|
||||
|
||||
const llama_pos p1 = ubatch->pos[idx];
|
||||
|
||||
for (uint32_t i = 0; i < n_kv; ++i) {
|
||||
float f = 0.0f;
|
||||
@@ -787,16 +850,16 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
|
||||
f = -INFINITY;
|
||||
}
|
||||
|
||||
data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
|
||||
data[h*(n_kv*n_tokens) + idx*n_kv + i] = f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// mask padded tokens
|
||||
if (data) {
|
||||
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
|
||||
for (uint32_t j = 0; j < n_kv; ++j) {
|
||||
data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
|
||||
for (uint32_t j = n_tokens; j < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++j) {
|
||||
for (uint32_t i = 0; i < n_kv; ++i) {
|
||||
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -944,11 +1007,9 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
|
||||
const auto & n_embd_head_k = hparams.n_embd_head_k;
|
||||
//const auto & n_embd_head_v = hparams.n_embd_head_v;
|
||||
|
||||
//GGML_ASSERT(kv_self->size == n_ctx);
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_k_shift>(this);
|
||||
|
||||
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cparams.n_ctx);
|
||||
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cells.size());
|
||||
ggml_set_input(inp->k_shift);
|
||||
|
||||
for (const auto & layer : layers) {
|
||||
@@ -1449,9 +1510,11 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
|
||||
seq_rm(dest_seq_id, -1, -1);
|
||||
|
||||
llama_sbatch sbatch;
|
||||
llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
|
||||
llama_ubatch ubatch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
|
||||
|
||||
batch.n_tokens = cell_count;
|
||||
ubatch.n_tokens = cell_count;
|
||||
ubatch.n_seq_tokens = cell_count;
|
||||
ubatch.n_seqs = 1;
|
||||
|
||||
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||
llama_pos pos;
|
||||
@@ -1471,18 +1534,18 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
|
||||
io.read_to(&seq_id, sizeof(seq_id));
|
||||
}
|
||||
|
||||
batch.pos[i] = pos;
|
||||
batch.n_seq_id[i] = n_seq_id;
|
||||
batch.seq_id[i] = &dest_seq_id;
|
||||
ubatch.pos[i] = pos;
|
||||
ubatch.n_seq_id[i] = n_seq_id;
|
||||
ubatch.seq_id[i] = &dest_seq_id;
|
||||
}
|
||||
|
||||
const auto head_cur = find_slot(batch);
|
||||
const auto head_cur = find_slot(ubatch);
|
||||
if (head_cur < 0) {
|
||||
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
apply_ubatch(head_cur, batch);
|
||||
apply_ubatch(head_cur, ubatch);
|
||||
|
||||
// keep the head at the old position because we will read the KV data into it in state_read_data()
|
||||
head = head_cur;
|
||||
@@ -1490,8 +1553,8 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
|
||||
// DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values)
|
||||
// Assume that this is one contiguous block of cells
|
||||
GGML_ASSERT(head_cur + cell_count <= cells.size());
|
||||
GGML_ASSERT(cells.pos_get(head_cur) == batch.pos[0]);
|
||||
GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == batch.pos[cell_count - 1]);
|
||||
GGML_ASSERT(cells.pos_get(head_cur) == ubatch.pos[0]);
|
||||
GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == ubatch.pos[cell_count - 1]);
|
||||
GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id));
|
||||
GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id));
|
||||
} else {
|
||||
@@ -1676,7 +1739,7 @@ llama_kv_cache_unified_state::llama_kv_cache_unified_state(
|
||||
llama_context * lctx,
|
||||
bool do_shift,
|
||||
defrag_info dinfo) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)) {
|
||||
if (!do_shift && dinfo.empty()) {
|
||||
if (!do_shift && this->dinfo.empty()) {
|
||||
status = LLAMA_MEMORY_STATUS_NO_UPDATE;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,8 +59,7 @@ public:
|
||||
llama_memory_state_ptr init_batch(
|
||||
const llama_batch & batch,
|
||||
uint32_t n_ubatch,
|
||||
bool embd_pooled,
|
||||
bool logits_all) override;
|
||||
bool embd_all) override;
|
||||
|
||||
llama_memory_state_ptr init_full() override;
|
||||
|
||||
@@ -158,6 +157,8 @@ private:
|
||||
// SWA
|
||||
const uint32_t n_swa = 0;
|
||||
|
||||
int debug = 0;
|
||||
|
||||
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
|
||||
@@ -23,7 +23,7 @@ public:
|
||||
|
||||
used.clear();
|
||||
|
||||
for (uint32_t s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
seq_pos[s].clear();
|
||||
}
|
||||
}
|
||||
@@ -80,6 +80,9 @@ public:
|
||||
assert(isrc < pos.size());
|
||||
assert(idst < pos.size());
|
||||
|
||||
assert(pos[idst] == -1);
|
||||
assert(pos[isrc] != -1);
|
||||
|
||||
pos [idst] = pos [isrc];
|
||||
shift[idst] = shift[isrc];
|
||||
seq [idst] = seq [isrc];
|
||||
@@ -144,9 +147,10 @@ public:
|
||||
assert(pos[i] != -1);
|
||||
|
||||
seq_pos_rm(i);
|
||||
seq[i].reset();
|
||||
|
||||
pos[i] = -1;
|
||||
seq[i].reset();
|
||||
shift[i] = 0;
|
||||
|
||||
used.erase(i);
|
||||
}
|
||||
@@ -164,6 +168,7 @@ public:
|
||||
|
||||
if (seq[i].none()) {
|
||||
pos[i] = -1;
|
||||
shift[i] = 0;
|
||||
|
||||
used.erase(i);
|
||||
|
||||
@@ -192,6 +197,7 @@ public:
|
||||
seq[i].reset();
|
||||
|
||||
pos[i] = -1;
|
||||
shift[i] = 0;
|
||||
|
||||
used.erase(i);
|
||||
|
||||
@@ -234,7 +240,7 @@ public:
|
||||
llama_seq_id seq_get(uint32_t i) const {
|
||||
assert(seq[i].count() == 1);
|
||||
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
if (seq[i].test(s)) {
|
||||
return s;
|
||||
}
|
||||
@@ -247,7 +253,7 @@ public:
|
||||
// return -1 if the sequence is not present
|
||||
llama_pos seq_pos_min(llama_seq_id seq_id) const {
|
||||
assert(seq_id >= 0);
|
||||
assert(seq_id < LLAMA_MAX_PARALLEL_SEQUENCES);
|
||||
assert(seq_id < LLAMA_MAX_SEQ);
|
||||
|
||||
if (seq_pos[seq_id].empty()) {
|
||||
return -1;
|
||||
@@ -260,7 +266,7 @@ public:
|
||||
// return -1 if the sequence is not present
|
||||
llama_pos seq_pos_max(llama_seq_id seq_id) const {
|
||||
assert(seq_id >= 0);
|
||||
assert(seq_id < LLAMA_MAX_PARALLEL_SEQUENCES);
|
||||
assert(seq_id < LLAMA_MAX_SEQ);
|
||||
|
||||
if (seq_pos[seq_id].empty()) {
|
||||
return -1;
|
||||
@@ -317,21 +323,20 @@ public:
|
||||
pos[i] += d;
|
||||
shift[i] += d;
|
||||
|
||||
seq_pos_add(i);
|
||||
|
||||
has_shift = true;
|
||||
|
||||
if (pos[i] < 0) {
|
||||
seq_pos_rm(i);
|
||||
|
||||
seq[i].reset();
|
||||
pos[i] = -1;
|
||||
shift[i] = 0;
|
||||
|
||||
used.erase(i);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
seq_pos_add(i);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -379,20 +384,20 @@ private:
|
||||
//
|
||||
std::vector<llama_pos> shift;
|
||||
|
||||
using bits_t = std::bitset<LLAMA_MAX_PARALLEL_SEQUENCES>;
|
||||
using bits_t = std::bitset<LLAMA_MAX_SEQ>;
|
||||
|
||||
// the bitset seq[i] tells us which sequences are currently occupying the i-th cell
|
||||
std::vector<bits_t> seq;
|
||||
|
||||
// the set seq_pos[s] tells us which positions are currently present for sequence s
|
||||
// this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
|
||||
std::set<llama_pos> seq_pos[LLAMA_MAX_PARALLEL_SEQUENCES];
|
||||
std::set<llama_pos> seq_pos[LLAMA_MAX_SEQ];
|
||||
|
||||
// helper functions for updating `seq_pos`, once cell at a time:
|
||||
|
||||
// remove cell i
|
||||
void seq_pos_rm(uint32_t i) {
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
if (seq[i].test(s)) {
|
||||
seq_pos[s].erase(pos[i]);
|
||||
}
|
||||
@@ -401,7 +406,7 @@ private:
|
||||
|
||||
// add cell i
|
||||
void seq_pos_add(uint32_t i) {
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
|
||||
if (seq[i].test(s)) {
|
||||
seq_pos[s].insert(pos[i]);
|
||||
}
|
||||
|
||||
@@ -73,8 +73,7 @@ struct llama_memory_i {
|
||||
virtual llama_memory_state_ptr init_batch(
|
||||
const llama_batch & batch,
|
||||
uint32_t n_ubatch,
|
||||
bool embd_pooled,
|
||||
bool logits_all) = 0;
|
||||
bool embd_all) = 0;
|
||||
|
||||
// simulate full cache, used for allocating worst-case compute buffers
|
||||
virtual llama_memory_state_ptr init_full() = 0;
|
||||
|
||||
@@ -80,6 +80,7 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_40B: return "40B";
|
||||
case LLM_TYPE_65B: return "65B";
|
||||
case LLM_TYPE_70B: return "70B";
|
||||
case LLM_TYPE_142B: return "142B";
|
||||
case LLM_TYPE_236B: return "236B";
|
||||
case LLM_TYPE_290B: return "290B";
|
||||
case LLM_TYPE_314B: return "314B";
|
||||
@@ -598,6 +599,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
hparams.use_kq_norm = false;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_ARCEE:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
// Arcee uses the same structure as Llama
|
||||
switch (hparams.n_layer) {
|
||||
case 36: type = LLM_TYPE_4B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DECI:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
@@ -738,6 +749,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_NEO_BERT:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
||||
|
||||
if (hparams.n_layer == 28) {
|
||||
type = LLM_TYPE_250M;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_BLOOM:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
@@ -1444,6 +1465,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DOTS1:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
||||
switch (hparams.n_layer) {
|
||||
case 62: type = LLM_TYPE_142B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: throw std::runtime_error("unsupported model architecture");
|
||||
}
|
||||
|
||||
@@ -2187,6 +2222,32 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_NEO_BERT:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
|
||||
cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
||||
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff*2}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_JINA_BERT_V2:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
|
||||
@@ -2224,8 +2285,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, layer.ffn_gate ? n_ff : n_ff * 2}, 0);
|
||||
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
||||
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
||||
@@ -4123,6 +4184,89 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DOTS1:
|
||||
{
|
||||
const int64_t n_ff_exp = hparams.n_ff_exp;
|
||||
const int64_t n_expert_shared = hparams.n_expert_shared;
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
if (i < (int) hparams.n_layer_dense_lead) {
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
} else {
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
||||
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
if (n_expert == 0) {
|
||||
throw std::runtime_error("n_expert must be > 0");
|
||||
}
|
||||
if (n_expert_used == 0) {
|
||||
throw std::runtime_error("n_expert_used must be > 0");
|
||||
}
|
||||
|
||||
// MoE branch
|
||||
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
||||
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
||||
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
||||
|
||||
// Shared expert branch
|
||||
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
|
||||
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_ARCEE:
|
||||
{
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
// output
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
// if output is NULL, init from the input tok embed
|
||||
if (output == NULL) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
||||
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
@@ -6043,7 +6187,7 @@ struct llm_build_bert : public llm_graph_context {
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_PAR, il);
|
||||
model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
} else {
|
||||
cur = build_ffn(cur,
|
||||
@@ -6074,6 +6218,117 @@ struct llm_build_bert : public llm_graph_context {
|
||||
}
|
||||
};
|
||||
|
||||
struct llm_build_neo_bert : public llm_graph_context {
|
||||
llm_build_neo_bert(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// construct input embeddings (token, type, position)
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
auto * inp_attn = build_attn_inp_no_cache();
|
||||
|
||||
// iterate layers
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * cur = inpL;
|
||||
|
||||
ggml_tensor * Qcur;
|
||||
ggml_tensor * Kcur;
|
||||
ggml_tensor * Vcur;
|
||||
|
||||
// pre-norm
|
||||
cur = build_norm(inpL,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
|
||||
// self-attention
|
||||
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
||||
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
||||
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
// RoPE
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn, gf,
|
||||
model.layers[il].wo, nullptr,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||
// skip computing output for unused tokens
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
||||
}
|
||||
|
||||
// re-add the layer input
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
ggml_tensor * ffn_inp = cur;
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// pre-norm
|
||||
cur = build_norm(ffn_inp,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
// feed-forward network
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up,
|
||||
NULL, NULL, NULL, NULL, NULL,
|
||||
model.layers[il].ffn_down,
|
||||
NULL, NULL, NULL,
|
||||
LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
|
||||
|
||||
// attentions bypass the intermediate layer
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.output_norm_enc, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
|
||||
cb(cur, "result_embd", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
};
|
||||
|
||||
struct llm_build_bloom : public llm_graph_context {
|
||||
llm_build_bloom(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
@@ -8857,7 +9112,6 @@ struct llm_build_mamba : public llm_graph_context {
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
ggml_tensor * state_copy = build_inp_s_copy();
|
||||
ggml_tensor * state_mask = build_inp_s_mask();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
// norm
|
||||
@@ -8866,8 +9120,7 @@ struct llm_build_mamba : public llm_graph_context {
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
//cur = build_mamba_layer(gf, cur, state_copy, state_mask, il);
|
||||
cur = build_mamba_layer(gf, cur, state_copy, state_mask, ubatch, il);
|
||||
cur = build_mamba_layer(gf, cur, state_copy, ubatch, il);
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
@@ -8908,7 +9161,6 @@ struct llm_build_mamba : public llm_graph_context {
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const {
|
||||
const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
|
||||
@@ -8935,12 +9187,12 @@ struct llm_build_mamba : public llm_graph_context {
|
||||
ggml_tensor * ssm_states_all = kv_state->get_v_l(il);
|
||||
|
||||
// (ab)using the KV cache to store the states
|
||||
ggml_tensor * conv = build_copy_mask_state(
|
||||
gf, conv_states_all, state_copy, state_mask,
|
||||
ggml_tensor * conv = build_recurrent_state(
|
||||
gf, conv_states_all, state_copy,
|
||||
hparams.n_embd_k_s(), n_seqs);
|
||||
conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
|
||||
ggml_tensor * ssm = build_copy_mask_state(
|
||||
gf, ssm_states_all, state_copy, state_mask,
|
||||
ggml_tensor * ssm = build_recurrent_state(
|
||||
gf, ssm_states_all, state_copy,
|
||||
hparams.n_embd_v_s(), n_seqs);
|
||||
ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
|
||||
|
||||
@@ -11656,7 +11908,6 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * x_prev,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const {
|
||||
const auto * kv_state = static_cast<const llama_kv_cache_recurrent_state *>(mstate);
|
||||
@@ -11780,8 +12031,8 @@ struct llm_build_rwkv6_base : public llm_graph_context {
|
||||
k = ggml_sub(ctx0, k, ggml_mul(ctx0, k, w));
|
||||
}
|
||||
|
||||
ggml_tensor * wkv_state = build_copy_mask_state(
|
||||
gf, kv_state->get_v_l(il), state_copy, state_mask,
|
||||
ggml_tensor * wkv_state = build_recurrent_state(
|
||||
gf, kv_state->get_v_l(il), state_copy,
|
||||
hparams.n_embd_v_s(), n_seqs);
|
||||
|
||||
ggml_tensor * wkv_output;
|
||||
@@ -11837,7 +12088,6 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
||||
inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
|
||||
|
||||
ggml_tensor * state_copy = build_inp_s_copy();
|
||||
ggml_tensor * state_mask = build_inp_s_mask();
|
||||
|
||||
const auto n_embd = hparams.n_embd;
|
||||
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
||||
@@ -11848,7 +12098,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
||||
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
||||
|
||||
ggml_tensor * token_shift = build_rwkv_token_shift_load(
|
||||
gf, state_copy, state_mask, ubatch, il
|
||||
gf, state_copy, ubatch, il
|
||||
);
|
||||
|
||||
ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
|
||||
@@ -11864,7 +12114,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base {
|
||||
1
|
||||
);
|
||||
|
||||
cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
|
||||
cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, ubatch, il);
|
||||
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
@@ -11935,7 +12185,6 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
ggml_tensor * state_copy = build_inp_s_copy();
|
||||
ggml_tensor * state_mask = build_inp_s_mask();
|
||||
|
||||
const auto n_embd = hparams.n_embd;
|
||||
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
||||
@@ -11946,7 +12195,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
||||
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
||||
|
||||
ggml_tensor * token_shift = build_rwkv_token_shift_load(
|
||||
gf, state_copy, state_mask, ubatch, il
|
||||
gf, state_copy, ubatch, il
|
||||
);
|
||||
|
||||
ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
|
||||
@@ -11959,7 +12208,7 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base {
|
||||
1
|
||||
);
|
||||
|
||||
cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
|
||||
cur = build_rwkv6_time_mix(gf, att_norm, x_prev, state_copy, ubatch, il);
|
||||
|
||||
token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
|
||||
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
|
||||
@@ -12051,7 +12300,6 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * x_prev,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
ggml_tensor *& first_layer_value,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const {
|
||||
@@ -12134,8 +12382,8 @@ struct llm_build_rwkv7_base : public llm_graph_context {
|
||||
v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
|
||||
a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
|
||||
|
||||
ggml_tensor * wkv_state = build_copy_mask_state(
|
||||
gf, kv_state->get_v_l(il), state_copy, state_mask,
|
||||
ggml_tensor * wkv_state = build_recurrent_state(
|
||||
gf, kv_state->get_v_l(il), state_copy,
|
||||
hparams.n_embd_v_s(), n_seqs);
|
||||
|
||||
ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
|
||||
@@ -12193,7 +12441,6 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
||||
inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
|
||||
|
||||
ggml_tensor * state_copy = build_inp_s_copy();
|
||||
ggml_tensor * state_mask = build_inp_s_mask();
|
||||
|
||||
const auto n_embd = hparams.n_embd;
|
||||
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
||||
@@ -12204,7 +12451,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
||||
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
||||
|
||||
ggml_tensor * token_shift = build_rwkv_token_shift_load(
|
||||
gf, state_copy, state_mask, ubatch, il
|
||||
gf, state_copy, ubatch, il
|
||||
);
|
||||
|
||||
ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
|
||||
@@ -12220,7 +12467,7 @@ struct llm_build_rwkv7 : public llm_build_rwkv7_base {
|
||||
1
|
||||
);
|
||||
|
||||
cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
|
||||
cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, v_first, ubatch, il);
|
||||
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
@@ -12287,7 +12534,6 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
ggml_tensor * state_copy = build_inp_s_copy();
|
||||
ggml_tensor * state_mask = build_inp_s_mask();
|
||||
|
||||
const auto n_embd = hparams.n_embd;
|
||||
const auto n_seq_tokens = ubatch.n_seq_tokens;
|
||||
@@ -12298,7 +12544,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
||||
inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
||||
|
||||
ggml_tensor * token_shift = build_rwkv_token_shift_load(
|
||||
gf, state_copy, state_mask, ubatch, il
|
||||
gf, state_copy, ubatch, il
|
||||
);
|
||||
|
||||
ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
|
||||
@@ -12311,7 +12557,7 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
|
||||
1
|
||||
);
|
||||
|
||||
cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
|
||||
cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, v_first, ubatch, il);
|
||||
|
||||
token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
|
||||
ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
|
||||
@@ -13203,6 +13449,291 @@ struct llm_build_bailingmoe : public llm_graph_context {
|
||||
}
|
||||
};
|
||||
|
||||
struct llm_build_dots1 : public llm_graph_context {
|
||||
llm_build_dots1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self_attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
||||
cb(Kcur, "Kcur_normed", il);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// MoE branch
|
||||
cur = build_norm(ffn_inp,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
if ((uint32_t) il < hparams.n_layer_dense_lead) {
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
} else {
|
||||
ggml_tensor * moe_out =
|
||||
build_moe_ffn(cur,
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, hparams.expert_weights_norm,
|
||||
true, hparams.expert_weights_scale,
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
||||
{
|
||||
ggml_tensor * ffn_shexp = build_ffn(cur,
|
||||
model.layers[il].ffn_up_shexp, NULL, NULL,
|
||||
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
||||
model.layers[il].ffn_down_shexp, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(ffn_shexp, "ffn_shexp", il);
|
||||
|
||||
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
|
||||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
};
|
||||
|
||||
struct llm_build_arcee : public llm_graph_context {
|
||||
llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
||||
inpL = build_inp_embd(model.tok_embd);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
auto * inp_attn = build_attn_inp_kv_unified();
|
||||
|
||||
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = build_norm(inpL,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
cur = build_attn(inp_attn, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
// ARCEE uses relu^2 instead of silu
|
||||
cur = build_norm(ffn_inp,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
NULL, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = build_norm(cur,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, -1);
|
||||
|
||||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
};
|
||||
|
||||
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
|
||||
llama_memory_i * res;
|
||||
|
||||
@@ -13211,6 +13742,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
||||
case LLM_ARCH_JINA_BERT_V2:
|
||||
case LLM_ARCH_NOMIC_BERT:
|
||||
case LLM_ARCH_NOMIC_BERT_MOE:
|
||||
case LLM_ARCH_NEO_BERT:
|
||||
case LLM_ARCH_WAVTOKENIZER_DEC:
|
||||
{
|
||||
res = nullptr;
|
||||
@@ -13319,6 +13851,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
{
|
||||
llm = std::make_unique<llm_build_bert>(*this, params, gf);
|
||||
} break;
|
||||
case LLM_ARCH_NEO_BERT:
|
||||
{
|
||||
llm = std::make_unique<llm_build_neo_bert>(*this, params, gf);
|
||||
} break;
|
||||
case LLM_ARCH_BLOOM:
|
||||
{
|
||||
llm = std::make_unique<llm_build_bloom>(*this, params, gf);
|
||||
@@ -13541,6 +14077,14 @@ llm_graph_result_ptr llama_model::build_graph(
|
||||
{
|
||||
llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
|
||||
} break;
|
||||
case LLM_ARCH_DOTS1:
|
||||
{
|
||||
llm = std::make_unique<llm_build_dots1>(*this, params, gf);
|
||||
} break;
|
||||
case LLM_ARCH_ARCEE:
|
||||
{
|
||||
llm = std::make_unique<llm_build_arcee>(*this, params, gf);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
@@ -13690,6 +14234,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
case LLM_ARCH_CHAMELEON:
|
||||
case LLM_ARCH_BAILINGMOE:
|
||||
case LLM_ARCH_NEO_BERT:
|
||||
case LLM_ARCH_ARCEE:
|
||||
return LLAMA_ROPE_TYPE_NORM;
|
||||
|
||||
// the pairs of head values are offset by n_rot/2
|
||||
@@ -13723,6 +14269,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
||||
case LLM_ARCH_NEMOTRON:
|
||||
case LLM_ARCH_EXAONE:
|
||||
case LLM_ARCH_MINICPM3:
|
||||
case LLM_ARCH_DOTS1:
|
||||
return LLAMA_ROPE_TYPE_NEOX;
|
||||
|
||||
case LLM_ARCH_QWEN2VL:
|
||||
|
||||
@@ -73,6 +73,7 @@ enum llm_type {
|
||||
LLM_TYPE_40B,
|
||||
LLM_TYPE_65B,
|
||||
LLM_TYPE_70B,
|
||||
LLM_TYPE_142B,
|
||||
LLM_TYPE_236B,
|
||||
LLM_TYPE_290B,
|
||||
LLM_TYPE_314B,
|
||||
|
||||
@@ -585,7 +585,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
|
||||
gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
|
||||
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
|
||||
gguf_set_val_i32(ctx_out.get(), o.key, o.val_i64);
|
||||
// Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
|
||||
gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)abs(o.val_i64));
|
||||
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
|
||||
gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
|
||||
} else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
|
||||
|
||||
@@ -9,16 +9,16 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cctype>
|
||||
#include <cfloat>
|
||||
#include <climits>
|
||||
#include <cstdarg>
|
||||
#include <cstring>
|
||||
#include <forward_list>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <queue>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
#include <cctype>
|
||||
|
||||
//
|
||||
// helpers
|
||||
@@ -1987,6 +1987,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|| t.first == "<|eom_id|>"
|
||||
|| t.first == "<EOT>"
|
||||
|| t.first == "_<EOT>"
|
||||
|| t.first == "<|end_of_text|>"
|
||||
) {
|
||||
special_eog_ids.insert(t.second);
|
||||
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||
@@ -2572,6 +2573,10 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
|
||||
// copy piece chars to output text buffer
|
||||
// skip up to 'lstrip' leading spaces before copying
|
||||
auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
|
||||
if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
|
||||
GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size);
|
||||
}
|
||||
|
||||
for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
|
||||
token++;
|
||||
size--;
|
||||
@@ -2768,26 +2773,26 @@ void llama_vocab::impl::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
|
||||
|
||||
// special tokens
|
||||
if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token[special_bos_id].text.c_str() ); }
|
||||
if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token[special_eos_id].text.c_str() ); }
|
||||
if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token[special_eot_id].text.c_str() ); }
|
||||
if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token[special_eom_id].text.c_str() ); }
|
||||
if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token[special_unk_id].text.c_str() ); }
|
||||
if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token[special_sep_id].text.c_str() ); }
|
||||
if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token[special_pad_id].text.c_str() ); }
|
||||
if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token[special_mask_id].text.c_str() ); }
|
||||
if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
|
||||
if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
|
||||
if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
|
||||
if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
|
||||
if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
|
||||
if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
|
||||
if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
|
||||
if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
|
||||
|
||||
if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token[linefeed_id].text.c_str() ); }
|
||||
if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
|
||||
|
||||
if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token[special_fim_pre_id].text.c_str() ); }
|
||||
if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token[special_fim_suf_id].text.c_str() ); }
|
||||
if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token[special_fim_mid_id].text.c_str() ); }
|
||||
if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token[special_fim_pad_id].text.c_str() ); }
|
||||
if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token[special_fim_rep_id].text.c_str() ); }
|
||||
if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token[special_fim_sep_id].text.c_str() ); }
|
||||
if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
|
||||
if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
|
||||
if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
|
||||
if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
|
||||
if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
|
||||
if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
|
||||
|
||||
for (const auto & id : special_eog_ids) {
|
||||
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token[id].text.c_str() );
|
||||
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
|
||||
|
||||
@@ -42,6 +42,34 @@ function(llama_test target)
|
||||
set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
|
||||
endfunction()
|
||||
|
||||
function(llama_test_cmd target)
|
||||
include(CMakeParseArguments)
|
||||
set(options)
|
||||
set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
|
||||
set(multiValueArgs ARGS)
|
||||
cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
|
||||
|
||||
if (NOT DEFINED LLAMA_TEST_LABEL)
|
||||
set(LLAMA_TEST_LABEL "main")
|
||||
endif()
|
||||
if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
|
||||
set(LLAMA_TEST_WORKING_DIRECTORY .)
|
||||
endif()
|
||||
if (DEFINED LLAMA_TEST_NAME)
|
||||
set(TEST_NAME ${LLAMA_TEST_NAME})
|
||||
else()
|
||||
set(TEST_NAME ${target})
|
||||
endif()
|
||||
|
||||
add_test(
|
||||
NAME ${TEST_NAME}
|
||||
WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
|
||||
COMMAND ${target}
|
||||
${LLAMA_TEST_ARGS})
|
||||
|
||||
set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
|
||||
endfunction()
|
||||
|
||||
# Builds and runs a test source file.
|
||||
# Optional args:
|
||||
# - NAME: name of the executable & test target (defaults to the source file name without extension)
|
||||
@@ -83,29 +111,35 @@ endfunction()
|
||||
# build test-tokenizer-0 target once and add many tests
|
||||
llama_build(test-tokenizer-0.cpp)
|
||||
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||
|
||||
# TODO: missing HF tokenizer for this model in convert_hf_to_gguf_update.py, see https://github.com/ggml-org/llama.cpp/pull/13847
|
||||
# llama_test(test-tokenizer-0 NAME test-tokenizer-0-nomic-bert-moe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-nomic-bert-moe.gguf)
|
||||
|
||||
if (LLAMA_LLGUIDANCE)
|
||||
llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
|
||||
endif ()
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-bert-bge.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-command-r.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-coder.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-llm.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-falcon.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-2.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-spm.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-mpt.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-phi-3.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-qwen2.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-refact.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-starcoder.gguf)
|
||||
|
||||
if (NOT WIN32)
|
||||
# these tests are disabled on Windows because they use internal functions not exported with LLAMA_API
|
||||
llama_test_cmd(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test-tokenizers-repo.sh
|
||||
NAME test-tokenizers-ggml-vocabs
|
||||
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
|
||||
ARGS https://huggingface.co/ggml-org/vocabs ${PROJECT_SOURCE_DIR}/models/ggml-vocabs
|
||||
)
|
||||
endif()
|
||||
|
||||
if (LLAMA_LLGUIDANCE)
|
||||
llama_build_and_test(test-grammar-llguidance.cpp ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf)
|
||||
endif ()
|
||||
|
||||
if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
|
||||
# these tests are disabled on Windows because they use internal functions not exported with LLAMA_API (when building with shared libraries)
|
||||
llama_build_and_test(test-sampling.cpp)
|
||||
llama_build_and_test(test-grammar-parser.cpp)
|
||||
llama_build_and_test(test-grammar-integration.cpp)
|
||||
@@ -113,8 +147,8 @@ if (NOT WIN32)
|
||||
llama_build_and_test(test-chat.cpp)
|
||||
# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
|
||||
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
||||
llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
|
||||
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../tools/server)
|
||||
llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
|
||||
target_include_directories(test-json-schema-to-grammar PRIVATE ${PROJECT_SOURCE_DIR}/tools/server)
|
||||
endif()
|
||||
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
@@ -127,20 +161,20 @@ if (NOT WIN32)
|
||||
llama_build(test-tokenizer-1-bpe.cpp)
|
||||
|
||||
# TODO: disabled due to slowness
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-aquila.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-falcon.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-2.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-neox.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf --ignore-merges)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-mpt.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-refact.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-starcoder.gguf)
|
||||
|
||||
# build test-tokenizer-1-spm target once and add many tests
|
||||
llama_build(test-tokenizer-1-spm.cpp)
|
||||
|
||||
llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
|
||||
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
|
||||
llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-spm.gguf)
|
||||
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-baichuan.gguf)
|
||||
|
||||
# llama_build_and_test(test-double-float.cpp) # SLOW
|
||||
endif()
|
||||
|
||||
@@ -7,6 +7,8 @@
|
||||
//
|
||||
#include "chat.h"
|
||||
|
||||
#include "log.h"
|
||||
|
||||
#include "../src/unicode.h"
|
||||
#include "../src/llama-grammar.h"
|
||||
|
||||
@@ -1428,6 +1430,8 @@ static void test_msg_diffs_compute() {
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_log_set_verbosity_thold(999);
|
||||
|
||||
// try {
|
||||
#ifndef _WIN32
|
||||
if (argc > 1) {
|
||||
|
||||
36
tests/test-tokenizers-repo.sh
Executable file
36
tests/test-tokenizers-repo.sh
Executable file
@@ -0,0 +1,36 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ $# -lt 2 ]; then
|
||||
printf "Usage: $0 <git-repo> <target-folder> [<test-exe>]\n"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ $# -eq 3 ]; then
|
||||
toktest=$3
|
||||
else
|
||||
toktest="./test-tokenizer-0"
|
||||
fi
|
||||
|
||||
if [ ! -x $toktest ]; then
|
||||
printf "Test executable \"$toktest\" not found!\n"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
repo=$1
|
||||
folder=$2
|
||||
|
||||
if [ -d $folder ] && [ -d $folder/.git ]; then
|
||||
(cd $folder; git pull)
|
||||
else
|
||||
git clone $repo $folder
|
||||
fi
|
||||
|
||||
shopt -s globstar
|
||||
for gguf in $folder/**/*.gguf; do
|
||||
if [ -f $gguf.inp ] && [ -f $gguf.out ]; then
|
||||
$toktest $gguf
|
||||
else
|
||||
printf "Found \"$gguf\" without matching inp/out files, ignoring...\n"
|
||||
fi
|
||||
done
|
||||
|
||||
Binary file not shown.
@@ -88,6 +88,26 @@ enum error_type {
|
||||
ERROR_TYPE_NOT_SUPPORTED, // custom error
|
||||
};
|
||||
|
||||
static bool server_task_type_need_embd(server_task_type task_type) {
|
||||
switch (task_type) {
|
||||
case SERVER_TASK_TYPE_EMBEDDING:
|
||||
case SERVER_TASK_TYPE_RERANK:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static bool server_task_type_need_logits(server_task_type task_type) {
|
||||
switch (task_type) {
|
||||
case SERVER_TASK_TYPE_COMPLETION:
|
||||
case SERVER_TASK_TYPE_INFILL:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
struct slot_params {
|
||||
bool stream = true;
|
||||
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
|
||||
@@ -233,6 +253,7 @@ struct server_task {
|
||||
slot_params defaults;
|
||||
defaults.sampling = params_base.sampling;
|
||||
defaults.speculative = params_base.speculative;
|
||||
defaults.n_keep = params_base.n_keep;
|
||||
|
||||
// enabling this will output extra debug information in the HTTP responses from the server
|
||||
params.verbose = params_base.verbosity > 9;
|
||||
@@ -1329,13 +1350,16 @@ struct server_slot {
|
||||
n_draft_accepted = 0;
|
||||
}
|
||||
|
||||
bool is_non_causal() const {
|
||||
return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
|
||||
bool need_embd() const {
|
||||
return server_task_type_need_embd(task_type);
|
||||
}
|
||||
|
||||
bool need_logits() const {
|
||||
return server_task_type_need_logits(task_type);
|
||||
}
|
||||
|
||||
bool can_batch_with(server_slot & other_slot) const {
|
||||
return is_non_causal() == other_slot.is_non_causal()
|
||||
&& are_lora_equal(lora, other_slot.lora);
|
||||
return task_type == other_slot.task_type && are_lora_equal(lora, other_slot.lora);
|
||||
}
|
||||
|
||||
bool has_budget(const common_params & global_params) {
|
||||
@@ -1479,7 +1503,6 @@ struct server_slot {
|
||||
{"n_ctx", n_ctx},
|
||||
{"speculative", can_speculate()},
|
||||
{"is_processing", is_processing()},
|
||||
{"non_causal", is_non_causal()},
|
||||
{"params", params.to_json()},
|
||||
{"prompt", prompt_tokens.detokenize(ctx, true)},
|
||||
{"next_token",
|
||||
@@ -1906,6 +1929,14 @@ struct server_context {
|
||||
llama_batch_free(batch);
|
||||
}
|
||||
|
||||
// if the context does not have a memory module then all embeddings have to be computed within a single ubatch
|
||||
// also we cannot split if the pooling would require any past tokens
|
||||
bool can_split() const {
|
||||
return
|
||||
!llama_get_embeddings(ctx) ||
|
||||
(llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
|
||||
}
|
||||
|
||||
bool load_model(const common_params & params) {
|
||||
SRV_INF("loading model '%s'\n", params.model.path.c_str());
|
||||
|
||||
@@ -2016,11 +2047,6 @@ struct server_context {
|
||||
params_base.n_cache_reuse = 0;
|
||||
SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled");
|
||||
}
|
||||
|
||||
if (!params_base.speculative.model.path.empty()) {
|
||||
SRV_ERR("%s\n", "err: speculative decode is not supported by this context");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -2060,6 +2086,7 @@ struct server_context {
|
||||
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
|
||||
|
||||
slot.params.sampling = params_base.sampling;
|
||||
slot.params.n_keep = params_base.n_keep;
|
||||
|
||||
slot.callback_on_release = [this](int) {
|
||||
queue_tasks.pop_deferred_task();
|
||||
@@ -2733,6 +2760,7 @@ struct server_context {
|
||||
queue_tasks.defer(std::move(task));
|
||||
break;
|
||||
}
|
||||
|
||||
if (slot->is_processing()) {
|
||||
// if requested slot is unavailable, we defer this task for processing later
|
||||
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
|
||||
@@ -3095,7 +3123,14 @@ struct server_context {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (slot.is_non_causal()) {
|
||||
// TODO: support memory-less logits computation
|
||||
if (slot.need_logits() && !llama_get_memory(ctx)) {
|
||||
slot.release();
|
||||
send_error(slot, "the current context does not logits computation. skipping", ERROR_TYPE_SERVER);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!can_split()) {
|
||||
if (slot.n_prompt_tokens > n_ubatch) {
|
||||
slot.release();
|
||||
send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
|
||||
@@ -3220,7 +3255,7 @@ struct server_context {
|
||||
}
|
||||
|
||||
const auto n_swa = llama_model_n_swa(model);
|
||||
if (pos_min > slot.n_past - n_swa) {
|
||||
if (pos_min > std::max(0, slot.n_past - n_swa)) {
|
||||
SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min, n_swa);
|
||||
SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
|
||||
"https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
|
||||
@@ -3230,8 +3265,7 @@ struct server_context {
|
||||
}
|
||||
|
||||
if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
|
||||
// we have to evaluate at least 1 token to generate logits.
|
||||
SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
|
||||
SLT_WRN(slot, "need to evaluate at least 1 token for each active slot, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
|
||||
|
||||
slot.n_past--;
|
||||
}
|
||||
@@ -3239,8 +3273,7 @@ struct server_context {
|
||||
slot.n_prompt_tokens_processed = 0;
|
||||
}
|
||||
|
||||
// non-causal tasks require to fit the entire prompt in the physical batch
|
||||
if (slot.is_non_causal()) {
|
||||
if (!can_split()) {
|
||||
// cannot fit the prompt in the current batch - will try next iter
|
||||
if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
|
||||
continue;
|
||||
@@ -3262,8 +3295,7 @@ struct server_context {
|
||||
slot.cache_tokens.keep_first(slot.n_past);
|
||||
|
||||
// check if we should process the image
|
||||
if (slot.n_past < slot.n_prompt_tokens
|
||||
&& slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
|
||||
if (slot.n_past < slot.n_prompt_tokens && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
|
||||
// process the image
|
||||
int32_t new_n_past;
|
||||
int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
|
||||
@@ -3294,8 +3326,8 @@ struct server_context {
|
||||
break; // end of text chunk
|
||||
}
|
||||
|
||||
// without pooling, we want to output the embeddings for all the tokens in the batch
|
||||
const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
|
||||
// embedding requires all tokens in the batch to be output
|
||||
const bool need_embd = server_task_type_need_embd(slot.task_type);
|
||||
|
||||
common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd);
|
||||
slot.cache_tokens.push_back(cur_tok);
|
||||
@@ -3349,17 +3381,15 @@ struct server_context {
|
||||
SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);
|
||||
|
||||
if (slot_batched) {
|
||||
// make sure we're in the right embedding mode
|
||||
llama_set_embeddings(ctx, slot_batched->is_non_causal());
|
||||
// apply lora, only need to do it once per batch
|
||||
common_set_adapter_lora(ctx, slot_batched->lora);
|
||||
}
|
||||
|
||||
const bool do_encode = (params_base.embedding || params_base.reranking);
|
||||
llama_set_embeddings(ctx, slot_batched->need_embd());
|
||||
}
|
||||
|
||||
// pad the batch so that batch.n_tokens >= n_slots
|
||||
// TODO: temporary workaround for https://github.com/ggml-org/llama.cpp/issues/13689
|
||||
if (do_encode) {
|
||||
if (slot_batched->need_embd()) {
|
||||
const int n_slots = slots.size();
|
||||
|
||||
if (batch.n_tokens < n_slots) {
|
||||
@@ -3381,8 +3411,11 @@ struct server_context {
|
||||
SRV_WRN("adding %d dummy tokens to the batch, seq_id = %d\n", n_add, seq_id);
|
||||
|
||||
for (int j = 0; j < n_add; ++j) {
|
||||
common_batch_add(batch, 0, j, { seq_id }, false);
|
||||
common_batch_add(batch, 0, j, { seq_id }, true);
|
||||
}
|
||||
|
||||
slots[seq_id].cache_tokens.clear();
|
||||
llama_memory_seq_rm(llama_get_memory(ctx), seq_id, -1, -1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3556,9 +3589,6 @@ struct server_context {
|
||||
const llama_tokens & cached_text_tokens = slot.cache_tokens.get_text_tokens();
|
||||
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, id);
|
||||
|
||||
// keep track of total number of tokens generated in the draft
|
||||
slot.n_draft_total += draft.size();
|
||||
|
||||
// ignore small drafts
|
||||
if (slot.params.speculative.n_min > (int) draft.size()) {
|
||||
SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
|
||||
@@ -3566,6 +3596,9 @@ struct server_context {
|
||||
continue;
|
||||
}
|
||||
|
||||
// keep track of total number of drafted tokens tested
|
||||
slot.n_draft_total += draft.size();
|
||||
|
||||
// construct the speculation batch
|
||||
common_batch_clear(slot.batch_spec);
|
||||
common_batch_add (slot.batch_spec, id, slot.n_past, { slot.id }, true);
|
||||
@@ -3584,7 +3617,7 @@ struct server_context {
|
||||
slot.n_past += ids.size();
|
||||
slot.n_decoded += ids.size();
|
||||
|
||||
// update how many tokens out of draft was accepted
|
||||
// update how many tokens out of those tested were accepted
|
||||
slot.n_draft_accepted += ids.size() - 1;
|
||||
|
||||
slot.cache_tokens.push_back(id);
|
||||
@@ -4177,11 +4210,6 @@ int main(int argc, char ** argv) {
|
||||
oaicompat_type oaicompat) -> void {
|
||||
GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
|
||||
|
||||
if (ctx_server.params_base.embedding) {
|
||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
auto completion_id = gen_chatcmplid();
|
||||
std::unordered_set<int> task_ids;
|
||||
try {
|
||||
@@ -4436,12 +4464,8 @@ int main(int argc, char ** argv) {
|
||||
OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
|
||||
};
|
||||
|
||||
const auto handle_chat_completions = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
|
||||
const auto handle_chat_completions = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
|
||||
LOG_DBG("request: %s\n", req.body.c_str());
|
||||
if (ctx_server.params_base.embedding) {
|
||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
auto body = json::parse(req.body);
|
||||
std::vector<raw_buffer> files;
|
||||
@@ -4569,13 +4593,18 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) {
|
||||
const json body = json::parse(req.body);
|
||||
if (!ctx_server.params_base.embedding) {
|
||||
res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
|
||||
res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST));
|
||||
return;
|
||||
}
|
||||
|
||||
const json body = json::parse(req.body);
|
||||
|
||||
// for the shape of input/content, see tokenize_input_prompts()
|
||||
json prompt;
|
||||
if (body.count("input") != 0) {
|
||||
@@ -4665,8 +4694,8 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
||||
if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) {
|
||||
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
if (!ctx_server.params_base.embedding || ctx_server.params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
|
||||
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -4881,7 +4910,9 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
bool was_bound = false;
|
||||
bool is_sock = false;
|
||||
if (string_ends_with(std::string(params.hostname), ".sock")) {
|
||||
is_sock = true;
|
||||
LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
|
||||
svr->set_address_family(AF_UNIX);
|
||||
// bind_to_port requires a second arg, any value other than 0 should
|
||||
@@ -4959,7 +4990,9 @@ int main(int argc, char ** argv) {
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
|
||||
LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
|
||||
LOG_INF("%s: server is listening on %s - starting the main loop\n", __func__,
|
||||
is_sock ? string_format("unix://%s", params.hostname.c_str()).c_str() :
|
||||
string_format("http://%s:%d", params.hostname.c_str(), params.port).c_str());
|
||||
|
||||
// this call blocks the main thread until queue_tasks.terminate() is called
|
||||
ctx_server.queue_tasks.start_loop();
|
||||
|
||||
@@ -41,6 +41,10 @@ html {
|
||||
max-width: 900px;
|
||||
}
|
||||
|
||||
.chat-bubble {
|
||||
@apply break-words;
|
||||
}
|
||||
|
||||
.chat-bubble-base-300 {
|
||||
--tw-bg-opacity: 1;
|
||||
--tw-text-opacity: 1;
|
||||
|
||||
Reference in New Issue
Block a user