mirror of https://github.com/ggml-org/llama.cpp.git (synced 2026-05-05 00:24:07 +00:00)

Compare commits

7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 0d5c742161 | | |
| | 42158ae2e8 | | |
| | 797f2ac062 | | |
| | b44890df2e | | |
| | 33983057d0 | | |
| | fb1cab201c | | |
| | b7a17463ec | | |
@@ -1,10 +1,10 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc3.1.1
+ARG MUSA_VERSION=rc4.0.1
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}

-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}

 FROM ${BASE_MUSA_DEV_CONTAINER} AS build

@@ -21,21 +21,14 @@ RUN apt-get update && \
     libcurl4-openssl-dev \
     libgomp1

-COPY requirements.txt requirements.txt
-COPY requirements requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
 WORKDIR /app

 COPY . .

 # Use the default MUSA archs if not specified
 RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
         export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
     fi && \
-    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
.github/workflows/build.yml (vendored, 2 changed lines)
@@ -351,7 +351,7 @@ jobs:
 
   ubuntu-22-cmake-musa:
     runs-on: ubuntu-22.04
-    container: mthreads/musa:rc3.1.1-devel-ubuntu22.04
+    container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
 
     steps:
       - name: Clone
@@ -37,7 +37,7 @@ range of hardware - locally and in the cloud.
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2, AVX512 and AMX support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
-- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
@@ -237,7 +237,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [BLAS](docs/build.md#blas-build) | All |
 | [BLIS](docs/backend/BLIS.md) | All |
 | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](docs/build.md#musa) | Moore Threads MTT GPU |
+| [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
 | [HIP](docs/build.md#hip) | AMD GPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
@@ -54,7 +54,7 @@ docker run --privileged -it \
     -v $HOME/llama.cpp/ci-cache:/ci-cache \
     -v $HOME/llama.cpp/ci-results:/ci-results \
     -v $PWD:/ws -w /ws \
-    mthreads/musa:rc3.1.1-devel-ubuntu22.04
+    mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
 ```
 
 Inside the container, execute the following commands:
@@ -107,7 +107,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment
 
 The defaults are:
 
-- `MUSA_VERSION` set to `rc3.1.1`
+- `MUSA_VERSION` set to `rc4.0.1`
 
 The resulting images, are essentially the same as the non-MUSA images:
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
     auto generate = [&](const std::string & prompt) {
         std::string response;
 
-        const bool is_first = llama_kv_self_used_cells(ctx) == 0;
+        const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == 0;
 
         // tokenize the prompt
         const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);

@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
         while (true) {
             // check if we have enough space in the context to evaluate this batch
             int n_ctx = llama_n_ctx(ctx);
-            int n_ctx_used = llama_kv_self_used_cells(ctx);
+            int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0);
             if (n_ctx_used + batch.n_tokens > n_ctx) {
                 printf("\033[0m\n");
                 fprintf(stderr, "context size exceeded\n");
@@ -1,5 +1,8 @@
 #include "cpy.cuh"
 #include "dequantize.cuh"
+#ifdef GGML_USE_MUSA
+#include "ggml-musa/mudnn.cuh"
+#endif // GGML_USE_MUSA
 
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);

@@ -597,7 +600,14 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 #endif
     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
-        CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+#ifdef GGML_USE_MUSA
+        if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
+            CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
+        } else
+#endif // GGML_USE_MUSA
+        {
+            CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+        }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
@@ -772,7 +772,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
     GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
     GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
     GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
-    GGML_UNUSED(kb0);
+    GGML_UNUSED(kb0); GGML_UNUSED(tile_Q);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -27,12 +27,15 @@ if (MUSAToolkit_FOUND)
 
     file(GLOB   GGML_HEADERS_MUSA "../ggml-cuda/*.cuh")
     list(APPEND GGML_HEADERS_MUSA "../../include/ggml-cuda.h")
+    list(APPEND GGML_HEADERS_MUSA "../ggml-musa/mudnn.cuh")
 
     file(GLOB   GGML_SOURCES_MUSA "../ggml-cuda/*.cu")
     file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
     list(APPEND GGML_SOURCES_MUSA ${SRCS})
     file(GLOB   SRCS "../ggml-cuda/template-instances/mmq*.cu")
     list(APPEND GGML_SOURCES_MUSA ${SRCS})
+    file(GLOB   SRCS "../ggml-musa/*.cu")
+    list(APPEND GGML_SOURCES_MUSA ${SRCS})
 
     if (GGML_CUDA_FA_ALL_QUANTS)
         file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")

@@ -62,7 +65,9 @@ if (MUSAToolkit_FOUND)
     )
 
     # TODO: do not use CUDA definitions for MUSA
-    target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+    if (NOT GGML_BACKEND_DL)
+        target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+    endif()
 
     add_compile_definitions(GGML_USE_MUSA)
     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})

@@ -92,9 +97,10 @@ if (MUSAToolkit_FOUND)
     endif()
 
     if (GGML_STATIC)
+        # TODO: mudnn has not provided static libraries yet
        target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
     else()
-        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
+        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn)
     endif()
 
     if (GGML_CUDA_NO_VMM)
ggml/src/ggml-musa/mudnn.cu (new file, 112 lines)
@@ -0,0 +1,112 @@
#include <mutex>
#include <mudnn.h>

#include "mudnn.cuh"

namespace mudnn = musa::dnn;

// Returns a human-readable error string for mudnn::Status
const char* mudnnGetErrorString(mudnn::Status err) {
    switch (err) {
        case mudnn::Status::SUCCESS:
            return "Success";
        case mudnn::Status::INVALID_PARAMETER:
            return "Invalid parameter";
        case mudnn::Status::NOT_INITIALIZED:
            return "Not initialized";
        case mudnn::Status::ALLOC_FAILED:
            return "Allocation failed";
        case mudnn::Status::NOT_SUPPORTED:
            return "Not supported";
        case mudnn::Status::INTERNAL_ERROR:
            return "Internal error";
        case mudnn::Status::ARCH_MISMATCH:
            return "Architecture mismatch";
        case mudnn::Status::EXECUTION_FAILED:
            return "Execution failed";
        default:
            return "Unknown mudnn status";
    }
}

// Error checking macro for MUDNN calls
#define MUDNN_CHECK(err) CUDA_CHECK_GEN(err, mudnn::Status::SUCCESS, mudnnGetErrorString)

namespace {
    // Thread-safe cache for mudnn::Handle objects per device
    std::unordered_map<int, std::unique_ptr<mudnn::Handle>> handle_cache;
    std::mutex handle_cache_mutex;

    mudnn::Handle* get_cached_handle(int device_id) {
        std::lock_guard<std::mutex> lock(handle_cache_mutex);
        auto it = handle_cache.find(device_id);
        if (it != handle_cache.end()) {
            return it->second.get();
        }
        auto handle = std::make_unique<mudnn::Handle>(device_id);
        mudnn::Handle* handle_ptr = handle.get();
        handle_cache[device_id] = std::move(handle);
        return handle_ptr;
    }
}

// Extracts dimensions and strides from a ggml_tensor
int get_ggml_dims_and_strides(const ggml_tensor* tensor,
                              std::vector<int64_t>& dims,
                              std::vector<int64_t>& strides) {
    const int ndims = ggml_n_dims(tensor);
    const size_t element_size = ggml_element_size(tensor);

    dims.resize(ndims);
    strides.resize(ndims);

    for (int i = 0; i < ndims; ++i) {
        dims[i]    = tensor->ne[i];
        strides[i] = tensor->nb[i] / static_cast<int64_t>(element_size);
    }
    return ndims;
}

// Converts ggml_type to mudnn::Tensor::Type
mudnn::Tensor::Type ggml_type_to_mudnn_type(ggml_type type) {
    switch (type) {
        case GGML_TYPE_F32:
            return mudnn::Tensor::Type::FLOAT;
        case GGML_TYPE_F16:
            return mudnn::Tensor::Type::HALF;

        // TODO: Add support for other types

        default:
            MUDNN_CHECK(mudnn::Status::NOT_SUPPORTED);
    }

    return mudnn::Tensor::Type::FLOAT; // Default fallback
}

// Asynchronous memory copy using mudnn::Unary::IDENTITY
musaError_t mudnnMemcpyAsync(ggml_backend_cuda_context& ctx, const ggml_tensor* dst, const ggml_tensor* src) {
    mudnn::Tensor tensor_dst, tensor_src;

    MUDNN_CHECK(tensor_dst.SetType(ggml_type_to_mudnn_type(dst->type)));
    MUDNN_CHECK(tensor_src.SetType(ggml_type_to_mudnn_type(src->type)));

    std::vector<int64_t> dims, strides;
    const int ndims = get_ggml_dims_and_strides(src, dims, strides);

    MUDNN_CHECK(tensor_dst.SetNdInfo(ndims, dims.data(), strides.data()));
    MUDNN_CHECK(tensor_src.SetNdInfo(ndims, dims.data(), strides.data()));
    MUDNN_CHECK(tensor_dst.SetAddr(dst->data));
    MUDNN_CHECK(tensor_src.SetAddr(src->data));

    mudnn::Unary op;
    MUDNN_CHECK(op.SetMode(mudnn::Unary::Mode::IDENTITY));
    MUDNN_CHECK(op.SetAlpha(0.0f));
    MUDNN_CHECK(op.SetBeta(0.0f));

    mudnn::Handle* handle = get_cached_handle(ctx.device);
    MUDNN_CHECK(handle->SetStream(ctx.stream()));
    MUDNN_CHECK(op.Run(*handle, tensor_dst, tensor_src));

    return musaSuccess;
}
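A minimal caller-side sketch (not part of the diff) of how this helper slots into the CUDA/MUSA copy path; `ctx`, `dst` and `src` are assumed to come from the surrounding ggml-cuda code, and only F32/F16 tensors are handled, matching `ggml_type_to_mudnn_type` above:

```cpp
// sketch only: ctx, dst and src come from the surrounding ggml-cuda copy path;
// both tensors are assumed contiguous and of type GGML_TYPE_F32 or GGML_TYPE_F16
#ifdef GGML_USE_MUSA
musaError_t err = mudnnMemcpyAsync(ctx, dst, src); // runs a muDNN IDENTITY op on ctx.stream()
if (err != musaSuccess) {
    GGML_ABORT("mudnnMemcpyAsync failed");
}
#endif // GGML_USE_MUSA
```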
ggml/src/ggml-musa/mudnn.cuh (new file, 12 lines)
@@ -0,0 +1,12 @@
#pragma once

#include "../include/ggml.h"
#include "../ggml-cuda/common.cuh"

// Asynchronously copies data from src tensor to dst tensor using the provided context.
// Returns a musaError_t indicating success or failure.
musaError_t mudnnMemcpyAsync(
    ggml_backend_cuda_context &ctx,
    const ggml_tensor *dst,
    const ggml_tensor *src
);
@@ -4513,6 +4513,8 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
         return aligned ? mmp->a_m : mmp->m;
     }
     return aligned ? mmp->a_l : mmp->l;
+
+    GGML_UNUSED(src1_type);
 }
 
 static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type src0_type, ggml_type src1_type) {
@@ -1,6 +1,6 @@
 #version 450
 
-#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
 
 #include "dequant_head.comp"
@@ -7,7 +7,7 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
 #endif
 #if defined(DATA_A_IQ1_M)
-#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
 #endif
 
 #if defined(DATA_A_BF16) && defined(COOPMAT)
@@ -610,10 +610,12 @@ extern "C" {
 
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
+    DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
+        "Use llama_kv_self_seq_pos_max() instead");
 
     // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
+    DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
+        "Use llama_kv_self_seq_pos_max() instead");
 
     // Clear the KV cache - both cell info is erased and KV data is zeroed
     LLAMA_API void llama_kv_self_clear(
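For downstream code, the practical migration is the one the examples elsewhere in this compare already make: query per-sequence positions instead of global cell counts. A minimal sketch for the common single-sequence case (sequence id 0 assumed):

```cpp
// before (now deprecated):
//   int n_ctx_used = llama_kv_self_used_cells(ctx);
// after: use the highest position stored for sequence 0
int n_ctx      = llama_n_ctx(ctx);
int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0);
if (n_ctx_used + batch.n_tokens > n_ctx) {
    fprintf(stderr, "context size exceeded\n");
}
```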
@@ -1,5 +1,6 @@
 #include "llama-batch.h"
 
+#include <cassert>
 #include <cstring>
 #include <algorithm>
@@ -281,9 +282,10 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
     batch = in_batch;
     GGML_ASSERT(batch.n_tokens > 0);
     if (!batch.pos) {
+        assert(p0 >= 0);
         pos.resize(batch.n_tokens);
         for (int32_t i = 0; i < batch.n_tokens; i++) {
-            pos[i] = i + p0;
+            pos[i] = p0 + i;
         }
         batch.pos = pos.data();
     }
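In effect, when the caller supplies no positions they are synthesized as a contiguous run starting at `p0`; combined with the llama-context.cpp hunk below, `p0` is now `seq_pos_max(0) + 1`, so for example a 4-token batch appended after position 7 gets positions 8, 9, 10, 11. A tiny illustrative sketch (not from the diff):

```cpp
// illustrative sketch of the synthesized positions
llama_pos p0 = 8;                     // e.g. llama_kv_self_seq_pos_max(ctx, 0) + 1
std::vector<llama_pos> pos(4);
for (int32_t i = 0; i < 4; i++) {
    pos[i] = p0 + i;                  // 8, 9, 10, 11
}
```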
@@ -857,11 +857,17 @@ int llama_context::decode(llama_batch & inp_batch) {
         return -1;
     }
 
+    if (!inp_batch.pos) {
+        if (inp_batch.seq_id) {
+            LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__);
+            return -1;
+        }
+    }
+
     llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
 
     // temporary allocate memory for the input batch if needed
-    // TODO: this is incorrect for multiple sequences because get_pos_max() is the maximum across all sequences
-    llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->get_pos_max() + 1);
+    llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->seq_pos_max(0) + 1);
 
     const llama_batch & batch = batch_allocr.batch;
@@ -2292,22 +2298,47 @@ int32_t llama_apply_adapter_cvec(
 // kv cache
 //
 
+// deprecated
 int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
     const auto * kv = ctx->get_kv_self();
     if (!kv) {
         return 0;
     }
 
-    return kv->get_n_tokens();
+    int32_t res = 0;
+
+    for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
+        const llama_pos p0 = kv->seq_pos_min(s);
+        const llama_pos p1 = kv->seq_pos_max(s);
+
+        if (p0 >= 0) {
+            res += (p1 - p0) + 1;
+        }
+    }
+
+    return res;
 }
 
+// deprecated
+// note: this is the same as above - will be removed anyway, so it's ok
 int32_t llama_kv_self_used_cells(const llama_context * ctx) {
     const auto * kv = ctx->get_kv_self();
     if (!kv) {
         return 0;
     }
 
-    return kv->get_used_cells();
+    int32_t res = 0;
+
+    for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
+        const llama_pos p0 = kv->seq_pos_min(s);
+        const llama_pos p1 = kv->seq_pos_max(s);
+
+        if (p0 >= 0) {
+            res += (p1 - p0) + 1;
+        }
+    }
+
+    return res;
 }
 
 void llama_kv_self_clear(llama_context * ctx) {
@@ -1236,8 +1236,7 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified()
     auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);
 
     {
-        GGML_ASSERT(hparams.n_swa_pattern == 1 && "Use llama_kv_cache_unified_iswa for SWA");
-        GGML_ASSERT(hparams.n_swa == 0 && "Use llama_kv_cache_unified_iswa for SWA");
+        GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA");
 
         const auto n_kv = kv_self->get_n();
@@ -1312,8 +1311,8 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
         inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
     }
 
-    if (hparams.n_swa_pattern > 1) {
-        GGML_ASSERT(hparams.n_swa > 0 && "Use llama_kv_cache_unified for non-SWA");
+    {
+        GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified for non-SWA");
 
         const auto n_kv = kv_self->get_kv_swa()->get_n();
@@ -30,13 +30,14 @@ llama_kv_cache_unified::llama_kv_cache_unified(
         bool v_trans,
         bool offload,
         uint32_t kv_size,
-        uint32_t padding,
+        uint32_t n_seq_max,
+        uint32_t n_pad,
         uint32_t n_swa,
-        llama_swa_type swa_type) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding), n_swa(n_swa), swa_type(swa_type) {
-    GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding");
+        llama_swa_type swa_type) :
+    model(model), hparams(model.hparams), v_trans(v_trans),
+    n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
 
-    this->type_k = type_k;
-    this->type_v = type_v;
+    GGML_ASSERT(kv_size % n_pad == 0);
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;

@@ -129,8 +130,8 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     const size_t memory_size_k = size_k_bytes();
     const size_t memory_size_v = size_v_bytes();
 
-    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6d cells, %3d layers), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
-            (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(),
+    LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+            (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max,
             ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
             ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
 }

@@ -442,7 +443,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
 void llama_kv_cache_unified::defrag_sched(float thold) {
     // - do not defrag small contexts (i.e. < 2048 tokens)
     // - count the padding towards the number of used tokens
-    const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + padding)/n)) : 0.0f;
+    const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + n_pad)/n)) : 0.0f;
 
     // queue defragmentation for next llama_kv_cache_update
     if (fragmentation > thold) {

@@ -558,7 +559,7 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
     // a heuristic, to avoid attending the full cache if it is not yet utilized
     // after enough generations, the benefit from this heuristic disappears
     // if we start defragmenting the cache, the benefit from this will be more important
-    n = std::min(size, std::max(padding, GGML_PAD(cell_max(), padding)));
+    n = std::min(size, std::max(n_pad, GGML_PAD(cell_max(), n_pad)));
 
 #ifdef FIND_SLOT_DEBUG
     LLAMA_LOG_WARN("end: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);

@@ -567,20 +568,6 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
     return true;
 }
 
-int32_t llama_kv_cache_unified::get_n_tokens() const {
-    int32_t result = 0;
-
-    for (uint32_t i = 0; i < size; i++) {
-        result += cells[i].seq_id.size();
-    }
-
-    return result;
-}
-
-int32_t llama_kv_cache_unified::get_used_cells() const {
-    return used;
-}
-
 bool llama_kv_cache_unified::get_can_shift() const {
     return true;
 }

@@ -802,16 +789,6 @@ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama
     }
 }
 
-llama_pos llama_kv_cache_unified::get_pos_max() const {
-    llama_pos pos_max = -1;
-
-    for (const auto & cell : cells) {
-        pos_max = std::max(pos_max, cell.pos);
-    }
-
-    return pos_max;
-}
-
 size_t llama_kv_cache_unified::total_size() const {
     size_t size = 0;
 

@@ -1501,11 +1478,8 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
             llama_seq_id seq_id;
             io.read_to(&seq_id, sizeof(seq_id));
 
-            // TODO: llama_kv_cache_unified should have a notion of max sequences
-            //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
-            if (seq_id < 0) {
-                //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
-                LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
+            if (seq_id < 0 || (uint32_t) seq_id >= n_seq_max) {
+                LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, n_seq_max);
                 return false;
             }

@@ -1655,17 +1629,17 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
         ggml_type type_v,
         bool v_trans,
         bool offload,
-        uint32_t kv_size,
         bool swa_full,
+        uint32_t kv_size,
         uint32_t n_seq_max,
         uint32_t n_batch,
-        uint32_t padding) : hparams(model.hparams) {
+        uint32_t n_pad) : hparams(model.hparams) {
     llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
     llama_kv_cache_unified::layer_filter_cb filter_swa  = [&](int32_t il) { return  model.hparams.is_swa(il); };
 
     const uint32_t size_base = kv_size;
 
-    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, padding));
+    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, n_pad));
 
     // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size and disable pruning
     if (swa_full) {

@@ -1680,14 +1654,14 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
 
     kv_base = std::make_unique<llama_kv_cache_unified>(
             model, std::move(filter_base), type_k, type_v,
-            v_trans, offload, size_base, padding,
+            v_trans, offload, size_base, n_seq_max, n_pad,
             0, LLAMA_SWA_TYPE_NONE);
 
     LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
 
     kv_swa = std::make_unique<llama_kv_cache_unified>(
             model, std::move(filter_swa), type_k, type_v,
-            v_trans, offload, size_swa, padding,
+            v_trans, offload, size_swa, n_seq_max, n_pad,
             hparams.n_swa, hparams.swa_type);
 }

@@ -1810,18 +1784,6 @@ bool llama_kv_cache_unified_iswa::find_slot(const llama_ubatch & batch) {
     return res;
 }
 
-int32_t llama_kv_cache_unified_iswa::get_n_tokens() const {
-    return kv_base->get_n_tokens();
-}
-
-int32_t llama_kv_cache_unified_iswa::get_used_cells() const {
-    return kv_base->get_used_cells();
-}
-
-llama_pos llama_kv_cache_unified_iswa::get_pos_max() const {
-    return kv_base->get_pos_max();
-}
-
 bool llama_kv_cache_unified_iswa::get_can_shift() const {
     return kv_base->get_size() == kv_swa->get_size();
 }

@@ -1853,19 +1815,17 @@ llama_kv_cache_recurrent::llama_kv_cache_recurrent(
         ggml_type type_k,
         ggml_type type_v,
         bool offload,
-        uint32_t kv_size) : hparams(model.hparams) {
+        uint32_t kv_size,
+        uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
     const int32_t n_layer = hparams.n_layer;
 
-    LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d\n",
-            __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
+    LLAMA_LOG_INFO("%s: kv_size = %u, n_seq_max = %u, type_k = '%s', type_v = '%s', n_layer = %d\n",
+            __func__, kv_size, n_seq_max, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
 
     head = 0;
     size = kv_size;
     used = 0;
 
-    this->type_k = type_k;
-    this->type_v = type_v;
-
     cells.clear();
     cells.resize(kv_size);
 

@@ -2203,8 +2163,8 @@ void llama_kv_cache_recurrent::commit() {
     pending.ranges.clear();
 }
 
-bool llama_kv_cache_recurrent::update(llama_context & lctx) {
-    GGML_UNUSED(lctx);
+bool llama_kv_cache_recurrent::update(llama_context & ctx) {
+    GGML_UNUSED(ctx);
     return false;
 }
 

@@ -2265,7 +2225,7 @@ bool llama_kv_cache_recurrent::find_slot(
             if (seq_id < 0 || (uint32_t) seq_id >= size) {
                 // too big seq_id
                 // TODO: would it be possible to resize the cache instead?
-                LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size);
+                LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%u Try using a bigger --parallel value\n", __func__, seq_id, n_seq_max);
                 return false;
             }
             if (j > 0) {

@@ -2408,29 +2368,6 @@ bool llama_kv_cache_recurrent::find_slot(
     return n >= n_seqs;
 }
 
-int32_t llama_kv_cache_recurrent::get_n_tokens() const {
-    int32_t result = 0;
-
-    for (uint32_t i = 0; i < size; i++) {
-        result += cells[i].seq_id.size();
-    }
-
-    return result;
-}
-
-int32_t llama_kv_cache_recurrent::get_used_cells() const {
-    return used;
-}
-
-llama_pos llama_kv_cache_recurrent::get_pos_max() const {
-    llama_pos pos_max = -1;
-    for (const auto & cell : cells) {
-        pos_max = std::max(pos_max, cell.pos);
-    }
-
-    return pos_max;
-}
-
 bool llama_kv_cache_recurrent::get_can_shift() const {
     return false;
 }
@@ -55,10 +55,7 @@ struct llama_kv_cache : public llama_memory_i {
     // =============================================================================================================
 
     // getters
-    virtual int32_t   get_n_tokens()   const = 0;
-    virtual int32_t   get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
-    virtual llama_pos get_pos_max()    const = 0;
     virtual bool get_can_shift() const = 0;
 
     bool get_can_edit() const override { return get_can_shift(); }
 

@@ -108,7 +105,8 @@ public:
             bool v_trans,
             bool offload,
             uint32_t kv_size,
-            uint32_t padding,
+            uint32_t n_seq_max,
+            uint32_t n_pad,
             uint32_t n_swa,
             llama_swa_type swa_type);

@@ -150,12 +148,6 @@ public:
     // to the first cell of the slot.
     bool find_slot(const llama_ubatch & batch) override;
 
-    int32_t get_n_tokens() const override;
-    int32_t get_used_cells() const override;
-
-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos get_pos_max() const override;
-
     bool get_can_shift() const override;
 
     // state write/load

@@ -228,16 +220,15 @@ private:
     // computed before each graph build
     uint32_t n = 0;
 
-    // required padding
-    uint32_t padding = 1;
+    const uint32_t n_seq_max = 1;
 
-    ggml_type type_k = GGML_TYPE_F16;
-    ggml_type type_v = GGML_TYPE_F16;
+    // required padding
+    const uint32_t n_pad = 1;
 
     // SWA
-    uint32_t n_swa = 0;
+    const uint32_t n_swa = 0;
 
-    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+    const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
 
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;

@@ -317,11 +308,11 @@ public:
             ggml_type type_v,
             bool v_trans,
             bool offload,
-            uint32_t kv_size,
             bool swa_full,
+            uint32_t kv_size,
             uint32_t n_seq_max,
             uint32_t n_batch,
-            uint32_t padding);
+            uint32_t n_pad);
 
     ~llama_kv_cache_unified_iswa() = default;

@@ -358,12 +349,6 @@ public:
 
     bool find_slot(const llama_ubatch & batch) override;
 
-    int32_t get_n_tokens() const override;
-    int32_t get_used_cells() const override;
-
-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos get_pos_max() const override;
-
     bool get_can_shift() const override;
 
     // state write/load

@@ -432,7 +417,8 @@ public:
             ggml_type type_k,
             ggml_type type_v,
             bool offload,
-            uint32_t kv_size);
+            uint32_t kv_size,
+            uint32_t n_seq_max);
 
     ~llama_kv_cache_recurrent() = default;

@@ -444,7 +430,7 @@ public:
     bool seq_rm  (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
     void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
     void seq_keep(llama_seq_id seq_id) override;
     void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
     void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;

@@ -458,7 +444,7 @@ public:
     void restore() override;
     void commit() override;
 
-    bool update(llama_context & lctx) override;
+    bool update(llama_context & ctx) override;
 
     void defrag_sched(float thold) override;

@@ -469,12 +455,6 @@ public:
 
     bool find_slot(const llama_ubatch & batch) override;
 
-    int32_t get_n_tokens() const override;
-    int32_t get_used_cells() const override;
-
-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos get_pos_max() const override;
-
     bool get_can_shift() const override;
 
     // TODO: temporary methods - they are not really const as they do const_cast<>, fix this

@@ -514,8 +494,7 @@ private:
         std::vector<slot_range> ranges;
     } pending;
 
-    ggml_type type_k = GGML_TYPE_F16;
-    ggml_type type_v = GGML_TYPE_F16;
+    const uint32_t n_seq_max = 1;
 
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
@@ -853,43 +853,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
 
-                // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
-                if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
-                    // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
-                    LLAMA_LOG_WARN("%s: assuming n_swa = 2047 for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct\n", __func__);
-
-                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-
-                    hparams.n_swa = 2047;
-                } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
-                    // default value for Phi-3-mini-128k-instruct
-                    LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-mini-128k-instruct\n", __func__);
-
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-
-                    hparams.n_swa = hparams.n_ctx_train;
-                    hparams.n_swa_pattern = 1;
-                } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
-                    // default value for Phi-3-medium-128k-instruct
-                    LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-medium-128k-instruct\n", __func__);
-
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-
-                    hparams.n_swa = hparams.n_ctx_train;
-                    hparams.n_swa_pattern = 1;
-                }
-
-                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                if (!found_swa && hparams.n_swa == 0) {
-                    throw std::runtime_error("invalid value for sliding_window");
-                }
-
-                if (hparams.n_swa > hparams.n_ctx_train) {
-                    LLAMA_LOG_WARN("%s: unexpected n_swa: %d >= %d, disabling SWA\n", __func__, hparams.n_swa, hparams.n_ctx_train);
-
-                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
-
-                    hparams.n_swa = hparams.n_ctx_train;
-                    hparams.n_swa_pattern = 1;
-                }
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+                if (found_swa && hparams.n_swa > 0) {
+                    LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n",
+                            __func__, "https://github.com/ggml-org/llama.cpp/pull/13676");
+
+                    // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern`
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+
+                    hparams.n_swa = 0;
+                    hparams.n_swa_pattern = 1;
+                }
             } break;

@@ -7368,8 +7341,9 @@ struct llm_build_phi2 : public llm_graph_context {
     }
 };
 
-struct llm_build_phi3_iswa : public llm_graph_context {
-    llm_build_phi3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+template<bool iswa>
+struct llm_build_phi3 : public llm_graph_context {
+    llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

@@ -7383,7 +7357,14 @@ struct llm_build_phi3_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified_iswa();
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+        inp_attn_type * inp_attn = nullptr;
+
+        if constexpr (iswa) {
+            inp_attn = build_attn_inp_kv_unified_iswa();
+        } else {
+            inp_attn = build_attn_inp_kv_unified();
+        }
 
         for (int il = 0; il < n_layer; ++il) {
             auto * residual = inpL;
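The dispatch idiom used here (one templated graph builder, with the input type and its construction chosen at compile time) is easy to miss in the diff. A stripped-down sketch of the same pattern, with hypothetical types standing in for the graph-input classes:

```cpp
#include <type_traits>

struct input_swa  { };   // stands in for llm_graph_input_attn_kv_unified_iswa
struct input_full { };   // stands in for llm_graph_input_attn_kv_unified

static input_swa  * build_swa_input () { return new input_swa (); }
static input_full * build_full_input() { return new input_full(); }

template <bool iswa>
void build_graph_sketch() {
    // the pointer type is selected at compile time...
    using inp_t = std::conditional_t<iswa, input_swa, input_full>;
    inp_t * inp = nullptr;

    // ...and only the matching branch is instantiated
    if constexpr (iswa) {
        inp = build_swa_input();
    } else {
        inp = build_full_input();
    }
    delete inp;
}
```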
@@ -13222,7 +13203,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                         GGML_TYPE_F32,
                         GGML_TYPE_F32,
                         cparams.offload_kqv,
-                        std::max((uint32_t) 1, cparams.n_seq_max));
+                        std::max((uint32_t) 1, cparams.n_seq_max),
+                        cparams.n_seq_max);
                 } break;
             default:
                 {

@@ -13232,19 +13214,23 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
 
-                    if (hparams.n_swa > 0) {
+                    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                        GGML_ASSERT(hparams.n_swa_pattern != 1);
+
                         res = new llama_kv_cache_unified_iswa(
                                 *this,
                                 params.type_k,
                                 params.type_v,
                                 !cparams.flash_attn,
                                 cparams.offload_kqv,
-                                cparams.n_ctx,
                                 params.swa_full,
+                                cparams.n_ctx,
                                 cparams.n_seq_max,
                                 cparams.n_batch,
                                 padding);
                     } else {
+                        GGML_ASSERT(hparams.n_swa_pattern == 1);
+
                         res = new llama_kv_cache_unified(
                                 *this,
                                 nullptr,

@@ -13253,6 +13239,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                                 !cparams.flash_attn,
                                 cparams.offload_kqv,
                                 cparams.n_ctx,
+                                cparams.n_seq_max,
                                 padding,
                                 hparams.n_swa,
                                 hparams.swa_type);

@@ -13353,7 +13340,11 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_PHI3:
         case LLM_ARCH_PHIMOE:
             {
-                llm = std::make_unique<llm_build_phi3_iswa>(*this, params, gf);
+                if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+                    llm = std::make_unique<llm_build_phi3<true>> (*this, params, gf);
+                } else {
+                    llm = std::make_unique<llm_build_phi3<false>>(*this, params, gf);
+                }
             } break;
         case LLM_ARCH_PLAMO:
             {
@@ -231,12 +231,14 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
         while (i < n_tokens) { // split into batches
             text_batch.n_tokens = 0; // clear the batch
             for (; i < n_tokens && text_batch.n_tokens < n_batch; i++) {
-                text_batch.token   [i] = tokens[i];
-                text_batch.pos     [i] = n_past++;
-                text_batch.n_seq_id[i] = 1;
-                text_batch.seq_id  [i][0] = seq_id;
-                text_batch.logits  [i] = false;
-                text_batch.n_tokens++;
+                int32_t j = text_batch.n_tokens;
+                text_batch.token   [j] = tokens[i];
+                text_batch.pos     [j] = n_past++;
+                text_batch.n_seq_id[j] = 1;
+                text_batch.seq_id  [j][0] = seq_id;
+                text_batch.logits  [j] = false;
+
+                text_batch.n_tokens++;
             }
             bool is_last_token = (i == n_tokens);
             if (logits_last && is_last_token) {
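The fix matters as soon as a text chunk spans more than one batch: `i` keeps counting across the whole token list while the batch is cleared between iterations, so writes must go to the batch-local slot `j`. A stripped-down sketch of the indexing (plain ints instead of the mtmd/llama batch types):

```cpp
// sketch: 10 tokens split into batches of 4
const int n_tokens = 10, n_batch = 4;
int slot_of_token[4];                              // stands in for text_batch.token/pos/...
int i = 0;
while (i < n_tokens) {
    int batch_n = 0;                               // text_batch.n_tokens = 0
    for (; i < n_tokens && batch_n < n_batch; i++) {
        const int j = batch_n;                     // batch-local slot: always 0..n_batch-1
        slot_of_token[j] = i;                      // writing slot_of_token[i] would overflow
                                                   // once i >= n_batch (second batch onwards)
        batch_n++;                                 // text_batch.n_tokens++
    }
}
```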
@@ -936,7 +936,7 @@ static int apply_chat_template(const struct common_chat_templates * tmpls, Llama
 // Function to tokenize the prompt
 static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
                            std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
-    const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0;
+    const bool is_first = llama_kv_self_seq_pos_max(llama_data.context.get(), 0) == 0;
 
     const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
     prompt_tokens.resize(n_prompt_tokens);

@@ -952,7 +952,7 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt
 // Check if we have enough space in the context to evaluate this batch
 static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
     const int n_ctx = llama_n_ctx(ctx.get());
-    const int n_ctx_used = llama_kv_self_used_cells(ctx.get());
+    const int n_ctx_used = llama_kv_self_seq_pos_max(ctx.get(), 0);
     if (n_ctx_used + batch.n_tokens > n_ctx) {
         printf(LOG_COL_DEFAULT "\n");
         printe("context size exceeded\n");
@@ -951,7 +951,7 @@ struct server_task_result_cmpl_partial : server_task_result {
     }
 
     json to_json_oaicompat_chat() {
-        bool first = n_decoded == 0;
+        bool first = n_decoded == 1;
         std::time_t t = std::time(0);
         json choices;

@@ -962,15 +962,18 @@ struct server_task_result_cmpl_partial : server_task_result {
                 {"delta", json{{"role", "assistant"}}}}});
         } else {
             // We have to send this as two updates to conform to openai behavior
+            // initial_ret is the role message for stream=True
             json initial_ret = json{{"choices", json::array({json{
                         {"finish_reason", nullptr},
                         {"index", 0},
                         {"delta", json{
-                            {"role", "assistant"}
+                            {"role", "assistant"},
+                            {"content", ""}
                         }}}})},
                     {"created", t},
                     {"id", oaicompat_cmpl_id},
                     {"model", oaicompat_model},
                     {"system_fingerprint", build_info},
                     {"object", "chat.completion.chunk"}};
 
             json second_ret = json{

@@ -982,8 +985,19 @@ struct server_task_result_cmpl_partial : server_task_result {
                     {"created", t},
                     {"id", oaicompat_cmpl_id},
                     {"model", oaicompat_model},
                     {"system_fingerprint", build_info},
                     {"object", "chat.completion.chunk"}};
 
+            if (prob_output.probs.size() > 0) {
+                second_ret["choices"][0]["logprobs"] = json{
+                    {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
+                };
+            }
+
+            if (timings.prompt_n >= 0) {
+                second_ret.push_back({"timings", timings.to_json()});
+            }
+
             return std::vector<json>({initial_ret, second_ret});
         }
     } else {
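With this change the very first streamed chunk carries both the role and an empty content delta, so clients that key off `delta.content` see a string from the start. Roughly, its shape is the following; this is a sketch using the same nlohmann::json style as server.cpp, with placeholder id/model values:

```cpp
// sketch of the first streamed chunk produced for stream=true
json first_chunk = {
    {"object",  "chat.completion.chunk"},
    {"id",      "chatcmpl-..."},                 // placeholder
    {"model",   "gpt-3.5-turbo"},                // placeholder (default OAI-compat model name)
    {"choices", json::array({json{
        {"index",         0},
        {"finish_reason", nullptr},
        {"delta", json{{"role", "assistant"}, {"content", ""}}}
    }})}
};
```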
@@ -1137,9 +1151,6 @@ struct server_task_result_metrics : server_task_result {
     int n_tasks_deferred;
     int64_t t_start;
 
-    int32_t kv_cache_tokens_count;
-    int32_t kv_cache_used_cells;
-
     // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields
     uint64_t n_prompt_tokens_processed_total = 0;
     uint64_t t_prompt_processing_total = 0;

@@ -1179,9 +1190,6 @@ struct server_task_result_metrics : server_task_result {
             { "n_decode_total", n_decode_total },
             { "n_busy_slots_total", n_busy_slots_total },
 
-            { "kv_cache_tokens_count", kv_cache_tokens_count },
-            { "kv_cache_used_cells", kv_cache_used_cells },
-
             { "slots", slots_data },
         };
     }

@@ -2771,9 +2779,6 @@ struct server_context {
             res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
             res->t_start = metrics.t_start;
 
-            res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
-            res->kv_cache_used_cells = llama_kv_self_used_cells(ctx);
-
             res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
             res->t_prompt_processing_total = metrics.t_prompt_processing_total;
             res->n_tokens_predicted_total = metrics.n_tokens_predicted_total;

@@ -3702,6 +3707,7 @@ int main(int argc, char ** argv) {
         "/health",
         "/models",
         "/v1/models",
+        "/api/tags"
     };
 
     // If API key is not set, skip validation

@@ -3740,7 +3746,7 @@ int main(int argc, char ** argv) {
         if (req.path == "/" || tmp.back() == "html") {
             res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
             res.status = 503;
-        } else if (req.path == "/models" || req.path == "/v1/models") {
+        } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") {
             // allow the models endpoint to be accessed during loading
             return true;
         } else {

@@ -3883,14 +3889,6 @@ int main(int argc, char ** argv) {
                     {"name", "predicted_tokens_seconds"},
                     {"help", "Average generation throughput in tokens/s."},
                     {"value", res_metrics->n_tokens_predicted ? 1.e3 / res_metrics->t_tokens_generation * res_metrics->n_tokens_predicted : 0.}
             },{
-                    {"name", "kv_cache_usage_ratio"},
-                    {"help", "KV-cache usage. 1 means 100 percent usage."},
-                    {"value", 1. * res_metrics->kv_cache_used_cells / params.n_ctx}
-            },{
-                    {"name", "kv_cache_tokens"},
-                    {"help", "KV-cache tokens."},
-                    {"value", (uint64_t) res_metrics->kv_cache_tokens_count}
-            },{
                     {"name", "requests_processing"},
                     {"help", "Number of requests processing."},

@@ -4086,6 +4084,19 @@ int main(int argc, char ** argv) {
                 { "llama.context_length", ctx_server.slots.back().n_ctx, },
             }
         },
+        {"modelfile", ""},
+        {"parameters", ""},
+        {"template", common_chat_templates_source(ctx_server.chat_templates.get())},
+        {"details", {
+            {"parent_model", ""},
+            {"format", "gguf"},
+            {"family", ""},
+            {"families", {""}},
+            {"parameter_size", ""},
+            {"quantization_level", ""}
+        }},
+        {"model_info", ""},
+        {"capabilities", {"completion"}}
     };
 
     res_ok(res, data);

@@ -4411,6 +4422,28 @@ int main(int argc, char ** argv) {
         }
 
         json models = {
+            {"models", {
+                {
+                    {"name", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"model", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"modified_at", ""},
+                    {"size", ""},
+                    {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash
+                    {"type", "model"},
+                    {"description", ""},
+                    {"tags", {""}},
+                    {"capabilities", {"completion"}},
+                    {"parameters", ""},
+                    {"details", {
+                        {"parent_model", ""},
+                        {"format", "gguf"},
+                        {"family", ""},
+                        {"families", {""}},
+                        {"parameter_size", ""},
+                        {"quantization_level", ""}
+                    }}
+                }
+            }},
             {"object", "list"},
             {"data", {
                 {

@@ -4420,7 +4453,7 @@ int main(int argc, char ** argv) {
                     {"owned_by", "llamacpp"},
                     {"meta", model_meta},
                 },
-            }}
+            }}
         };
 
         res_ok(res, models);

@@ -4748,11 +4781,13 @@ int main(int argc, char ** argv) {
     svr->Post("/api/show", handle_api_show);
     svr->Get ("/models", handle_models); // public endpoint (no API key check)
     svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
+    svr->Get ("/api/tags", handle_models); // ollama specific endpoint. public endpoint (no API key check)
     svr->Post("/completion", handle_completions); // legacy
     svr->Post("/completions", handle_completions);
     svr->Post("/v1/completions", handle_completions_oai);
     svr->Post("/chat/completions", handle_chat_completions);
     svr->Post("/v1/chat/completions", handle_chat_completions);
+    svr->Post("/api/chat", handle_chat_completions); // ollama specific endpoint
     svr->Post("/infill", handle_infill);
     svr->Post("/embedding", handle_embeddings); // legacy
     svr->Post("/embeddings", handle_embeddings);
@@ -71,8 +71,14 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
     })
     content = ""
     last_cmpl_id = None
-    for data in res:
+    for i, data in enumerate(res):
         choice = data["choices"][0]
+        if i == 0:
+            # Check first role message for stream=True
+            assert choice["delta"]["content"] == ""
+            assert choice["delta"]["role"] == "assistant"
+        else:
+            assert "role" not in choice["delta"]
         assert data["system_fingerprint"].startswith("b")
         assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
         if last_cmpl_id is None:

@@ -242,12 +248,18 @@ def test_chat_completion_with_timings_per_token():
         "stream": True,
         "timings_per_token": True,
     })
-    for data in res:
-        assert "timings" in data
-        assert "prompt_per_second" in data["timings"]
-        assert "predicted_per_second" in data["timings"]
-        assert "predicted_n" in data["timings"]
-        assert data["timings"]["predicted_n"] <= 10
+    for i, data in enumerate(res):
+        if i == 0:
+            # Check first role message for stream=True
+            assert data["choices"][0]["delta"]["content"] == ""
+            assert data["choices"][0]["delta"]["role"] == "assistant"
+        else:
+            assert "role" not in data["choices"][0]["delta"]
+            assert "timings" in data
+            assert "prompt_per_second" in data["timings"]
+            assert "predicted_per_second" in data["timings"]
+            assert "predicted_n" in data["timings"]
+            assert data["timings"]["predicted_n"] <= 10
 
 
 def test_logprobs():

@@ -295,17 +307,23 @@ def test_logprobs_stream():
     )
     output_text = ''
     aggregated_text = ''
-    for data in res:
+    for i, data in enumerate(res):
         choice = data.choices[0]
-        if choice.finish_reason is None:
-            if choice.delta.content:
-                output_text += choice.delta.content
-            assert choice.logprobs is not None
-            assert choice.logprobs.content is not None
-            for token in choice.logprobs.content:
-                aggregated_text += token.token
-                assert token.logprob <= 0.0
-                assert token.bytes is not None
-                assert token.top_logprobs is not None
-                assert len(token.top_logprobs) > 0
+        if i == 0:
+            # Check first role message for stream=True
+            assert choice.delta.content == ""
+            assert choice.delta.role == "assistant"
+        else:
+            assert choice.delta.role is None
+            if choice.finish_reason is None:
+                if choice.delta.content:
+                    output_text += choice.delta.content
+                assert choice.logprobs is not None
+                assert choice.logprobs.content is not None
+                for token in choice.logprobs.content:
+                    aggregated_text += token.token
+                    assert token.logprob <= 0.0
+                    assert token.bytes is not None
+                    assert token.top_logprobs is not None
+                    assert len(token.top_logprobs) > 0
     assert aggregated_text == output_text