Compare commits

...

9 Commits
b7009 ... b7018

Author SHA1 Message Date
lhez
ece0f5c177 opencl: add fastdiv and use it in set_rows, ported from cuda (#17090)
* opencl: add fastdiv for mm q8_0

* opencl: use uint4 for fastdiv vals

* opencl: use fastdiv for set_rows

* opencl: do not use fastdiv for q8_0 mm
2025-11-10 15:00:13 -08:00
Sigbjørn Skjæret
7bef684118 models : move build_inp_out_ids outside loop (#17151)
* move build_inp_out_ids outside loop

* realign
2025-11-10 22:55:30 +01:00
Max Krasnyansky
395e286bc9 cpu: skip NOPs to avoid barriers (#17133)
* cpu: skip NOPs to avoid barriers

* cpu: use ggml_op_is_empty
2025-11-10 12:44:49 -08:00
Georgi Gerganov
13730c183b metal : cap threadgroups size of set_rows (#17146) 2025-11-10 21:33:35 +02:00
Adrien Gallouët
967eb4b2bf ggml-cpu : inspect -march and -mcpu to found the CPU (#16333)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2025-11-10 21:03:36 +02:00
Ruben Ortlam
f117be185e vulkan: check glslc executable string (#17144) 2025-11-10 16:59:26 +01:00
Ruben Ortlam
85234a4b3a vulkan: fix validation issue introduced by #16868 (#17145) 2025-11-10 16:59:10 +01:00
Gabe Goodhart
0c74f32632 memory: Hybrid context shift (#17009)
* feat(memory): Only fail partial erasure of recurrent tail

The recurrent state is always assumed to be the state as of the last update
from the final token in the sequence. When doing a partial erasure, if the
range does not include the final token, the erasure can be considered a
success since any memory used for the sequence prior to the final token
(which is no memory) has been successfully removed.

There is one potential case that this doesn't address which is the pruning
of cache to remove sensitive data from the context. This wouldn't work for
attention cache partial removal (in the middle) either since the KV state
is linearly-dependent and states in later sequence positions would still be
based on the state from the sensitive data, even if that data is no longer
cached, so I don't think this is relevant, but it is worth noting that the
semantics of this change for a partial erasure in the middle of the cache
are essentially "my context is already compressed" and not "all trace of
the removed tokens has been removed."

https://github.com/ggml-org/llama.cpp/issues/16768
Branch: HybridContextShift-16768

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(main): Check the output of seq_rm for prefix matching

This prefix matching is explicitly attempting to remove the tokens at the
end of the sequence that don't match. This is the operation that can't be
performed on a recurrent cache due to the state being updated in place, so
if this removal fails, we need to clear the whole cache.

https://github.com/ggml-org/llama.cpp/issues/16768
Branch: HybridContextShift-16768

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(memory): Fix condition for partial erasure failure if p0 > pos

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

Co-authored-by: compilade <git@compilade.net>

* style: Fix extra parens

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix(main.cpp): Set n_matching_session_tokens to 0 on cache clear

https://github.com/ggml-org/llama.cpp/issues/16768
Branch: HybridContextShift-16768

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Co-authored-by: compilade <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-11-10 17:14:23 +02:00
Georgi Gerganov
c27efd2bd1 metal : enable tensor API for A19 (#17087) 2025-11-10 15:38:42 +02:00
14 changed files with 145 additions and 102 deletions

View File

@@ -126,25 +126,36 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
)
if (NOT ARM_MCPU_RESULT)
string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}")
string(REGEX MATCH "-march=[^ ']+" ARM_MARCH_FLAG "${ARM_MCPU}")
# on some old GCC we need to read -march=
if (ARM_MARCH_FLAG AND NOT "${ARM_MARCH_FLAG}" STREQUAL "-march=native")
set(ARM_NATIVE_FLAG "${ARM_MARCH_FLAG}")
elseif(ARM_MCPU_FLAG AND NOT "${ARM_MCPU_FLAG}" STREQUAL "-mcpu=native")
set(ARM_NATIVE_FLAG "${ARM_MCPU_FLAG}")
endif()
endif()
if ("${ARM_MCPU_FLAG}" STREQUAL "")
set(ARM_MCPU_FLAG -mcpu=native)
message(STATUS "ARM -mcpu not found, -mcpu=native will be used")
if ("${ARM_NATIVE_FLAG}" STREQUAL "")
set(ARM_NATIVE_FLAG -mcpu=native)
message(WARNING "ARM -march/-mcpu not found, -mcpu=native will be used")
else()
message(STATUS "ARM detected flags: ${ARM_NATIVE_FLAG}")
endif()
include(CheckCXXSourceRuns)
function(check_arm_feature tag code)
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+${tag}")
set(CMAKE_REQUIRED_FLAGS "${ARM_NATIVE_FLAG}+${tag}")
check_cxx_source_runs("${code}" GGML_MACHINE_SUPPORTS_${tag})
if (GGML_MACHINE_SUPPORTS_${tag})
set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+${tag}" PARENT_SCOPE)
set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+${tag}" PARENT_SCOPE)
else()
set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+no${tag}")
set(CMAKE_REQUIRED_FLAGS "${ARM_NATIVE_FLAG}+no${tag}")
check_cxx_source_compiles("int main() { return 0; }" GGML_MACHINE_SUPPORTS_no${tag})
if (GGML_MACHINE_SUPPORTS_no${tag})
set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE)
set(ARM_NATIVE_FLAG_FIX "${ARM_NATIVE_FLAG_FIX}+no${tag}" PARENT_SCOPE)
endif()
endif()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
@@ -155,7 +166,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
check_arm_feature(sve "#include <arm_sve.h>\nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
check_arm_feature(sme "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")
list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}")
list(APPEND ARCH_FLAGS "${ARM_NATIVE_FLAG}${ARM_NATIVE_FLAG_FIX}")
else()
if (GGML_CPU_ARM_ARCH)
list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})

View File

@@ -1807,22 +1807,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_cont(params, tensor);
} break;
case GGML_OP_RESHAPE:
{
ggml_compute_forward_reshape(params, tensor);
} break;
case GGML_OP_VIEW:
{
ggml_compute_forward_view(params, tensor);
} break;
case GGML_OP_PERMUTE:
{
ggml_compute_forward_permute(params, tensor);
} break;
case GGML_OP_TRANSPOSE:
{
ggml_compute_forward_transpose(params, tensor);
} break;
case GGML_OP_GET_ROWS:
{
ggml_compute_forward_get_rows(params, tensor);
@@ -2042,6 +2026,22 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
// nop
} break;
case GGML_OP_RESHAPE:
{
// nop
} break;
case GGML_OP_PERMUTE:
{
// nop
} break;
case GGML_OP_VIEW:
{
// nop
} break;
case GGML_OP_TRANSPOSE:
{
// nop
} break;
case GGML_OP_COUNT:
{
GGML_ABORT("fatal error");
@@ -2884,6 +2884,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
struct ggml_tensor * node = cgraph->nodes[node_n];
if (ggml_op_is_empty(node->op)) {
// skip NOPs
continue;
}
ggml_compute_forward(&params, node);
if (state->ith == 0 && cplan->abort_callback &&

View File

@@ -4455,46 +4455,6 @@ void ggml_compute_forward_cont(
ggml_compute_forward_dup(params, dst);
}
// ggml_compute_forward_reshape
void ggml_compute_forward_reshape(
const ggml_compute_params * params,
ggml_tensor * dst) {
// NOP
GGML_UNUSED(params);
GGML_UNUSED(dst);
}
// ggml_compute_forward_view
void ggml_compute_forward_view(
const ggml_compute_params * params,
ggml_tensor * dst) {
// NOP
GGML_UNUSED(params);
GGML_UNUSED(dst);
}
// ggml_compute_forward_permute
void ggml_compute_forward_permute(
const ggml_compute_params * params,
ggml_tensor * dst) {
// NOP
GGML_UNUSED(params);
GGML_UNUSED(dst);
}
// ggml_compute_forward_transpose
void ggml_compute_forward_transpose(
const ggml_compute_params * params,
ggml_tensor * dst) {
// NOP
GGML_UNUSED(params);
GGML_UNUSED(dst);
}
// ggml_compute_forward_get_rows
static void ggml_compute_forward_get_rows_q(

View File

@@ -51,10 +51,6 @@ void ggml_compute_forward_scale(const struct ggml_compute_params * params, struc
void ggml_compute_forward_set(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_cpy(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_cont(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_reshape(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_view(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_permute(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);

View File

@@ -564,8 +564,10 @@ ggml_metal_device_t ggml_metal_device_init(void) {
// TODO: try to update the tensor API kernels to at least match the simdgroup performance
if (getenv("GGML_METAL_TENSOR_ENABLE") == NULL &&
![[dev->mtl_device name] containsString:@"M5"] &&
![[dev->mtl_device name] containsString:@"M6"]) {
GGML_LOG_WARN("%s: tensor API disabled for pre-M5 device\n", __func__);
![[dev->mtl_device name] containsString:@"M6"] &&
![[dev->mtl_device name] containsString:@"A19"] &&
![[dev->mtl_device name] containsString:@"A20"]) {
GGML_LOG_WARN("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__);
dev->props.has_tensor = false;
}

View File

@@ -1036,6 +1036,11 @@ int ggml_metal_op_set_rows(ggml_metal_op_t ctx, int idx) {
nth = std::min(nth, nk0);
if (nth*nrptg > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
nth = ggml_metal_pipeline_max_theads_per_threadgroup(pipeline);
nrptg = 1;
}
ggml_metal_kargs_set_rows args = {
/*.nk0 =*/ nk0,
/*.ne01 =*/ ne01,

View File

@@ -53,6 +53,37 @@
bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor);
// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
// Precompute mp (m' in the paper) and L such that division
// can be computed using a multiply (high 32b of 64b result)
// and a shift:
//
// n/d = (mulhi(n, mp) + n) >> L;
struct fastdiv_vals {
uint32_t mp;
uint32_t L;
uint32_t d;
uint32_t pad;
};
static_assert(sizeof(fastdiv_vals) == 16, "fastdiv_vals size incorrect");
static fastdiv_vals init_fastdiv_values(uint64_t d_64) {
GGML_ASSERT(d_64 != 0);
GGML_ASSERT(d_64 <= std::numeric_limits<uint32_t>::max());
uint32_t d = (uint32_t)d_64;
// compute L = ceil(log2(d));
uint32_t L = 0;
while (L < 32 && (uint32_t{ 1 } << L) < d) {
L++;
}
uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
// pack divisor as well to reduce error surface
return { mp, L, d, 0 };
}
enum GPU_FAMILY {
ADRENO,
INTEL,
@@ -4464,6 +4495,9 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c
GGML_ABORT("not implemented");
}
fastdiv_vals ne11_ = init_fastdiv_values(ne11);
fastdiv_vals ne12_ = init_fastdiv_values(ne12);
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
@@ -4474,8 +4508,8 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(fastdiv_vals), &ne11_));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(fastdiv_vals), &ne12_));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));

View File

@@ -1,5 +1,16 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// v = { mp, L, d }
inline uint fastdiv(uint n, uint4 v) {
uint msbs;
msbs = mul_hi(n, v.s0);
return (msbs + n) >> v.s1;
}
inline uint fastmod(uint n, uint4 v) {
uint q = fastdiv(n, v);
return n - q * v.s2;
}
kernel void kernel_set_rows_f32_i64(
global char * src0,
ulong offset0,
@@ -11,8 +22,8 @@ kernel void kernel_set_rows_f32_i64(
ulong nb01,
ulong nb02,
ulong nb03,
int ne11,
int ne12,
uint4 ne11,
uint4 ne12,
ulong nb10,
ulong nb11,
ulong nb12,
@@ -33,8 +44,10 @@ kernel void kernel_set_rows_f32_i64(
return;
}
int i12 = i03%ne12;
int i11 = i02%ne11;
//int i12 = i03%ne12;
//int i11 = i02%ne11;
int i12 = fastmod(i03, ne12);
int i11 = fastmod(i02, ne11);
int i10 = i01;
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
@@ -58,8 +71,8 @@ kernel void kernel_set_rows_f16_i64(
ulong nb01,
ulong nb02,
ulong nb03,
int ne11,
int ne12,
uint4 ne11,
uint4 ne12,
ulong nb10,
ulong nb11,
ulong nb12,
@@ -80,8 +93,10 @@ kernel void kernel_set_rows_f16_i64(
return;
}
int i12 = i03%ne12;
int i11 = i02%ne11;
//int i12 = i03%ne12;
//int i11 = i02%ne11;
int i12 = fastmod(i03, ne12);
int i11 = fastmod(i02, ne11);
int i10 = i01;
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
@@ -105,8 +120,8 @@ kernel void kernel_set_rows_f32_i32(
ulong nb01,
ulong nb02,
ulong nb03,
int ne11,
int ne12,
uint4 ne11,
uint4 ne12,
ulong nb10,
ulong nb11,
ulong nb12,
@@ -127,8 +142,10 @@ kernel void kernel_set_rows_f32_i32(
return;
}
int i12 = i03%ne12;
int i11 = i02%ne11;
//int i12 = i03%ne12;
//int i11 = i02%ne11;
int i12 = fastmod(i03, ne12);
int i11 = fastmod(i02, ne11);
int i10 = i01;
int i1 = ((global int *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
@@ -152,8 +169,8 @@ kernel void kernel_set_rows_f16_i32(
ulong nb01,
ulong nb02,
ulong nb03,
int ne11,
int ne12,
uint4 ne11,
uint4 ne12,
ulong nb10,
ulong nb11,
ulong nb12,
@@ -174,8 +191,10 @@ kernel void kernel_set_rows_f16_i32(
return;
}
int i12 = i03%ne12;
int i11 = i02%ne11;
//int i12 = i03%ne12;
//int i11 = i02%ne11;
int i12 = fastmod(i03, ne12);
int i11 = fastmod(i02, ne11);
int i10 = i01;
int i1 = ((global int *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];

View File

@@ -6831,7 +6831,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
vk_buffer d_B = d_D;
size_t b_buf_offset = 0;
uint64_t b_sz = 0;
uint64_t b_sz = 1;
if (enable_bias) {
const ggml_tensor * add = cgraph->nodes[node_idx + 1];
@@ -6965,7 +6965,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
vk_buffer d_B = d_D;
size_t b_buf_offset = 0;
uint64_t b_sz = 0;
uint64_t b_sz = 1;
if (enable_bias) {
const ggml_tensor * add = cgraph->nodes[node_idx + 1];
@@ -7101,7 +7101,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
vk_buffer d_B = d_D;
size_t b_buf_offset = 0;
uint64_t b_sz = 0;
uint64_t b_sz = 1;
if (enable_bias) {
const ggml_tensor * add = cgraph->nodes[node_idx + 1];
@@ -7676,7 +7676,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
vk_buffer d_B = d_D;
size_t b_buf_offset = 0;
uint64_t b_sz = 0;
uint64_t b_sz = 1;
if (enable_bias || enable_scale) {
const ggml_tensor * bias = cgraph->nodes[node_idx + 1]->src[1];

View File

@@ -18,6 +18,7 @@
#include <algorithm>
#include <sys/stat.h>
#include <sys/types.h>
#include <filesystem>
#ifdef _WIN32
#define NOMINMAX
@@ -1080,6 +1081,11 @@ int main(int argc, char** argv) {
if (args.find("--glslc") != args.end()) {
GLSLC = args["--glslc"]; // Path to glslc
if (!std::filesystem::exists(GLSLC) || !std::filesystem::is_regular_file(GLSLC)) {
std::cerr << "Error: glslc not found at " << GLSLC << std::endl;
return EXIT_FAILURE;
}
}
if (args.find("--source") != args.end()) {
input_filepath = args["--source"]; // The shader source file to compile

View File

@@ -151,7 +151,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
p1 = std::numeric_limits<llama_pos>::max();
}
// models like Mamba or RWKV can't have a state partially erased
// models like Mamba or RWKV can't have a state partially erased at the end
// of the sequence because their state isn't preserved for previous tokens
if (seq_id >= (int64_t) size) {
// could be fatal
return false;
@@ -160,8 +161,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
int32_t & tail_id = cells[seq_id].tail;
if (tail_id >= 0) {
const auto & cell = cells[tail_id];
// partial intersection is invalid
if ((0 < p0 && p0 < cell.pos) || (0 < p1 && p1 <= cell.pos)) {
// partial intersection is invalid if it includes the final pos
if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
//printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
return false;
}

View File

@@ -1,7 +1,5 @@
#include "models.h"
llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -19,6 +17,8 @@ llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_grap
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
@@ -67,9 +67,8 @@ llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_grap
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);

View File

@@ -11,6 +11,8 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
auto * inp_attn = build_attn_inp_kv_iswa();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
@@ -69,7 +71,6 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}

View File

@@ -354,7 +354,11 @@ int main(int argc, char ** argv) {
}
// remove any "future" tokens that we might have inherited from the previous session
llama_memory_seq_rm(mem, -1, n_matching_session_tokens, -1);
if (!llama_memory_seq_rm(mem, -1, n_matching_session_tokens, -1)) {
LOG_INF("%s: unable to resuse common prefix\n", __func__);
n_matching_session_tokens = 0;
llama_memory_seq_rm(mem, -1, -1, -1);
}
}
LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",