mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-16 05:54:06 +00:00
Compare commits
4 Commits
0cc4m/vulk
...
b7136
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d5bc1ad110 | ||
|
|
0c7220db56 | ||
|
|
96ac5a2329 | ||
|
|
bc809e9c53 |
@@ -1,9 +1,7 @@
|
||||
ARG UBUNTU_VERSION=25.10
|
||||
ARG UBUNTU_VERSION=26.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
# Ref: https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html
|
||||
|
||||
# Install build tools
|
||||
RUN apt update && apt install -y git build-essential cmake wget xz-utils
|
||||
|
||||
|
||||
@@ -212,6 +212,6 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
|
||||
}
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
|
||||
static __device__ void cpy_1_scalar(const char * cxi, char * cdsti) {
|
||||
*(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
|
||||
}
|
||||
|
||||
@@ -12,10 +12,10 @@ const int CUDA_CPY_BLOCK_NM = 8; // block size of 3rd dimension if available
|
||||
const int CUDA_CPY_BLOCK_ROWS = 8; // block dimension for marching through rows
|
||||
|
||||
template <cpy_kernel_t cpy_1>
|
||||
static __global__ void cpy_flt(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13) {
|
||||
static __global__ void cpy_scalar(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13) {
|
||||
const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= ne) {
|
||||
@@ -40,7 +40,7 @@ static __global__ void cpy_flt(const char * cx, char * cdst, const int ne,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int ne,
|
||||
static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13) {
|
||||
@@ -166,7 +166,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
|
||||
}
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
static __global__ void cpy_flt_contiguous(const char * cx, char * cdst, const int64_t ne) {
|
||||
static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const int64_t ne) {
|
||||
const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= ne) {
|
||||
@@ -180,17 +180,17 @@ static __global__ void cpy_flt_contiguous(const char * cx, char * cdst, const in
|
||||
}
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
static void ggml_cpy_flt_contiguous_cuda(
|
||||
static void ggml_cpy_scalar_contiguous_cuda(
|
||||
const char * cx, char * cdst, const int64_t ne,
|
||||
cudaStream_t stream) {
|
||||
|
||||
const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_flt_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
cpy_scalar_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne);
|
||||
}
|
||||
|
||||
template<typename src_t, typename dst_t, bool transposed = false>
|
||||
static void ggml_cpy_flt_cuda(
|
||||
static void ggml_cpy_scalar_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
@@ -212,11 +212,11 @@ static void ggml_cpy_flt_cuda(
|
||||
(ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
|
||||
(ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM);
|
||||
dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
|
||||
cpy_flt_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
|
||||
cpy_scalar_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
|
||||
(cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
} else {
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
cpy_scalar<cpy_1_scalar<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
}
|
||||
@@ -399,94 +399,132 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_flt_cuda<float, float, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<float, float, true>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<float, float>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
ggml_cpy_scalar_contiguous_cuda<float, nv_bfloat16>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<float, nv_bfloat16>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<float, half> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
ggml_cpy_scalar_contiguous_cuda<float, half>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<float, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<float, half>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_q8_0_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_q8_0_f32_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
||||
ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_q4_0_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q4_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
|
||||
nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_q4_0_f32_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
||||
ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_q4_1_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q4_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
|
||||
nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_q4_1_f32_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
|
||||
ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_q5_0_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q5_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
|
||||
nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_q5_0_f32_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
|
||||
ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_iq4_nl_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
|
||||
ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_f32_q5_1_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_q5_1_f32_cuda
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_flt_cuda<half, half, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<half, half, true>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<half, half>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
ggml_cpy_scalar_contiguous_cuda<half, nv_bfloat16>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<half, nv_bfloat16>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<half, float> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
ggml_cpy_scalar_contiguous_cuda<half, float>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<half, float>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16, true>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<nv_bfloat16, nv_bfloat16>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
ggml_cpy_scalar_contiguous_cuda<nv_bfloat16, half>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<nv_bfloat16, half>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
ggml_cpy_scalar_contiguous_cuda<nv_bfloat16, float>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<nv_bfloat16, float>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_scalar_cuda<int32_t, int32_t, true>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
ggml_cpy_scalar_cuda<int32_t, int32_t>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I32) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
ggml_cpy_scalar_contiguous_cuda<float, int32_t>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<float, int32_t>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_F32) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
ggml_cpy_scalar_contiguous_cuda<int32_t, float>
|
||||
(src0_ddc, src1_ddc, ne, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
ggml_cpy_scalar_cuda<int32_t, float>
|
||||
(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
|
||||
@@ -4115,6 +4115,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
if (src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_I32) {
|
||||
return true;
|
||||
}
|
||||
if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -240,6 +240,23 @@ struct ggml_hexagon_session {
|
||||
uint32_t prof_pkts;
|
||||
};
|
||||
|
||||
static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_session * sess, const uint32_t req_flags) {
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char strides[64 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
hex_format_op_dims(dims, op);
|
||||
hex_format_op_strides(strides, op);
|
||||
hex_format_op_types(types, op);
|
||||
hex_format_op_buffs(buffs, op);
|
||||
hex_format_op_names(names, op);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
|
||||
names, dims, types, strides, buffs, req_flags);
|
||||
}
|
||||
|
||||
void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
|
||||
// Bump pending flag (cleared in the session::flush once we get the responce)
|
||||
this->op_pending++; // atomic inc
|
||||
@@ -1912,6 +1929,15 @@ static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_t
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename... _TTensor>
|
||||
static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess, _TTensor... tensors) {
|
||||
return ([&]() -> bool {
|
||||
return !tensors || !tensors->buffer ||
|
||||
(ggml_backend_buffer_is_hexagon(tensors->buffer) &&
|
||||
ggml_backend_hexagon_buffer_get_sess(tensors->buffer) == sess);
|
||||
}() && ...);
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
@@ -1959,16 +1985,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
|
||||
}
|
||||
|
||||
// src0 & src1 & dst must be mapped to the same session
|
||||
if (src0->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src1->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (dst->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
|
||||
if (!hex_supported_buffer(sess, src0, src1, dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2016,20 +2033,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
|
||||
|
||||
// src0 (weights) must be repacked and mapped to the same session
|
||||
// src1 & sr2 & dst must be mapped to the same session
|
||||
if (src0->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src1->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src2->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (dst->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
|
||||
if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2063,16 +2067,7 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
|
||||
}
|
||||
|
||||
// src0, src1 & dst must be mapped to the same session
|
||||
if (src0->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src1->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (dst->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
|
||||
if (!hex_supported_buffer(sess, src0, src1, dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2104,20 +2099,7 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
|
||||
}
|
||||
|
||||
// src0, src1 & dst must be mapped to the same session
|
||||
if (src0->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src1->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src2->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (dst->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
|
||||
if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2144,12 +2126,7 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
|
||||
}
|
||||
|
||||
// src0 & dst must be mapped to the same session
|
||||
if (src0->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (dst->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
|
||||
if (!hex_supported_buffer(sess, src0, dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2186,16 +2163,7 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
|
||||
}
|
||||
|
||||
// src0, src1 & dst must be mapped to the same session
|
||||
if (src0->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src1 && src1->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (dst->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
|
||||
if (!hex_supported_buffer(sess, src0, src1, dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2248,16 +2216,7 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
|
||||
}
|
||||
|
||||
// src0, src1 & dst must be mapped to the same session
|
||||
if (src0->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src1 && src1->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (dst->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
|
||||
if (!hex_supported_buffer(sess, src0, src1, dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2312,20 +2271,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
|
||||
}
|
||||
|
||||
// src0, src1, src2 & dst must be mapped to the same session
|
||||
if (src0->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src1->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (src2 && src2->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
|
||||
return false;
|
||||
}
|
||||
if (dst->buffer &&
|
||||
(!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
|
||||
if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2346,6 +2292,26 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
|
||||
h->nb[3] = t->nb[3];
|
||||
}
|
||||
|
||||
static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t, bool flush_host, bool flush_htp) {
|
||||
if (!t) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
memset(buf, 0, sizeof(*buf));
|
||||
auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
|
||||
buf->fd = tensor_buf->fd;
|
||||
buf->ptr = t->data;
|
||||
buf->offset = (uint8_t *) t->data - tensor_buf->base;
|
||||
buf->size = ggml_nbytes(t);
|
||||
buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU
|
||||
buf->flags |= (flush_htp ? DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0); // Invalidate DSP
|
||||
return 1;
|
||||
}
|
||||
|
||||
static ggml_hexagon_session * get_session_from_tensor(const ggml_tensor * t) {
|
||||
return static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context)->sess;
|
||||
}
|
||||
|
||||
static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) {
|
||||
auto buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
|
||||
auto sess = buf->sess;
|
||||
@@ -2360,10 +2326,6 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
|
||||
const struct ggml_tensor * src1 = op->src[1];
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
|
||||
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
|
||||
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = ggml_time_us();
|
||||
|
||||
@@ -2385,55 +2347,27 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
|
||||
}
|
||||
|
||||
dspqueue_buffer bufs[3];
|
||||
memset(bufs, 0, sizeof(bufs));
|
||||
|
||||
// First buffer Weights.
|
||||
// The content is static, there is no need to do any cache management
|
||||
bufs[0].fd = src0_buf->fd;
|
||||
bufs[0].ptr = src0->data;
|
||||
bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
|
||||
bufs[0].size = ggml_nbytes(src0);
|
||||
bufs[0].flags = 0;
|
||||
dspqueue_buffers_init(bufs, src0, false, false);
|
||||
|
||||
// Second buffer Input Activations. This is a buffer that the CPU
|
||||
// writes and the DSP reads, so we'll need to flush CPU caches and
|
||||
// invalidate DSP ones. On platforms with I/O coherency support the
|
||||
// framework will automatically skip cache operations where possible.
|
||||
bufs[1].fd = src1_buf->fd;
|
||||
bufs[1].ptr = src1->data;
|
||||
bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
|
||||
bufs[1].size = ggml_nbytes(src1);
|
||||
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
dspqueue_buffers_init(&bufs[1], src1, true, true);
|
||||
|
||||
// Third buffer Output Activations. We'll handle DSP
|
||||
// cache maintenance in the response message but need to flush
|
||||
// CPU caches to ensure any previously written dirty lines are
|
||||
// written out before writes from the DSP start.
|
||||
bufs[2].fd = dst_buf->fd;
|
||||
bufs[2].ptr = dst->data;
|
||||
bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
|
||||
bufs[2].size = ggml_nbytes(dst);
|
||||
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
|
||||
dspqueue_buffers_init(&bufs[2], dst, true, false);
|
||||
|
||||
// Primary DSP session from the src0 (normally weight) tensor
|
||||
auto sess = src0_buf->sess;
|
||||
auto * sess = get_session_from_tensor(src0);
|
||||
|
||||
if (opt_verbose) {
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char strides[64 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
hex_format_op_dims(dims, op);
|
||||
hex_format_op_strides(strides, op);
|
||||
hex_format_op_types(types, op);
|
||||
hex_format_op_buffs(buffs, op);
|
||||
hex_format_op_names(names, op);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
|
||||
names, dims, types, strides, buffs, req.flags);
|
||||
hex_print_op_info(op, sess, req.flags);
|
||||
if (opt_verbose > 1) {
|
||||
hex_dump_dspbuf(src0, &bufs[0]);
|
||||
hex_dump_dspbuf(src1, &bufs[1]);
|
||||
@@ -2463,11 +2397,6 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
|
||||
const struct ggml_tensor * src2 = op->src[2];
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
|
||||
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
|
||||
auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
|
||||
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = ggml_time_us();
|
||||
|
||||
@@ -2490,66 +2419,32 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
|
||||
}
|
||||
|
||||
dspqueue_buffer bufs[4];
|
||||
memset(bufs, 0, sizeof(bufs));
|
||||
|
||||
// First buffer Weights.
|
||||
// The content is static, there is no need to do any cache management
|
||||
bufs[0].fd = src0_buf->fd;
|
||||
bufs[0].ptr = src0->data;
|
||||
bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
|
||||
bufs[0].size = ggml_nbytes(src0);
|
||||
bufs[0].flags = 0;
|
||||
dspqueue_buffers_init(bufs, src0, false, false);
|
||||
|
||||
// Second buffer Input Activations. This is a buffer that the CPU
|
||||
// writes and the DSP reads, so we'll need to flush CPU caches and
|
||||
// invalidate DSP ones. On platforms with I/O coherency support the
|
||||
// framework will automatically skip cache operations where possible.
|
||||
bufs[1].fd = src1_buf->fd;
|
||||
bufs[1].ptr = src1->data;
|
||||
bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
|
||||
bufs[1].size = ggml_nbytes(src1);
|
||||
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
dspqueue_buffers_init(&bufs[1], src1, true, true);
|
||||
|
||||
// Third buffer expert IDs. This is a buffer that the CPU
|
||||
// writes and the DSP reads, so we'll need to flush CPU caches and
|
||||
// invalidate DSP ones. On platforms with I/O coherency support the
|
||||
// framework will automatically skip cache operations where possible.
|
||||
bufs[2].fd = src2_buf->fd;
|
||||
bufs[2].ptr = src2->data;
|
||||
bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
|
||||
bufs[2].size = ggml_nbytes(src2);
|
||||
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
dspqueue_buffers_init(&bufs[2], src2, true, true);
|
||||
|
||||
// Forth buffer Output Activations. We'll handle DSP
|
||||
// cache maintenance in the response message but need to flush
|
||||
// CPU caches to ensure any previously written dirty lines are
|
||||
// written out before writes from the DSP start.
|
||||
bufs[3].fd = dst_buf->fd;
|
||||
bufs[3].ptr = dst->data;
|
||||
bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
|
||||
bufs[3].size = ggml_nbytes(dst);
|
||||
bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
|
||||
dspqueue_buffers_init(&bufs[3], dst, true, false);
|
||||
|
||||
// Primary DSP session from the src0 (normally weight) tensor
|
||||
auto sess = src0_buf->sess;
|
||||
auto * sess = get_session_from_tensor(src0);
|
||||
|
||||
if (opt_verbose) {
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char strides[64 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
hex_format_op_dims(dims, op);
|
||||
hex_format_op_types(types, op);
|
||||
hex_format_op_buffs(buffs, op);
|
||||
hex_format_op_names(names, op);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
|
||||
names, dims, types, strides, buffs, req.flags);
|
||||
|
||||
hex_print_op_info(op, sess, req.flags);
|
||||
if (opt_verbose > 1) {
|
||||
hex_dump_dspbuf(src0, &bufs[0]);
|
||||
hex_dump_dspbuf(src1, &bufs[1]);
|
||||
@@ -2581,10 +2476,6 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
|
||||
const struct ggml_tensor * src1 = node->src[1];
|
||||
const struct ggml_tensor * dst = node;
|
||||
|
||||
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
|
||||
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
|
||||
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
|
||||
|
||||
uint64_t t1 = 0;
|
||||
uint64_t t2 = 0;
|
||||
|
||||
@@ -2621,60 +2512,30 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
|
||||
init_htp_tensor(&req.dst, dst);
|
||||
|
||||
dspqueue_buffer bufs[3];
|
||||
memset(bufs, 0, sizeof(bufs));
|
||||
|
||||
// First buffer = First Operand of Binary op
|
||||
// This is a buffer that the CPU writes and the DSP reads, so we'll
|
||||
// need to flush CPU caches and invalidate DSP ones. On platforms
|
||||
// with I/O coherency support the framework will automatically skip
|
||||
// cache operations where possible.
|
||||
bufs[0].fd = src0_buf->fd;
|
||||
bufs[0].ptr = src0->data;
|
||||
bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
|
||||
bufs[0].size = ggml_nbytes(src0);
|
||||
bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
|
||||
dspqueue_buffers_init(bufs, src0, true, true);
|
||||
|
||||
// Second buffer = Second Operand of Binary op
|
||||
// This is a buffer that the CPU writes and the DSP reads, so we'll
|
||||
// need to flush CPU caches and invalidate DSP ones. On platforms
|
||||
// with I/O coherency support the framework will automatically skip
|
||||
// cache operations where possible.
|
||||
bufs[1].fd = src1_buf->fd;
|
||||
bufs[1].ptr = src1->data;
|
||||
bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
|
||||
bufs[1].size = ggml_nbytes(src1);
|
||||
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
dspqueue_buffers_init(&bufs[1], src1, true, true);
|
||||
|
||||
// Third buffer = Output Activations. We'll handle DSP
|
||||
// cache maintenance in the response message but need to flush
|
||||
// CPU caches to ensure any previously written dirty lines are
|
||||
// written out before writes from the DSP start.
|
||||
bufs[2].fd = dst_buf->fd;
|
||||
bufs[2].ptr = dst->data;
|
||||
bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
|
||||
bufs[2].size = ggml_nbytes(dst);
|
||||
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
|
||||
dspqueue_buffers_init(&bufs[2], dst, true, false);
|
||||
|
||||
// Primary DSP session from the src0 tensor
|
||||
ggml_hexagon_session * sess = src0_buf->sess;
|
||||
auto * sess = get_session_from_tensor(src0);
|
||||
|
||||
if (opt_verbose) {
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char strides[16 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
hex_format_op_dims(dims, op);
|
||||
hex_format_op_strides(strides, op);
|
||||
hex_format_op_types(types, op);
|
||||
hex_format_op_buffs(buffs, op);
|
||||
hex_format_op_names(names, op);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
|
||||
ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
|
||||
hex_print_op_info(op, sess, req.flags);
|
||||
if (opt_verbose > 1) {
|
||||
hex_dump_dspbuf(src0, &bufs[0]);
|
||||
hex_dump_dspbuf(src1, &bufs[1]);
|
||||
@@ -2705,11 +2566,6 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
|
||||
const struct ggml_tensor * src2 = node->src[2];
|
||||
const struct ggml_tensor * dst = node;
|
||||
|
||||
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
|
||||
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
|
||||
auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
|
||||
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
|
||||
|
||||
uint64_t t1 = 0;
|
||||
uint64_t t2 = 0;
|
||||
|
||||
@@ -2741,58 +2597,19 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
|
||||
init_htp_tensor(&req.dst, dst);
|
||||
|
||||
dspqueue_buffer bufs[4];
|
||||
memset(bufs, 0, sizeof(bufs));
|
||||
|
||||
// First buffer = input activations
|
||||
bufs[0].fd = src0_buf->fd;
|
||||
bufs[0].ptr = src0->data;
|
||||
bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
|
||||
bufs[0].size = ggml_nbytes(src0);
|
||||
bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
|
||||
|
||||
dspqueue_buffers_init(bufs, src0, true, true);
|
||||
// Second buffer = experts bias
|
||||
bufs[1].fd = src1_buf->fd;
|
||||
bufs[1].ptr = src1->data;
|
||||
bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
|
||||
bufs[1].size = ggml_nbytes(src1);
|
||||
bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
|
||||
dspqueue_buffers_init(&bufs[1], src1, true, true);
|
||||
// Third buffer = activated experts
|
||||
bufs[2].fd = src2_buf->fd;
|
||||
bufs[2].ptr = src2->data;
|
||||
bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
|
||||
bufs[2].size = ggml_nbytes(src2);
|
||||
bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
|
||||
dspqueue_buffers_init(&bufs[2], src2, true, true);
|
||||
// Forth buffer = output activations
|
||||
bufs[3].fd = dst_buf->fd;
|
||||
bufs[3].ptr = dst->data;
|
||||
bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
|
||||
bufs[3].size = ggml_nbytes(dst);
|
||||
bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
|
||||
dspqueue_buffers_init(&bufs[3], dst, true, true);
|
||||
|
||||
// Primary DSP session from the src0 tensor
|
||||
ggml_hexagon_session * sess = src0_buf->sess;
|
||||
auto * sess = get_session_from_tensor(src0);
|
||||
|
||||
if (opt_verbose) {
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char strides[16 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
hex_format_op_dims(dims, op);
|
||||
hex_format_op_strides(strides, op);
|
||||
hex_format_op_types(types, op);
|
||||
hex_format_op_buffs(buffs, op);
|
||||
hex_format_op_names(names, op);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
|
||||
ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
|
||||
|
||||
hex_print_op_info(op, sess, req.flags);
|
||||
if (opt_verbose > 1) {
|
||||
hex_dump_dspbuf(src0, &bufs[0]);
|
||||
hex_dump_dspbuf(src1, &bufs[1]);
|
||||
@@ -2886,71 +2703,33 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
|
||||
}
|
||||
|
||||
dspqueue_buffer bufs[3];
|
||||
int n_bufs = 0;
|
||||
|
||||
memset(bufs, 0, sizeof(bufs));
|
||||
|
||||
// First buffer = Only Operand of Unary op
|
||||
// This is a buffer that the CPU writes and the DSP reads, so we'll
|
||||
// need to flush CPU caches and invalidate DSP ones. On platforms
|
||||
// with I/O coherency support the framework will automatically skip
|
||||
// cache operations where possible.
|
||||
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
|
||||
bufs[n_bufs].fd = src0_buf->fd;
|
||||
bufs[n_bufs].ptr = src0->data;
|
||||
bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
|
||||
bufs[n_bufs].size = ggml_nbytes(src0);
|
||||
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
|
||||
++n_bufs;
|
||||
size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
|
||||
|
||||
if (src1) {
|
||||
// Second buffer = Second Operand of Binary op
|
||||
// This is a buffer that the CPU writes and the DSP reads, so we'll
|
||||
// need to flush CPU caches and invalidate DSP ones. On platforms
|
||||
// with I/O coherency support the framework will automatically skip
|
||||
// cache operations where possible.
|
||||
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
|
||||
bufs[n_bufs].fd = src1_buf->fd;
|
||||
bufs[n_bufs].ptr = src1->data;
|
||||
bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
|
||||
bufs[n_bufs].size = ggml_nbytes(src1);
|
||||
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
++n_bufs;
|
||||
}
|
||||
// Second buffer(nullable) = Second Operand of Binary op
|
||||
// This is a buffer that the CPU writes and the DSP reads, so we'll
|
||||
// need to flush CPU caches and invalidate DSP ones. On platforms
|
||||
// with I/O coherency support the framework will automatically skip
|
||||
// cache operations where possible.
|
||||
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
|
||||
|
||||
// Second or third buffer = Output Activations. We'll handle DSP
|
||||
// Second buffer = Output Activations. We'll handle DSP
|
||||
// cache maintenance in the response message but need to flush
|
||||
// CPU caches to ensure any previously written dirty lines are
|
||||
// written out before writes from the DSP start.
|
||||
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
|
||||
bufs[n_bufs].fd = dst_buf->fd;
|
||||
bufs[n_bufs].ptr = dst->data;
|
||||
bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
|
||||
bufs[n_bufs].size = ggml_nbytes(dst);
|
||||
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
|
||||
++n_bufs;
|
||||
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
|
||||
|
||||
// Primary DSP session from the src0 tensor
|
||||
ggml_hexagon_session * sess = src0_buf->sess;
|
||||
auto * sess = get_session_from_tensor(src0);
|
||||
|
||||
if (opt_verbose) {
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char strides[64 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
hex_format_op_dims(dims, op);
|
||||
hex_format_op_strides(strides, op);
|
||||
hex_format_op_types(types, op);
|
||||
hex_format_op_buffs(buffs, op);
|
||||
hex_format_op_names(names, op);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
|
||||
names, dims, types, strides, buffs, req.flags);
|
||||
hex_print_op_info(op, sess, req.flags);
|
||||
if (opt_verbose > 1) {
|
||||
hex_dump_dspbuf(src0, &bufs[0]);
|
||||
if (src1) {
|
||||
@@ -3023,85 +2802,40 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
|
||||
}
|
||||
|
||||
dspqueue_buffer bufs[4];
|
||||
int n_bufs = 0;
|
||||
|
||||
memset(bufs, 0, sizeof(bufs));
|
||||
|
||||
// First buffer
|
||||
// This is a buffer that the CPU writes and the DSP reads, so we'll
|
||||
// need to flush CPU caches and invalidate DSP ones. On platforms
|
||||
// with I/O coherency support the framework will automatically skip
|
||||
// cache operations where possible.
|
||||
auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
|
||||
bufs[n_bufs].fd = src0_buf->fd;
|
||||
bufs[n_bufs].ptr = src0->data;
|
||||
bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
|
||||
bufs[n_bufs].size = ggml_nbytes(src0);
|
||||
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
|
||||
++n_bufs;
|
||||
size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
|
||||
|
||||
// Second buffer
|
||||
// This is a buffer that the CPU writes and the DSP reads, so we'll
|
||||
// need to flush CPU caches and invalidate DSP ones. On platforms
|
||||
// with I/O coherency support the framework will automatically skip
|
||||
// cache operations where possible.
|
||||
auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
|
||||
bufs[n_bufs].fd = src1_buf->fd;
|
||||
bufs[n_bufs].ptr = src1->data;
|
||||
bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
|
||||
bufs[n_bufs].size = ggml_nbytes(src1);
|
||||
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
++n_bufs;
|
||||
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
|
||||
|
||||
if (src2) {
|
||||
// Third buffer
|
||||
// This is a buffer that the CPU writes and the DSP reads, so we'll
|
||||
// need to flush CPU caches and invalidate DSP ones. On platforms
|
||||
// with I/O coherency support the framework will automatically skip
|
||||
// cache operations where possible.
|
||||
auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
|
||||
bufs[n_bufs].fd = src2_buf->fd;
|
||||
bufs[n_bufs].ptr = src2->data;
|
||||
bufs[n_bufs].offset = (uint8_t *) src2->data - src2_buf->base;
|
||||
bufs[n_bufs].size = ggml_nbytes(src2);
|
||||
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
|
||||
++n_bufs;
|
||||
}
|
||||
// Third buffer(nullable)
|
||||
// This is a buffer that the CPU writes and the DSP reads, so we'll
|
||||
// need to flush CPU caches and invalidate DSP ones. On platforms
|
||||
// with I/O coherency support the framework will automatically skip
|
||||
// cache operations where possible.
|
||||
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src2, true, true);
|
||||
|
||||
// Final buffer = Output Activations. We'll handle DSP
|
||||
// Second buffer = Output Activations. We'll handle DSP
|
||||
// cache maintenance in the response message but need to flush
|
||||
// CPU caches to ensure any previously written dirty lines are
|
||||
// written out before writes from the DSP start.
|
||||
auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
|
||||
bufs[n_bufs].fd = dst_buf->fd;
|
||||
bufs[n_bufs].ptr = dst->data;
|
||||
bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
|
||||
bufs[n_bufs].size = ggml_nbytes(dst);
|
||||
bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
|
||||
++n_bufs;
|
||||
n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
|
||||
|
||||
// Primary DSP session from the src0 tensor
|
||||
ggml_hexagon_session * sess = src0_buf->sess;
|
||||
auto * sess = get_session_from_tensor(src0);
|
||||
|
||||
if (opt_verbose) {
|
||||
char dims[64 * GGML_MAX_SRC];
|
||||
char strides[64 * GGML_MAX_SRC];
|
||||
char types[16 * GGML_MAX_SRC];
|
||||
char buffs[64 * GGML_MAX_SRC];
|
||||
char names[64 * GGML_MAX_SRC];
|
||||
|
||||
hex_format_op_dims(dims, op);
|
||||
hex_format_op_strides(strides, op);
|
||||
hex_format_op_types(types, op);
|
||||
hex_format_op_buffs(buffs, op);
|
||||
hex_format_op_names(names, op);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
|
||||
names, dims, types, strides, buffs, req.flags);
|
||||
hex_print_op_info(op, sess, req.flags);
|
||||
if (opt_verbose > 1) {
|
||||
hex_dump_dspbuf(src0, &bufs[0]);
|
||||
if (src1) {
|
||||
|
||||
@@ -16,13 +16,8 @@
|
||||
#include "hvx-utils.h"
|
||||
#include "ops-utils.h"
|
||||
|
||||
static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) {
|
||||
static const float kInf = INFINITY;
|
||||
static const float kMaxExp = 88.02f; // log(INF)
|
||||
|
||||
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
|
||||
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
|
||||
const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
|
||||
static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) {
|
||||
const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
|
||||
|
||||
HVX_Vector out = hvx_vec_exp_fp32(in_vec);
|
||||
|
||||
@@ -47,6 +42,12 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
|
||||
|
||||
HVX_Vector vec_out = Q6_V_vzero();
|
||||
|
||||
static const float kInf = INFINITY;
|
||||
static const float kMaxExp = 88.02f; // log(INF)
|
||||
|
||||
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
|
||||
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
|
||||
|
||||
if (0 == unaligned_loop) {
|
||||
HVX_Vector * p_vec_in1 = (HVX_Vector *) src;
|
||||
HVX_Vector * p_vec_out = (HVX_Vector *) dst;
|
||||
@@ -55,9 +56,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
|
||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||
if (true == negate) {
|
||||
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
|
||||
*p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in);
|
||||
*p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
|
||||
} else {
|
||||
*p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++);
|
||||
*p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++, max_exp, inf);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -67,9 +68,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
|
||||
|
||||
if (true == negate) {
|
||||
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
|
||||
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in);
|
||||
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
|
||||
} else {
|
||||
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in);
|
||||
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in, max_exp, inf);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -83,9 +84,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
|
||||
if (true == negate) {
|
||||
HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
|
||||
|
||||
vec_out = hvx_vec_exp_fp32_guard(neg_vec_in);
|
||||
vec_out = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
|
||||
} else {
|
||||
vec_out = hvx_vec_exp_fp32_guard(in);
|
||||
vec_out = hvx_vec_exp_fp32_guard(in, max_exp, inf);
|
||||
}
|
||||
|
||||
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);
|
||||
|
||||
@@ -16,6 +16,15 @@
|
||||
#include "hvx-utils.h"
|
||||
#include "ops-utils.h"
|
||||
|
||||
static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) {
|
||||
HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
|
||||
|
||||
HVX_Vector masked_out = Q6_V_vand_VV(out, nan_inf_mask);
|
||||
const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out);
|
||||
|
||||
return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out);
|
||||
}
|
||||
|
||||
void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
|
||||
int left_over = num_elems & (VLEN_FP32 - 1);
|
||||
int num_elems_whole = num_elems - left_over;
|
||||
@@ -32,19 +41,22 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
|
||||
FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n");
|
||||
}
|
||||
|
||||
static const uint32_t kNanInfMask = 0x7f800000;
|
||||
const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(kNanInfMask);
|
||||
|
||||
if (0 == unaligned_loop) {
|
||||
HVX_Vector * p_vec_in = (HVX_Vector *) src;
|
||||
HVX_Vector * p_vec_out = (HVX_Vector *) dst;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||
*p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++);
|
||||
*p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++, nan_inf_mask);
|
||||
}
|
||||
} else {
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
|
||||
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
|
||||
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in);
|
||||
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -53,7 +65,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
|
||||
float * dstf = (float *) dst + num_elems_whole;
|
||||
|
||||
HVX_Vector in = *(HVX_UVector *) srcf;
|
||||
HVX_Vector out = hvx_vec_inverse_fp32_guard(in);
|
||||
HVX_Vector out = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
|
||||
|
||||
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
|
||||
}
|
||||
|
||||
@@ -726,24 +726,6 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
|
||||
return Q6_Vsf_equals_Vqf32(r_qf);
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) {
|
||||
static const float kInf = INFINITY;
|
||||
static const uint32_t kNanMask = 0x7fffffff;
|
||||
static const uint32_t kNanMin = 0x7f800000;
|
||||
|
||||
const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
|
||||
const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf);
|
||||
|
||||
HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
|
||||
|
||||
const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask);
|
||||
const HVX_Vector nan_min = Q6_V_vsplat_R(kNanMin);
|
||||
HVX_Vector masked_out = Q6_V_vand_VV(out, nan_mask);
|
||||
const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out);
|
||||
|
||||
return Q6_V_vmux_QVV(pred, out, Q6_V_vzero());
|
||||
}
|
||||
|
||||
#define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022
|
||||
#define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777
|
||||
#define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267
|
||||
@@ -958,14 +940,16 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
|
||||
return Q6_Vsf_equals_Vqf32(temp);
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) {
|
||||
static const float kMaxExp = -88.02f; // log(INF)
|
||||
|
||||
const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp));
|
||||
const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp);
|
||||
static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v,
|
||||
HVX_Vector one,
|
||||
HVX_Vector max_exp,
|
||||
HVX_Vector min_exp) {
|
||||
const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v);
|
||||
const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp);
|
||||
|
||||
HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
|
||||
return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero());
|
||||
out = Q6_V_vmux_QVV(pred_max, out, one);
|
||||
return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero());
|
||||
}
|
||||
|
||||
static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
|
||||
@@ -977,9 +961,16 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
|
||||
const HVX_Vector * restrict v_src = (HVX_Vector *) src;
|
||||
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
|
||||
|
||||
static const float kMinExp = -87.f; // 0
|
||||
static const float kMaxExp = 87.f; // 1
|
||||
|
||||
const HVX_Vector one = hvx_vec_splat_fp32(1.f);
|
||||
const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
|
||||
const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < step_of_1; i++) {
|
||||
v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]);
|
||||
v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -6953,9 +6953,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
|
||||
|
||||
for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16 }) {
|
||||
for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_BF16 }) {
|
||||
for (bool use_view_slice : { true, false }) {
|
||||
for (std::array<int64_t, 4> ne : std::initializer_list<std::array<int64_t, 4>>{ {2, 1, 1, 1}, {2, 1, 3, 5},
|
||||
{2, 3, 5, 7}, {1, 4, 4, 1}, {1, 8, 17, 1}, {10, 10, 10, 1} }) {
|
||||
|
||||
Binary file not shown.
@@ -29,6 +29,7 @@
|
||||
sendMessage,
|
||||
stopGeneration
|
||||
} from '$lib/stores/chat.svelte';
|
||||
import { config } from '$lib/stores/settings.svelte';
|
||||
import {
|
||||
supportsVision,
|
||||
supportsAudio,
|
||||
@@ -47,6 +48,7 @@
|
||||
|
||||
let { showCenteredEmpty = false } = $props();
|
||||
|
||||
let disableAutoScroll = $derived(Boolean(config().disableAutoScroll));
|
||||
let autoScrollEnabled = $state(true);
|
||||
let chatScrollContainer: HTMLDivElement | undefined = $state();
|
||||
let dragCounter = $state(0);
|
||||
@@ -149,7 +151,7 @@
|
||||
}
|
||||
|
||||
function handleScroll() {
|
||||
if (!chatScrollContainer) return;
|
||||
if (disableAutoScroll || !chatScrollContainer) return;
|
||||
|
||||
const { scrollTop, scrollHeight, clientHeight } = chatScrollContainer;
|
||||
const distanceFromBottom = scrollHeight - scrollTop - clientHeight;
|
||||
@@ -194,8 +196,10 @@
|
||||
const extras = result?.extras;
|
||||
|
||||
// Enable autoscroll for user-initiated message sending
|
||||
userScrolledUp = false;
|
||||
autoScrollEnabled = true;
|
||||
if (!disableAutoScroll) {
|
||||
userScrolledUp = false;
|
||||
autoScrollEnabled = true;
|
||||
}
|
||||
await sendMessage(message, extras);
|
||||
scrollChatToBottom();
|
||||
|
||||
@@ -241,6 +245,8 @@
|
||||
}
|
||||
|
||||
function scrollChatToBottom(behavior: ScrollBehavior = 'smooth') {
|
||||
if (disableAutoScroll) return;
|
||||
|
||||
chatScrollContainer?.scrollTo({
|
||||
top: chatScrollContainer?.scrollHeight,
|
||||
behavior
|
||||
@@ -248,14 +254,27 @@
|
||||
}
|
||||
|
||||
afterNavigate(() => {
|
||||
setTimeout(() => scrollChatToBottom('instant'), INITIAL_SCROLL_DELAY);
|
||||
if (!disableAutoScroll) {
|
||||
setTimeout(() => scrollChatToBottom('instant'), INITIAL_SCROLL_DELAY);
|
||||
}
|
||||
});
|
||||
|
||||
onMount(() => {
|
||||
setTimeout(() => scrollChatToBottom('instant'), INITIAL_SCROLL_DELAY);
|
||||
if (!disableAutoScroll) {
|
||||
setTimeout(() => scrollChatToBottom('instant'), INITIAL_SCROLL_DELAY);
|
||||
}
|
||||
});
|
||||
|
||||
$effect(() => {
|
||||
if (disableAutoScroll) {
|
||||
autoScrollEnabled = false;
|
||||
if (scrollInterval) {
|
||||
clearInterval(scrollInterval);
|
||||
scrollInterval = undefined;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (isCurrentConversationLoading && autoScrollEnabled) {
|
||||
scrollInterval = setInterval(scrollChatToBottom, AUTO_SCROLL_INTERVAL);
|
||||
} else if (scrollInterval) {
|
||||
@@ -289,9 +308,11 @@
|
||||
class="mb-16 md:mb-24"
|
||||
messages={activeMessages()}
|
||||
onUserAction={() => {
|
||||
userScrolledUp = false;
|
||||
autoScrollEnabled = true;
|
||||
scrollChatToBottom();
|
||||
if (!disableAutoScroll) {
|
||||
userScrolledUp = false;
|
||||
autoScrollEnabled = true;
|
||||
scrollChatToBottom();
|
||||
}
|
||||
}}
|
||||
/>
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
Settings,
|
||||
Funnel,
|
||||
AlertTriangle,
|
||||
Brain,
|
||||
Code,
|
||||
Monitor,
|
||||
Sun,
|
||||
@@ -58,6 +57,33 @@
|
||||
label: 'Paste long text to file length',
|
||||
type: 'input'
|
||||
},
|
||||
{
|
||||
key: 'enableContinueGeneration',
|
||||
label: 'Enable "Continue" button',
|
||||
type: 'checkbox',
|
||||
isExperimental: true
|
||||
},
|
||||
{
|
||||
key: 'pdfAsImage',
|
||||
label: 'Parse PDF as image',
|
||||
type: 'checkbox'
|
||||
},
|
||||
{
|
||||
key: 'askForTitleConfirmation',
|
||||
label: 'Ask for confirmation before changing conversation title',
|
||||
type: 'checkbox'
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
title: 'Display',
|
||||
icon: Monitor,
|
||||
fields: [
|
||||
{
|
||||
key: 'showThoughtInProgress',
|
||||
label: 'Show thought in progress',
|
||||
type: 'checkbox'
|
||||
},
|
||||
{
|
||||
key: 'showMessageStats',
|
||||
label: 'Show message generation statistics',
|
||||
@@ -79,25 +105,14 @@
|
||||
type: 'checkbox'
|
||||
},
|
||||
{
|
||||
key: 'enableContinueGeneration',
|
||||
label: 'Enable "Continue" button',
|
||||
type: 'checkbox',
|
||||
isExperimental: true
|
||||
},
|
||||
{
|
||||
key: 'pdfAsImage',
|
||||
label: 'Parse PDF as image',
|
||||
key: 'disableAutoScroll',
|
||||
label: 'Disable automatic scroll',
|
||||
type: 'checkbox'
|
||||
},
|
||||
{
|
||||
key: 'renderUserContentAsMarkdown',
|
||||
label: 'Render user content as Markdown',
|
||||
type: 'checkbox'
|
||||
},
|
||||
{
|
||||
key: 'askForTitleConfirmation',
|
||||
label: 'Ask for confirmation before changing conversation title',
|
||||
type: 'checkbox'
|
||||
}
|
||||
]
|
||||
},
|
||||
@@ -208,17 +223,6 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
title: 'Reasoning',
|
||||
icon: Brain,
|
||||
fields: [
|
||||
{
|
||||
key: 'showThoughtInProgress',
|
||||
label: 'Show thought in progress',
|
||||
type: 'checkbox'
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
title: 'Import/Export',
|
||||
icon: Database,
|
||||
|
||||
@@ -14,6 +14,7 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
|
||||
pasteLongTextToFileLen: 2500,
|
||||
pdfAsImage: false,
|
||||
showModelInfo: false,
|
||||
disableAutoScroll: false,
|
||||
renderUserContentAsMarkdown: false,
|
||||
modelSelectorEnabled: false,
|
||||
// make sure these default values are in sync with `common.h`
|
||||
@@ -93,6 +94,8 @@ export const SETTING_CONFIG_INFO: Record<string, string> = {
|
||||
'Ask for confirmation before automatically changing conversation title when editing the first message.',
|
||||
pdfAsImage: 'Parse PDF as image instead of text (requires vision-capable model).',
|
||||
showModelInfo: 'Display the model name used to generate each message below the message content.',
|
||||
disableAutoScroll:
|
||||
'Disable automatic scrolling while messages stream so you can control the viewport position manually.',
|
||||
renderUserContentAsMarkdown: 'Render user messages using markdown formatting in the chat.',
|
||||
modelSelectorEnabled:
|
||||
'Enable the model selector in the chat input to choose the inference model. Sends the associated model field in API requests.',
|
||||
|
||||
Reference in New Issue
Block a user