mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-07 09:34:07 +00:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a6f32f0b34 | ||
|
|
2bb3597e42 | ||
|
|
8293970542 | ||
|
|
8bbf26083d | ||
|
|
35782aeedb | ||
|
|
c80a7759da | ||
|
|
250d7953e8 | ||
|
|
403fbacbbc |
@@ -240,7 +240,7 @@ public:
|
||||
auto index = key.get<int>();
|
||||
return array_->at(index < 0 ? array_->size() + index : index);
|
||||
} else if (object_) {
|
||||
if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
|
||||
if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
|
||||
auto it = object_->find(key.primitive_);
|
||||
if (it == object_->end()) return Value();
|
||||
return it->second;
|
||||
@@ -249,7 +249,7 @@ public:
|
||||
}
|
||||
void set(const Value& key, const Value& value) {
|
||||
if (!object_) throw std::runtime_error("Value is not an object: " + dump());
|
||||
if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
|
||||
if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
|
||||
(*object_)[key.primitive_] = value;
|
||||
}
|
||||
Value call(const std::shared_ptr<Context> & context, ArgumentsValue & args) const {
|
||||
|
||||
@@ -3557,8 +3557,8 @@ class RWKV6Qwen2Model(Rwkv6Model):
|
||||
head_size = hidden_size // num_attention_heads
|
||||
rms_norm_eps = self.hparams["rms_norm_eps"]
|
||||
intermediate_size = self.hparams["intermediate_size"]
|
||||
time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
|
||||
time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
|
||||
time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32)
|
||||
time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64)
|
||||
|
||||
# RWKV isn't context limited
|
||||
self.gguf_writer.add_context_length(1048576)
|
||||
@@ -5146,7 +5146,7 @@ class BailingMoeModel(Model):
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
if "head_dim" in hparams:
|
||||
if hparams.get("head_dim"):
|
||||
rope_dim = hparams["head_dim"]
|
||||
else:
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
|
||||
|
||||
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
|
||||
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
|
||||
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
|
||||
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
|
||||
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
|
||||
|
||||
@@ -227,16 +227,6 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
|
||||
|
||||
**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
|
||||
|
||||
|
||||
**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
|
||||
|
||||
```sh
|
||||
git clone https://github.com/oneapi-src/oneMKL
|
||||
cd oneMKL
|
||||
cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
|
||||
cmake --build buildWithCublas --config Release
|
||||
```
|
||||
|
||||
**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:
|
||||
|
||||
```sh
|
||||
@@ -250,16 +240,6 @@ cmake --build build-nvidia --config Release
|
||||
|
||||
**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
|
||||
|
||||
**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs.
|
||||
|
||||
```sh
|
||||
git clone https://github.com/oneapi-src/oneMKL
|
||||
cd oneMKL
|
||||
# Find your HIPTARGET with rocminfo, under the key 'Name:'
|
||||
cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas
|
||||
cmake --build buildWithrocBLAS --config Release
|
||||
```
|
||||
|
||||
3. **Verify installation and environment**
|
||||
|
||||
In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
|
||||
@@ -324,13 +304,10 @@ cmake --build build --config Release -j -v
|
||||
|
||||
#### Nvidia GPU
|
||||
|
||||
```sh
|
||||
# Export relevant ENV variables
|
||||
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
|
||||
export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
|
||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
|
||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
|
||||
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
|
||||
By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
|
||||
|
||||
```sh
|
||||
# Build LLAMA with Nvidia BLAS acceleration through SYCL
|
||||
# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
|
||||
GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
|
||||
@@ -347,12 +324,10 @@ cmake --build build --config Release -j -v
|
||||
|
||||
#### AMD GPU
|
||||
|
||||
```sh
|
||||
# Export relevant ENV variables
|
||||
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
|
||||
export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
|
||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
|
||||
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
|
||||
By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
|
||||
|
||||
```sh
|
||||
# Build LLAMA with rocBLAS acceleration through SYCL
|
||||
|
||||
## AMD
|
||||
|
||||
@@ -31,6 +31,8 @@
|
||||
#include "ggml-cuda/rope.cuh"
|
||||
#include "ggml-cuda/scale.cuh"
|
||||
#include "ggml-cuda/softmax.cuh"
|
||||
#include "ggml-cuda/ssm-conv.cuh"
|
||||
#include "ggml-cuda/ssm-scan.cuh"
|
||||
#include "ggml-cuda/sum.cuh"
|
||||
#include "ggml-cuda/sumrows.cuh"
|
||||
#include "ggml-cuda/tsembd.cuh"
|
||||
@@ -2296,6 +2298,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_SUM_ROWS:
|
||||
ggml_cuda_op_sum_rows(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SSM_CONV:
|
||||
ggml_cuda_op_ssm_conv(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SSM_SCAN:
|
||||
ggml_cuda_op_ssm_scan(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_ARGSORT:
|
||||
ggml_cuda_op_argsort(ctx, dst);
|
||||
break;
|
||||
@@ -3193,6 +3201,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_LOG:
|
||||
case GGML_OP_SSM_SCAN:
|
||||
case GGML_OP_SSM_CONV:
|
||||
return true;
|
||||
case GGML_OP_CONT:
|
||||
return op->src[0]->type != GGML_TYPE_BF16;
|
||||
|
||||
151
ggml/src/ggml-cuda/ssm-conv.cu
Normal file
151
ggml/src/ggml-cuda/ssm-conv.cu
Normal file
@@ -0,0 +1,151 @@
|
||||
#include "ssm-conv.cuh"
|
||||
|
||||
template <size_t split_d_inner, size_t d_conv>
|
||||
static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float * __restrict__ src1,
|
||||
const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1,
|
||||
float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2,
|
||||
const int nc, const int ncs, const int nr, const int n_t, const int n_s) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bidx = blockIdx.x;
|
||||
const int bidy = blockIdx.y;
|
||||
|
||||
const float * x_block = (const float *) ((char *) src0 + bidx * src0_nb2 + bidy * split_d_inner * src0_nb1);
|
||||
const float * w_block = (const float *) ((char *) src1 + bidy * split_d_inner * src1_nb1);
|
||||
float * y_block = (float *) ((char *) dst + bidx * dst_nb2 + bidy * split_d_inner * dst_nb0);
|
||||
|
||||
const int stride_x = src0_nb1 / sizeof(float);
|
||||
const int stride_w = src1_nb1 / sizeof(float);
|
||||
const int stride_y = dst_nb1 / sizeof(float);
|
||||
|
||||
float x[d_conv] = { 0.0f };
|
||||
float w[d_conv] = { 0.0f };
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < d_conv; j++) {
|
||||
w[j] = w_block[tid * stride_w + j];
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_t; i++) {
|
||||
float sumf = 0.0f;
|
||||
|
||||
if (i == 0) {
|
||||
for (int j = 0; j < d_conv; j++) {
|
||||
x[j] = x_block[tid * stride_x + j];
|
||||
}
|
||||
} else {
|
||||
x[(i - 1) % d_conv] = x_block[tid * stride_x + i + d_conv - 1];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < d_conv; j++) {
|
||||
sumf += x[(i + j) % d_conv] * w[j];
|
||||
}
|
||||
y_block[i * stride_y + tid] = sumf;
|
||||
}
|
||||
}
|
||||
|
||||
template <size_t split_d_inner, size_t d_conv, size_t split_n_t>
|
||||
static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0, const float * __restrict__ src1,
|
||||
const int src0_nb0, const int src0_nb1, const int src0_nb2,
|
||||
const int src1_nb1, float * __restrict__ dst, const int dst_nb0,
|
||||
const int dst_nb1, const int dst_nb2, const int nc, const int ncs,
|
||||
const int nr, const int n_t, const int n_s) {
|
||||
const int tid = threadIdx.x;
|
||||
const int bidx = blockIdx.x;
|
||||
const int bidy = blockIdx.y;
|
||||
const int bidz = blockIdx.z;
|
||||
|
||||
const float * x_block = (const float *) ((char *) src0 + bidx * src0_nb2 + bidy * split_d_inner * src0_nb1 +
|
||||
bidz * split_n_t * src0_nb0);
|
||||
const float * w_block = (const float *) ((char *) src1 + bidy * split_d_inner * src1_nb1);
|
||||
float * y_block =
|
||||
(float *) ((char *) dst + bidx * dst_nb2 + bidz * split_n_t * dst_nb1 + bidy * split_d_inner * dst_nb0);
|
||||
|
||||
const int stride_x = src0_nb1 / sizeof(float);
|
||||
const int stride_w = src1_nb1 / sizeof(float);
|
||||
const int stride_y = dst_nb1 / sizeof(float);
|
||||
|
||||
float x[d_conv] = { 0.0f };
|
||||
float w[d_conv] = { 0.0f };
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < d_conv; j++) {
|
||||
w[j] = w_block[tid * stride_w + j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < split_n_t; i++) {
|
||||
if (bidz * split_n_t + i < n_t) {
|
||||
float sumf = 0.0f;
|
||||
|
||||
if (i == 0) {
|
||||
for (int j = 0; j < d_conv; j++) {
|
||||
x[j] = x_block[tid * stride_x + j];
|
||||
}
|
||||
} else {
|
||||
x[(i - 1) % d_conv] = x_block[tid * stride_x + i + d_conv - 1];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < d_conv; j++) {
|
||||
sumf += x[(i + j) % d_conv] * w[j];
|
||||
}
|
||||
y_block[i * stride_y + tid] = sumf;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int src0_nb0, const int src0_nb1,
|
||||
const int src0_nb2, const int src1_nb1, float * dst, const int dst_nb0, const int dst_nb1,
|
||||
const int dst_nb2, const int nc, const int ncs, const int nr, const int n_t,
|
||||
const int n_s, cudaStream_t stream) {
|
||||
const int threads = 128;
|
||||
GGML_ASSERT(nr % threads == 0);
|
||||
|
||||
if (n_t <= 32) {
|
||||
const dim3 blocks(n_s, (nr + threads - 1) / threads, 1);
|
||||
if (nc == 4) {
|
||||
ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
|
||||
dst, dst_nb0, dst_nb1, dst_nb2, nc, ncs, nr, n_t,
|
||||
n_s);
|
||||
} else {
|
||||
GGML_ABORT("Only support kernel size = 4 now.");
|
||||
}
|
||||
} else {
|
||||
if (nc == 4) {
|
||||
const int split_n_t = 32;
|
||||
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
|
||||
ssm_conv_long_token_f32<threads, 4, split_n_t>
|
||||
<<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0,
|
||||
dst_nb1, dst_nb2, nc, ncs, nr, n_t, n_s);
|
||||
} else {
|
||||
GGML_ABORT("Only support kernel size = 4 right now.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const struct ggml_tensor * src0 = dst->src[0]; // conv_x
|
||||
const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight
|
||||
|
||||
const int nc = src1->ne[0]; // d_conv
|
||||
const int ncs = src0->ne[0]; // d_conv - 1 + n_t
|
||||
const int nr = src0->ne[1]; // d_inner
|
||||
const int n_t = dst->ne[1]; // tokens per sequence
|
||||
const int n_s = dst->ne[2]; // number of sequences in the batch
|
||||
|
||||
GGML_ASSERT(dst->ne[0] == nr);
|
||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src1->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src0->nb[1] == src0->ne[0] * sizeof(float));
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
ssm_conv_f32_cuda(src0_d, src1_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, dst->nb[0], dst->nb[1],
|
||||
dst->nb[2], nc, ncs, nr, n_t, n_s, stream);
|
||||
}
|
||||
3
ggml/src/ggml-cuda/ssm-conv.cuh
Normal file
3
ggml/src/ggml-cuda/ssm-conv.cuh
Normal file
@@ -0,0 +1,3 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
155
ggml/src/ggml-cuda/ssm-scan.cu
Normal file
155
ggml/src/ggml-cuda/ssm-scan.cu
Normal file
@@ -0,0 +1,155 @@
|
||||
#include "ssm-scan.cuh"
|
||||
|
||||
// #include <cuda_runtime.h>
|
||||
// static __device__ void global_to_shared(const float *src, float *dst) {
|
||||
// asm volatile("cp.async.");
|
||||
// }
|
||||
|
||||
template <size_t splitD, size_t N>
|
||||
__global__ void __launch_bounds__(splitD, 2)
|
||||
ssm_scan_f32(const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2,
|
||||
const float * __restrict__ src3, const float * __restrict__ src4, const float * __restrict__ src5,
|
||||
const int src0_nb1, const int src0_nb2, const int src1_nb0, const int src1_nb1, const int src1_nb2,
|
||||
const int src1_nb3, const int src2_nb0, const int src2_nb1, const int src2_nb2, const int src3_nb1,
|
||||
const int src4_nb1, const int src4_nb2, const int src5_nb1, const int src5_nb2,
|
||||
float * __restrict__ dst, const int D, const int L, const int B) {
|
||||
const int bidx = blockIdx.x; // split along B
|
||||
const int bidy = blockIdx.y; // split along D
|
||||
const int tid = threadIdx.x;
|
||||
const int wid = tid / 32;
|
||||
const int wtid = tid % 32;
|
||||
|
||||
extern __shared__ float smem[];
|
||||
const int stride_sA = N + 1;
|
||||
const int stride_ss0 = N + 1;
|
||||
float * smem_A = smem;
|
||||
float * smem_s0 = smem_A + splitD * stride_sA;
|
||||
|
||||
const float * s0_block = (const float *) ((char *) src0 + bidx * src0_nb2 + bidy * splitD * src0_nb1);
|
||||
const float * x_block = (const float *) ((char *) src1 + (bidx * src1_nb2) + bidy * splitD * sizeof(float));
|
||||
const float * dt_block = (const float *) ((char *) src2 + (bidx * src2_nb2) + bidy * splitD * sizeof(float));
|
||||
const float * A_block = (const float *) ((char *) src3 + bidy * splitD * src3_nb1);
|
||||
const float * B_block = (const float *) ((char *) src4 + (bidx * src4_nb2));
|
||||
const float * C_block = (const float *) ((char *) src5 + (bidx * src5_nb2));
|
||||
float * y_block = (float *) ((char *) dst + (bidx * src1_nb2) + bidy * splitD * sizeof(float));
|
||||
float * s_block = (float *) ((char *) dst + src1_nb3 + bidx * src0_nb2 + bidy * splitD * src0_nb1);
|
||||
|
||||
const int stride_s0 = src0_nb1 / sizeof(float);
|
||||
const int stride_x = src1_nb1 / sizeof(float);
|
||||
const int stride_dt = src2_nb1 / sizeof(float);
|
||||
const int stride_A = src3_nb1 / sizeof(float);
|
||||
const int stride_B = src4_nb1 / sizeof(float);
|
||||
const int stride_C = src5_nb1 / sizeof(float);
|
||||
const int stride_s = stride_s0;
|
||||
const int stride_y = stride_x;
|
||||
|
||||
// can N not be 16? for example 32?
|
||||
if (N == 16) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < splitD / 4; i += 2) {
|
||||
float value = A_block[(wid * warpSize + i) * stride_A + wtid];
|
||||
// todo: bank conflict
|
||||
// I am always confused with how to use the swizzling method to solve
|
||||
// bank conflit. Hoping somebody can tell me.
|
||||
smem_A[(wid * warpSize + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
||||
}
|
||||
#pragma unroll
|
||||
for (int i = 0; i < splitD / 4; i += 2) {
|
||||
float value = s0_block[(wid * warpSize + i) * stride_s0 + wtid];
|
||||
smem_s0[(wid * warpSize + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int i = 0; i < L; i++) {
|
||||
float dt_soft_plus = dt_block[i * stride_dt + tid];
|
||||
if (dt_soft_plus <= 20.0f) {
|
||||
dt_soft_plus = log1pf(exp(dt_soft_plus));
|
||||
}
|
||||
float x_dt = x_block[i * stride_x + tid] * dt_soft_plus;
|
||||
float sumf = 0.0f;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < N; j++) {
|
||||
float state = (smem_s0[tid * stride_ss0 + j] * expf(dt_soft_plus * smem_A[tid * stride_sA + j])) +
|
||||
(B_block[i * stride_B + j] * x_dt);
|
||||
sumf += state * C_block[i * stride_C + j];
|
||||
if (i == L - 1) {
|
||||
s_block[tid * stride_s + j] = state;
|
||||
} else {
|
||||
smem_s0[tid * stride_ss0 + j] = state;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
y_block[i * stride_y + tid] = sumf;
|
||||
}
|
||||
}
|
||||
|
||||
static void ssm_scan_f32_cuda(const float * src0, const float * src1, const float * src2, const float * src3,
|
||||
const float * src4, const float * src5, const int src0_nb1, const int src0_nb2,
|
||||
const int src1_nb0, const int src1_nb1, const int src1_nb2, const int src1_nb3,
|
||||
const int src2_nb0, const int src2_nb1, const int src2_nb2, const int src3_nb1,
|
||||
const int src4_nb1, const int src4_nb2, const int src5_nb1, const int src5_nb2,
|
||||
float * dst, const int N, const int D, const int L, const int B, cudaStream_t stream) {
|
||||
const int threads = 128;
|
||||
// todo: consider D cannot be divided,does this situation exist?
|
||||
GGML_ASSERT(D % threads == 0);
|
||||
const dim3 blocks(B, (D + threads - 1) / threads, 1);
|
||||
const int smem_size = (threads * (N + 1) * 2) * sizeof(float);
|
||||
if (N == 16) {
|
||||
ssm_scan_f32<128, 16><<<blocks, threads, smem_size, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src0_nb1, src0_nb2, src1_nb0, src1_nb1, src1_nb2, src1_nb3, src2_nb0,
|
||||
src2_nb1, src2_nb2, src3_nb1, src4_nb1, src4_nb2, src5_nb1, src5_nb2, dst, D, L, B);
|
||||
} else {
|
||||
GGML_ABORT("doesn't support N!=16.");
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_ssm_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const struct ggml_tensor * src0 = dst->src[0]; // s
|
||||
const struct ggml_tensor * src1 = dst->src[1]; // x
|
||||
const struct ggml_tensor * src2 = dst->src[2]; // dt
|
||||
const struct ggml_tensor * src3 = dst->src[3]; // A
|
||||
const struct ggml_tensor * src4 = dst->src[4]; // B
|
||||
const struct ggml_tensor * src5 = dst->src[5]; // C
|
||||
|
||||
// const int64_t d_state = src0->ne[0];
|
||||
// const int64_t d_inner = src0->ne[1];
|
||||
// const int64_t l = src1->ne[1];
|
||||
// const int64_t b = src0->ne[2];
|
||||
|
||||
const int64_t nc = src0->ne[0]; // d_state
|
||||
const int64_t nr = src0->ne[1]; // d_inner
|
||||
const int64_t n_t = src1->ne[1]; // number of tokens per sequence
|
||||
const int64_t n_s = src0->ne[2]; // number of sequences in the batch
|
||||
|
||||
GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
|
||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src1->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src2->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src3->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src4->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src5->nb[0] == sizeof(float));
|
||||
// required for the dot product between s and C
|
||||
GGML_ASSERT(src0->nb[1] == src0->ne[0] * sizeof(float));
|
||||
// required for per-sequence offsets for states
|
||||
GGML_ASSERT(src0->nb[2] == src0->ne[0] * src0->ne[1] * sizeof(float));
|
||||
// required to get correct offset for state destination (i.e. src1->nb[3])
|
||||
GGML_ASSERT(src1->nb[3] == src1->ne[0] * src1->ne[1] * src1->ne[2] * sizeof(float));
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
const float * src2_d = (const float *) src2->data;
|
||||
const float * src3_d = (const float *) src3->data;
|
||||
const float * src4_d = (const float *) src4->data;
|
||||
const float * src5_d = (const float *) src5->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
ssm_scan_f32_cuda(src0_d, src1_d, src2_d, src3_d, src4_d, src5_d, src0->nb[1], src0->nb[2], src1->nb[0],
|
||||
src1->nb[1], src1->nb[2], src1->nb[3], src2->nb[0], src2->nb[1], src2->nb[2], src3->nb[1],
|
||||
src4->nb[1], src4->nb[2], src5->nb[1], src5->nb[2], dst_d, nc, nr, n_t, n_s, stream);
|
||||
}
|
||||
3
ggml/src/ggml-cuda/ssm-scan.cuh
Normal file
3
ggml/src/ggml-cuda/ssm-scan.cuh
Normal file
@@ -0,0 +1,3 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_op_ssm_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -23,6 +23,23 @@ ggml_add_backend_library(ggml-sycl
|
||||
../../include/ggml-sycl.h
|
||||
)
|
||||
|
||||
file(GLOB GGML_HEADERS_SYCL "*.hpp")
|
||||
file(GLOB GGML_SOURCES_SYCL "*.cpp")
|
||||
target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
|
||||
|
||||
find_package(IntelSYCL)
|
||||
if (IntelSYCL_FOUND)
|
||||
# Use oneAPI CMake when possible
|
||||
target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX)
|
||||
else()
|
||||
# Fallback to the simplest way of enabling SYCL when using intel/llvm nightly for instance
|
||||
target_compile_options(ggml-sycl PRIVATE "-fsycl")
|
||||
target_link_options(ggml-sycl PRIVATE "-fsycl")
|
||||
endif()
|
||||
|
||||
target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
|
||||
|
||||
# Link against oneDNN
|
||||
find_package(DNNL)
|
||||
set(GGML_SYCL_DNNL 0)
|
||||
if(DNNL_FOUND)
|
||||
@@ -62,8 +79,6 @@ if (GGML_SYCL_F16)
|
||||
add_compile_definitions(GGML_SYCL_F16)
|
||||
endif()
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
|
||||
|
||||
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
@@ -76,34 +91,84 @@ else()
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
|
||||
endif()
|
||||
|
||||
file(GLOB GGML_HEADERS_SYCL "*.hpp")
|
||||
file(GLOB GGML_SOURCES_SYCL "*.cpp")
|
||||
target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
|
||||
if (GGML_SYCL_GRAPH)
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH)
|
||||
endif()
|
||||
|
||||
|
||||
if (WIN32)
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
# Link against Intel oneMKL or oneMath
|
||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
# Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
|
||||
# See https://github.com/uxlfoundation/oneMath/issues/654
|
||||
find_package(MKL REQUIRED)
|
||||
target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
||||
target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
|
||||
else()
|
||||
if (GGML_SYCL_GRAPH)
|
||||
add_compile_definitions(GGML_SYCL_GRAPH)
|
||||
find_package(oneMath QUIET)
|
||||
if (NOT oneMath_FOUND)
|
||||
message(STATUS "oneMath not found: oneMath will be automatically downloaded")
|
||||
# Use FetchContent to automatically pull and build oneMath
|
||||
include(FetchContent)
|
||||
set(BUILD_FUNCTIONAL_TESTS False)
|
||||
set(BUILD_EXAMPLES False)
|
||||
set(TARGET_DOMAINS blas)
|
||||
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
set(ENABLE_MKLCPU_BACKEND False)
|
||||
set(ENABLE_MKLGPU_BACKEND False)
|
||||
set(ENABLE_CUBLAS_BACKEND True)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
set(ENABLE_MKLCPU_BACKEND False)
|
||||
set(ENABLE_MKLGPU_BACKEND False)
|
||||
set(ENABLE_ROCBLAS_BACKEND True)
|
||||
# Ensure setting a string variable here is not overriden by oneMath CACHE variables
|
||||
cmake_policy(SET CMP0126 NEW)
|
||||
# Setting the device architecture is only needed and useful for AMD devices in oneMath
|
||||
set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE)
|
||||
endif()
|
||||
FetchContent_Declare(
|
||||
ONEMATH
|
||||
GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
|
||||
GIT_TAG c255b1b4c41e2ee3059455c1f96a965d6a62568a
|
||||
)
|
||||
FetchContent_MakeAvailable(ONEMATH)
|
||||
# Create alias to match with find_package targets name
|
||||
function(onemath_alias target)
|
||||
if (TARGET ${target}_obj)
|
||||
# Silence verbose warnings from external libraries
|
||||
target_compile_options(${target}_obj PRIVATE -w)
|
||||
endif()
|
||||
if (TARGET ${target})
|
||||
add_library(ONEMATH::${target} ALIAS ${target})
|
||||
endif()
|
||||
endfunction()
|
||||
onemath_alias(onemath)
|
||||
onemath_alias(onemath_blas_mklcpu)
|
||||
onemath_alias(onemath_blas_mklgpu)
|
||||
onemath_alias(onemath_blas_cublas)
|
||||
onemath_alias(onemath_blas_rocblas)
|
||||
endif()
|
||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
|
||||
add_compile_definitions(GGML_SYCL_NVIDIA)
|
||||
target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl_blas_cublas)
|
||||
|
||||
# Below oneMath compile-time dispatching is used for better performance
|
||||
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas)
|
||||
target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
|
||||
target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
if (NOT GGML_SYCL_DEVICE_ARCH)
|
||||
message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
|
||||
endif()
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa")
|
||||
target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl)
|
||||
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas)
|
||||
target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
|
||||
target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD)
|
||||
else()
|
||||
# Fallback to oneMath runtime dispatcher
|
||||
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath)
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC)
|
||||
endif()
|
||||
|
||||
if (GGML_SYCL_DEVICE_ARCH)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_SYCL_DEVICE_ARCH)
|
||||
target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
|
||||
target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
|
||||
endif()
|
||||
|
||||
@@ -16,9 +16,18 @@
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <sycl/half_type.hpp>
|
||||
#include <syclcompat/math.hpp>
|
||||
#include <oneapi/mkl.hpp>
|
||||
#include <map>
|
||||
|
||||
#ifdef GGML_SYCL_USE_INTEL_ONEMKL
|
||||
#include <oneapi/mkl.hpp>
|
||||
// Allow to use the same namespace for Intel oneMKL and oneMath
|
||||
namespace oneapi {
|
||||
namespace math = mkl;
|
||||
}
|
||||
#else
|
||||
#include <oneapi/math.hpp>
|
||||
#endif
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#if defined(__linux__)
|
||||
@@ -83,13 +92,32 @@ inline std::string get_device_backend_and_type(const sycl::device &device) {
|
||||
}
|
||||
|
||||
template <typename Ts> struct matrix_info_t {
|
||||
oneapi::mkl::transpose transpose_info[2];
|
||||
oneapi::math::transpose transpose_info[2];
|
||||
Ts value_info[2];
|
||||
std::int64_t size_info[3];
|
||||
std::int64_t ld_info[3];
|
||||
std::int64_t groupsize_info;
|
||||
};
|
||||
|
||||
inline auto get_onemath_backend(sycl::queue& queue)
|
||||
#if defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL)
|
||||
-> sycl::queue&
|
||||
#endif
|
||||
{
|
||||
// If the backend is known at compile-time, use oneMath backend_selector to use
|
||||
// compile-time dispatching and avoid the need to dlopen libraries. Otherwise
|
||||
// fallback to runtime dispatching.
|
||||
#if defined(GGML_SYCL_NVIDIA)
|
||||
return oneapi::math::backend_selector<oneapi::math::backend::cublas>{ queue };
|
||||
#elif defined(GGML_SYCL_AMD)
|
||||
return oneapi::math::backend_selector<oneapi::math::backend::rocblas>{ queue };
|
||||
#elif defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL)
|
||||
return queue;
|
||||
#else
|
||||
static_assert(false, "Unsupported backend");
|
||||
#endif
|
||||
}
|
||||
|
||||
namespace dpct
|
||||
{
|
||||
typedef sycl::queue *queue_ptr;
|
||||
@@ -1686,26 +1714,18 @@ namespace dpct
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <class Ta, class Tb, class Tc, class Ts>
|
||||
inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
|
||||
oneapi::mkl::transpose b_trans, int m, int n, int k,
|
||||
const void *alpha, const void *a, int lda, const void *b,
|
||||
int ldb, const void *beta, void *c, int ldc)
|
||||
{
|
||||
Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
|
||||
Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
|
||||
auto data_a = get_memory<const Ta>(a);
|
||||
auto data_b = get_memory<const Tb>(b);
|
||||
auto data_c = get_memory<Tc>(c);
|
||||
#ifdef GGML_SYCL_NVIDIA
|
||||
oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q },
|
||||
a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb,
|
||||
beta_value, data_c, ldc);
|
||||
#else
|
||||
oneapi::mkl::blas::column_major::gemm(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb,
|
||||
beta_value, data_c, ldc);
|
||||
#endif
|
||||
}
|
||||
template <class Ta, class Tb, class Tc, class Ts>
|
||||
inline void gemm_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
|
||||
int n, int k, const void * alpha, const void * a, int lda, const void * b, int ldb,
|
||||
const void * beta, void * c, int ldc) {
|
||||
Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
|
||||
Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
|
||||
auto data_a = get_memory<const Ta>(a);
|
||||
auto data_b = get_memory<const Tb>(b);
|
||||
auto data_c = get_memory<Tc>(c);
|
||||
oneapi::math::blas::column_major::gemm(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, data_a,
|
||||
lda, data_b, ldb, beta_value, data_c, ldc);
|
||||
}
|
||||
|
||||
template <typename VecT, class BinaryOperation, class = void>
|
||||
class vectorized_binary
|
||||
@@ -1735,7 +1755,7 @@ namespace dpct
|
||||
};
|
||||
|
||||
template <class Ta, class Tb, class Tc, class Ts>
|
||||
inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
|
||||
inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans,
|
||||
int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b,
|
||||
int ldb, const void * beta, void ** c, int ldc, int batch_size,
|
||||
matrix_info_t<float> * matrix_info) {
|
||||
@@ -1754,48 +1774,28 @@ namespace dpct
|
||||
matrix_info->ld_info[2] = ldc;
|
||||
matrix_info->groupsize_info = batch_size;
|
||||
|
||||
#ifdef GGML_SYCL_NVIDIA
|
||||
sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
|
||||
oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, matrix_info->transpose_info,
|
||||
matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1,
|
||||
matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
|
||||
reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
|
||||
matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
|
||||
reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
|
||||
#else
|
||||
sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
|
||||
q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info,
|
||||
matrix_info->size_info + 1, matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
|
||||
reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
|
||||
matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
|
||||
reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
|
||||
#endif
|
||||
sycl::event e = oneapi::math::blas::column_major::gemm_batch(
|
||||
get_onemath_backend(q), matrix_info->transpose_info, matrix_info->transpose_info + 1,
|
||||
matrix_info->size_info, matrix_info->size_info + 1, matrix_info->size_info + 2,
|
||||
reinterpret_cast<Ts *>(matrix_info->value_info), reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
|
||||
reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
|
||||
reinterpret_cast<Ts *>(matrix_info->value_info + 1), reinterpret_cast<Tc **>(c),
|
||||
matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
|
||||
}
|
||||
|
||||
template <class Ta, class Tb, class Tc, class Ts>
|
||||
inline void
|
||||
gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
|
||||
oneapi::mkl::transpose b_trans, int m, int n,
|
||||
int k, const void *alpha, const void *a, int lda,
|
||||
long long int stride_a, const void *b, int ldb,
|
||||
long long int stride_b, const void *beta, void *c,
|
||||
int ldc, long long int stride_c, int batch_size)
|
||||
{
|
||||
inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans,
|
||||
int m, int n, int k, const void * alpha, const void * a, int lda,
|
||||
long long int stride_a, const void * b, int ldb, long long int stride_b,
|
||||
const void * beta, void * c, int ldc, long long int stride_c, int batch_size) {
|
||||
Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
|
||||
Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
|
||||
auto data_a = get_memory<const Ta>(a);
|
||||
auto data_b = get_memory<const Tb>(b);
|
||||
auto data_c = get_memory<Tc>(c);
|
||||
#ifdef GGML_SYCL_NVIDIA
|
||||
oneapi::mkl::blas::column_major::gemm_batch(
|
||||
oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, a_trans, b_trans, m, n, k,
|
||||
alpha_value, data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, stride_c,
|
||||
batch_size);
|
||||
#else
|
||||
oneapi::mkl::blas::column_major::gemm_batch(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
|
||||
stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc,
|
||||
stride_c, batch_size);
|
||||
#endif
|
||||
oneapi::math::blas::column_major::gemm_batch(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value,
|
||||
data_a, lda, stride_a, data_b, ldb, stride_b, beta_value,
|
||||
data_c, ldc, stride_c, batch_size);
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
@@ -2259,13 +2259,10 @@ namespace dpct
|
||||
sycl::range<3>(x, y, 1), direction);
|
||||
}
|
||||
|
||||
inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans,
|
||||
oneapi::mkl::transpose b_trans, int m, int n, int k,
|
||||
const void *alpha, const void *a, library_data_t a_type,
|
||||
int lda, const void *b, library_data_t b_type, int ldb,
|
||||
const void *beta, void *c, library_data_t c_type, int ldc,
|
||||
library_data_t scaling_type)
|
||||
{
|
||||
inline void gemm(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, int n,
|
||||
int k, const void * alpha, const void * a, library_data_t a_type, int lda, const void * b,
|
||||
library_data_t b_type, int ldb, const void * beta, void * c, library_data_t c_type, int ldc,
|
||||
library_data_t scaling_type) {
|
||||
if (scaling_type == library_data_t::real_float &&
|
||||
c_type == library_data_t::complex_float)
|
||||
{
|
||||
@@ -2329,9 +2326,8 @@ namespace dpct
|
||||
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
|
||||
library_data_t::real_float, library_data_t::real_float):
|
||||
{
|
||||
detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
|
||||
float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
|
||||
ldb, beta, c, ldc);
|
||||
detail::gemm_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
|
||||
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
|
||||
break;
|
||||
}
|
||||
case detail::get_type_combination_id(
|
||||
@@ -2369,8 +2365,7 @@ namespace dpct
|
||||
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
|
||||
library_data_t::real_bfloat16, library_data_t::real_float):
|
||||
{
|
||||
detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
|
||||
oneapi::mkl::bfloat16, float>(
|
||||
detail::gemm_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
|
||||
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
|
||||
break;
|
||||
}
|
||||
@@ -2390,7 +2385,7 @@ namespace dpct
|
||||
default:
|
||||
throw std::runtime_error("the combination of data type is unsupported");
|
||||
}
|
||||
} // gemm()
|
||||
} // gemm()
|
||||
|
||||
/// Computes a batch of matrix-matrix product with general matrices.
|
||||
/// \param [in] q The queue where the routine should be executed.
|
||||
@@ -2412,7 +2407,7 @@ namespace dpct
|
||||
/// \param [in] ldc Leading dimension of C.
|
||||
/// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
|
||||
/// \param [in] scaling_type Data type of the scaling factors.
|
||||
inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
|
||||
inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
|
||||
int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda,
|
||||
const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[],
|
||||
library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type,
|
||||
@@ -2450,7 +2445,7 @@ namespace dpct
|
||||
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
|
||||
library_data_t::real_bfloat16, library_data_t::real_float):
|
||||
{
|
||||
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
|
||||
detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
|
||||
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
|
||||
break;
|
||||
}
|
||||
@@ -2458,7 +2453,7 @@ namespace dpct
|
||||
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
|
||||
library_data_t::real_float, library_data_t::real_float):
|
||||
{
|
||||
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
|
||||
detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
|
||||
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
|
||||
break;
|
||||
}
|
||||
@@ -2534,15 +2529,11 @@ namespace dpct
|
||||
/// \param [in] stride_c Stride between the different C matrices.
|
||||
/// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
|
||||
/// \param [in] scaling_type Data type of the scaling factors.
|
||||
inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
|
||||
oneapi::mkl::transpose b_trans, int m, int n, int k,
|
||||
const void *alpha, const void *a, library_data_t a_type,
|
||||
int lda, long long int stride_a, const void *b,
|
||||
library_data_t b_type, int ldb, long long int stride_b,
|
||||
const void *beta, void *c, library_data_t c_type,
|
||||
int ldc, long long int stride_c, int batch_size,
|
||||
library_data_t scaling_type)
|
||||
{
|
||||
inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
|
||||
int n, int k, const void * alpha, const void * a, library_data_t a_type, int lda,
|
||||
long long int stride_a, const void * b, library_data_t b_type, int ldb,
|
||||
long long int stride_b, const void * beta, void * c, library_data_t c_type, int ldc,
|
||||
long long int stride_c, int batch_size, library_data_t scaling_type) {
|
||||
if (scaling_type == library_data_t::real_float &&
|
||||
c_type == library_data_t::complex_float)
|
||||
{
|
||||
@@ -2611,20 +2602,18 @@ namespace dpct
|
||||
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
|
||||
library_data_t::real_bfloat16, library_data_t::real_float):
|
||||
{
|
||||
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
|
||||
oneapi::mkl::bfloat16, float>(
|
||||
q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
|
||||
beta, c, ldc, stride_c, batch_size);
|
||||
detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
|
||||
q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
|
||||
batch_size);
|
||||
break;
|
||||
}
|
||||
case detail::get_type_combination_id(
|
||||
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
|
||||
library_data_t::real_float, library_data_t::real_float):
|
||||
{
|
||||
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
|
||||
float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
|
||||
stride_a, b, ldb, stride_b, beta, c, ldc,
|
||||
stride_c, batch_size);
|
||||
detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
|
||||
q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
|
||||
batch_size);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -2059,8 +2059,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
const sycl::half alpha_f16 = 1.0f;
|
||||
const sycl::half beta_f16 = 0.0f;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
|
||||
*stream, oneapi::mkl::transpose::trans,
|
||||
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
|
||||
*stream, oneapi::math::transpose::trans,
|
||||
oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
|
||||
&alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
|
||||
src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
|
||||
dst_f16.get(), dpct::library_data_t::real_half, ldc,
|
||||
@@ -2097,17 +2097,10 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
#if !GGML_SYCL_DNNL
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
# ifdef GGML_SYCL_NVIDIA
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
|
||||
oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream }, oneapi::mkl::transpose::trans,
|
||||
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i,
|
||||
ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
|
||||
# else
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
|
||||
*stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
|
||||
dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream),
|
||||
dst_dd_i, ldc)));
|
||||
# endif
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
|
||||
get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
|
||||
src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
|
||||
dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
|
||||
#else
|
||||
DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i,
|
||||
DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
|
||||
@@ -2836,14 +2829,10 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
|
||||
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
|
||||
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
||||
*main_stream, oneapi::mkl::transpose::trans,
|
||||
oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
(const char *)src0_as_f16, dpct::library_data_t::real_half,
|
||||
nb01 / nb00, nb02 / nb00,
|
||||
(const char *)src1_f16, dpct::library_data_t::real_half,
|
||||
nb11 / nb10, nb12 / nb10, beta,
|
||||
(char *)dst_t, cu_data_type, ne01, nb2 / nb0,
|
||||
ne12 * ne13, cu_compute_type)));
|
||||
*main_stream, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
(const char *) src0_as_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
|
||||
(const char *) src1_f16, dpct::library_data_t::real_half, nb11 / nb10, nb12 / nb10, beta, (char *) dst_t,
|
||||
cu_data_type, ne01, nb2 / nb0, ne12 * ne13, cu_compute_type)));
|
||||
} else {
|
||||
const int ne23 = ne12*ne13;
|
||||
|
||||
@@ -2878,7 +2867,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
|
||||
});
|
||||
}
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
||||
*main_stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
*main_stream, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
|
||||
(const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
|
||||
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, nb11 / nb10, beta,
|
||||
(void **) (ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type, matrix_info.get())));
|
||||
|
||||
@@ -367,7 +367,7 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
l2_norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
nullptr, WARP_SIZE);
|
||||
});
|
||||
@@ -389,7 +389,7 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
|
||||
block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1)
|
||||
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
l2_norm_f32(x, dst, ncols, eps, item_ct1,
|
||||
get_pointer(s_sum_acc_ct1), work_group_size);
|
||||
});
|
||||
|
||||
@@ -1,8 +1,5 @@
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <oneapi/mkl.hpp>
|
||||
#include "outprod.hpp"
|
||||
|
||||
|
||||
void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
const ggml_tensor *src0 = dst->src[0];
|
||||
const ggml_tensor *src1 = dst->src[1];
|
||||
@@ -34,20 +31,13 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
|
||||
// Handle transposition of src1
|
||||
const bool src1_T = ggml_is_transposed(src1);
|
||||
const oneapi::mkl::transpose src1_op =
|
||||
src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans;
|
||||
const oneapi::math::transpose src1_op = src1_T ? oneapi::math::transpose::nontrans : oneapi::math::transpose::trans;
|
||||
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
|
||||
|
||||
try {
|
||||
// Perform matrix multiplication using oneMKL GEMM
|
||||
#ifdef GGML_SYCL_NVIDIA
|
||||
oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream },
|
||||
oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, src0_d,
|
||||
ne00, src1_d, ldb, beta, dst_d, ne0);
|
||||
#else
|
||||
oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha,
|
||||
src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
|
||||
#endif
|
||||
// Perform matrix multiplication using oneMath GEMM
|
||||
oneapi::math::blas::column_major::gemm(get_onemath_backend(*stream), oneapi::math::transpose::nontrans, src1_op,
|
||||
ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
|
||||
}
|
||||
catch (sycl::exception const& exc) {
|
||||
std::cerr << exc.what() << std::endl;
|
||||
|
||||
@@ -3136,7 +3136,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
&& shader_integer_dot_product_features.shaderIntegerDotProduct;
|
||||
|
||||
coopmat_support = coopmat_support
|
||||
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
&& coopmat_features.cooperativeMatrix
|
||||
#endif
|
||||
&& ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture);
|
||||
|
||||
std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";
|
||||
|
||||
@@ -932,6 +932,7 @@ static void gguf_check_reserved_keys(const std::string & key, const T val) {
|
||||
if constexpr (std::is_same<T, uint32_t>::value) {
|
||||
GGML_ASSERT(val > 0 && (val & (val - 1)) == 0 && GGUF_KEY_GENERAL_ALIGNMENT " must be power of 2");
|
||||
} else {
|
||||
GGML_UNUSED(val);
|
||||
GGML_ABORT(GGUF_KEY_GENERAL_ALIGNMENT " must be type u32");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1807,6 +1807,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|| t.first == "<end_of_turn>"
|
||||
|| t.first == "<|endoftext|>"
|
||||
|| t.first == "<EOT>"
|
||||
|| t.first == "_<EOT>"
|
||||
|| t.first == "<|end▁of▁sentence|>" // DeepSeek
|
||||
) {
|
||||
special_eot_id = t.second;
|
||||
@@ -1839,6 +1840,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|| t.first == "<fim-prefix>"
|
||||
|| t.first == "<|fim▁begin|>" // DeepSeek
|
||||
|| t.first == "<PRE>"
|
||||
|| t.first == "▁<PRE>" // CodeLlama
|
||||
) {
|
||||
special_fim_pre_id = t.second;
|
||||
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||
@@ -1856,6 +1858,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|| t.first == "<fim-suffix>"
|
||||
|| t.first == "<|fim▁hole|>" // DeepSeek
|
||||
|| t.first == "<SUF>"
|
||||
|| t.first == "▁<SUF>" // CodeLlama
|
||||
) {
|
||||
special_fim_suf_id = t.second;
|
||||
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||
@@ -1873,6 +1876,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|| t.first == "<fim-middle>"
|
||||
|| t.first == "<|fim▁end|>" // DeepSeek
|
||||
|| t.first == "<MID>"
|
||||
|| t.first == "▁<MID>" // CodeLlama
|
||||
) {
|
||||
special_fim_mid_id = t.second;
|
||||
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||
@@ -1957,6 +1961,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|| t.first == "<|endoftext|>"
|
||||
|| t.first == "<|eom_id|>"
|
||||
|| t.first == "<EOT>"
|
||||
|| t.first == "_<EOT>"
|
||||
) {
|
||||
special_eog_ids.insert(t.second);
|
||||
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||
|
||||
Reference in New Issue
Block a user