Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2026-05-07 09:34:07 +00:00)

Compare commits — 20 commits
Commits in this comparison (SHA1; author and date columns were not preserved):

- d6fe40fa00
- e14e842e87
- 647b960bd8
- 299f5d782c
- ac76d36201
- 6515610506
- 7956bb4d7f
- 9008027aa3
- 16bcc1259d
- 9eb9a1331d
- 7c23f3f0d4
- 8c0d6bb455
- 5c9a18e674
- 7f09a680af
- aa374175c3
- 5b180c3d60
- b7f9010d24
- 4882f0ff78
- 9d7c518d64
- 22c8c3c6ad
.github/workflows/build.yml (vendored) — 18 lines changed:
```diff
@@ -161,15 +161,16 @@ jobs:
     - name: Dawn Dependency
       id: dawn-depends
       run: |
-        DAWN_VERSION="v1.0.0"
+        DAWN_VERSION="v2.0.0"
         DAWN_OWNER="reeselevine"
         DAWN_REPO="dawn"
-        DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-macos-latest-Release.tar.gz"
+        DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.zip"
        echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
-        curl -L -o artifact.tar.gz \
+        curl -L -o artifact.zip \
          "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
         mkdir dawn
-        tar -xvf artifact.tar.gz -C dawn --strip-components=1
+        unzip artifact.zip
+        tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.tar.gz -C dawn --strip-components=1

     - name: Build
       id: cmake_build
```
```diff
@@ -521,15 +522,16 @@ jobs:
       id: dawn-depends
       run: |
         sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
-        DAWN_VERSION="v1.0.0"
+        DAWN_VERSION="v2.0.0"
         DAWN_OWNER="reeselevine"
         DAWN_REPO="dawn"
-        DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-ubuntu-latest-Release.tar.gz"
+        DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.zip"
        echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
-        curl -L -o artifact.tar.gz \
+        curl -L -o artifact.zip \
          "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
         mkdir dawn
-        tar -xvf artifact.tar.gz -C dawn --strip-components=1
+        unzip artifact.zip
+        tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.tar.gz -C dawn --strip-components=1

     - name: Build
       id: cmake_build
```
common/CMakeLists.txt (file name inferred from content):

```diff
@@ -56,6 +56,8 @@ add_library(${TARGET} STATIC
     common.h
     console.cpp
     console.h
+    download.cpp
+    download.h
     http.h
     json-partial.cpp
     json-partial.h
```
common/arg.cpp — 1004 lines changed; file diff suppressed because it is too large.
common/arg.h (file name inferred from content):

```diff
@@ -59,8 +59,8 @@ struct common_arg {
     common_arg & set_sparam();
     bool in_example(enum llama_example ex);
     bool is_exclude(enum llama_example ex);
-    bool get_value_from_env(std::string & output);
-    bool has_value_from_env();
+    bool get_value_from_env(std::string & output) const;
+    bool has_value_from_env() const;
     std::string to_string();
};
```
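The only change here is const-qualifying the two environment getters. A hedged illustration of why that matters (the helper below is hypothetical, not part of the patch): without the `const` qualifiers these methods cannot be called through a `const` reference.

```cpp
#include <string>
#include <vector>

// Hypothetical helper, not from the patch: iterating over const
// common_arg objects only compiles once the getters are const-qualified.
void print_env_overrides(const std::vector<common_arg> & args) {
    for (const common_arg & arg : args) {
        std::string value;
        if (arg.get_value_from_env(value)) { // requires the const overload above
            // ... report that this option is overridden by the environment ...
        }
    }
}
```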
common/download.cpp — new file, 1014 lines; file diff suppressed because it is too large.

common/download.h — new file, 41 lines:
```cpp
#pragma once

#include <string>

struct common_params_model;

//
// download functionalities
//

struct common_hf_file_res {
    std::string repo; // repo name with ":tag" removed
    std::string ggufFile;
    std::string mmprojFile;
};

// resolve and download model from Docker registry
// return local path to downloaded model file
std::string common_docker_resolve_model(const std::string & docker);

/**
 * Allow getting the HF file from the HF repo with tag (like ollama), for example:
 * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
 * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
 * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
 * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
 *
 * Return pair of <repo, file> (with "repo" already having tag removed)
 *
 * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
 */
common_hf_file_res common_get_hf_file(
    const std::string & hf_repo_with_tag,
    const std::string & bearer_token,
    bool offline);

// returns true if download succeeded
bool common_download_model(
    const common_params_model & model,
    const std::string & bearer_token,
    bool offline);
```
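A minimal usage sketch of the new API (illustrative only; it assumes the declarations above and linkage against the llama.cpp common library):

```cpp
#include <cstdio>
#include <string>
// assumes common/download.h from the diff above is on the include path
#include "download.h"

int main() {
    // Tag resolution per the doc comment: ":Q4_K_M" selects that quant;
    // omitting the tag means "latest" (Q4_K_M, then Q4, then the first GGUF).
    common_hf_file_res res = common_get_hf_file(
        "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",
        /*bearer_token =*/ "",
        /*offline      =*/ false);

    std::printf("repo: %s\nfile: %s\n", res.repo.c_str(), res.ggufFile.c_str());
    return 0;
}
```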
docs/build.md (@@ -178,6 +178,48 @@ GeForce RTX 3070 8.6) — two sections added after the CUDA architectures example; the added text is shown below.

```bash
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86;89"
```

### Overriding the CUDA Version

If you have multiple CUDA installations on your system and want to compile llama.cpp for a specific one, e.g. for CUDA 11.7 installed under `/opt/cuda-11.7`:

```bash
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/opt/cuda-11.7/bin/nvcc -DCMAKE_INSTALL_RPATH="/opt/cuda-11.7/lib64;\$ORIGIN" -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
```

#### Fixing Compatibility Issues with Old CUDA and New glibc

If you try to use an old CUDA version (e.g. v11.7) with a new glibc version you can get errors like this:

```
/usr/include/bits/mathcalls.h(83): error: exception specification is
incompatible with that of previous function "cospi"

/opt/cuda-11.7/bin/../targets/x86_64-linux/include/crt/math_functions.h(5545):
here
```

It seems the least bad solution is to patch the CUDA installation to declare the correct signatures.
Replace the following lines in `/path/to/your/cuda/installation/targets/x86_64-linux/include/crt/math_functions.h`:

```C++
// original lines
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double cospi(double x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float  cospif(float x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double sinpi(double x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float  sinpif(float x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double rsqrt(double x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float  rsqrtf(float x);

// edited lines
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double cospi(double x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float  cospif(float x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double sinpi(double x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float  sinpif(float x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double rsqrt(double x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float  rsqrtf(float x) noexcept (true);
```

### Runtime CUDA environmental variables

You may set the [cuda environmental variables](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) at runtime.
docs/ops.md (file name inferred from content):

```diff
@@ -24,7 +24,7 @@ Legend:
 | ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 | CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | ❌ | ❌ |
 | CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
-| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ |
+| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ |
 | CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
 | CONV_2D | ❌ | ❌ | ✅ | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ |
 | CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
```
docs/ops/SYCL.csv (file name inferred; the full file is too large to render — only this hunk is shown). Every i32 CONCAT entry flips from "no" to "yes":

```diff
@@ -9307,37 +9307,37 @@
 "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=1","support","1","yes","SYCL"
 "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=1","support","1","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","0","no","SYCL"
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","0","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","0","no","SYCL"
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","0","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","0","no","SYCL"
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","0","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","0","no","SYCL"
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","0","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","0","no","SYCL"
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","0","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","0","no","SYCL"
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","0","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","0","no","SYCL"
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","0","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","0","no","SYCL"
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","0","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","0","no","SYCL"
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","0","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","0","no","SYCL"
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","0","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","0","no","SYCL"
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","0","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","0","no","SYCL"
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","0","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","0","no","SYCL"
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","0","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","0","no","SYCL"
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","0","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","no","SYCL"
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","yes","SYCL"
 "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","no","SYCL"
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","yes","SYCL"
 "SYCL0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","SYCL"
 "SYCL0","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","SYCL"
 "SYCL0","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","SYCL"
```
ggml/src/ggml-cpu/arch/riscv/quants.c (file name inferred from function names). The extraction stripped the `-`/`+` markers from the RISC-V hunks; they are reconstructed below from the old/new line pairing:

```diff
@@ -580,16 +580,19 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
         uint8_t *patmp = atmp;
         int vsums;
-        int tmp;
+        int tmp, t1, t2, t3, t4, t5, t6, t7;
         __asm__ __volatile__(
             "vsetivli zero, 16, e8, m1\n\t"
             "vmv.v.x v8, zero\n\t"
+            "lb zero, 15(%[sc])\n\t"
             "vle8.v v1, (%[sc])\n\t"
+            "vle8.v v2, (%[bsums])\n\t"
+            "addi %[tmp], %[bsums], 16\n\t"
             "vand.vi v0, v1, 0xF\n\t"
             "vsrl.vi v1, v1, 4\n\t"
+            "vle8.v v3, (%[tmp])\n\t"
             "vse8.v v0, (%[scale])\n\t"
             "vsetivli zero, 16, e16, m2\n\t"
-            "vle16.v v2, (%[bsums])\n\t"
             "vzext.vf2 v0, v1\n\t"
             "vwmul.vv v4, v0, v2\n\t"
             "vsetivli zero, 16, e32, m4\n\t"
```
```diff
@@ -608,46 +611,89 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

         for (int j = 0; j < QK_K/128; ++j) {
             __asm__ __volatile__(
-                "vsetvli zero, %[vl32], e8, m2\n\t"
+                "lb zero, 31(%[q2])\n\t"
+                "addi %[tmp], %[q2], 16\n\t"
+                "addi %[t1], %[q8], 16\n\t"
+                "vsetivli zero, 16, e8, m1\n\t"
                 "vle8.v v0, (%[q2])\n\t"
+                "vle8.v v1, (%[tmp])\n\t"
                 "vsrl.vi v2, v0, 2\n\t"
+                "vsrl.vi v3, v1, 2\n\t"
                 "vsrl.vi v4, v0, 4\n\t"
-                "vsrl.vi v6, v0, 6\n\t"
-                "vand.vi v0, v0, 0x3\n\t"
-                "vand.vi v2, v2, 0x3\n\t"
-                "vand.vi v4, v4, 0x3\n\t"
-                "vsetvli zero, %[vl128], e8, m8\n\t"
+                "addi %[tmp], %[q8], 32\n\t"
                 "vle8.v v8, (%[q8])\n\t"
-                "vsetvli zero, %[vl64], e8, m4\n\t"
+                "vle8.v v9, (%[t1])\n\t"
+                "addi %[t1], %[t1], 32\n\t"
+                "vsrl.vi v5, v1, 4\n\t"
+                "vsrl.vi v6, v0, 6\n\t"
+                "vsrl.vi v7, v1, 6\n\t"
+                "vle8.v v10, (%[tmp])\n\t"
+                "vle8.v v11, (%[t1])\n\t"
+                "addi %[tmp], %[tmp], 32\n\t"
+                "addi %[t1], %[t1], 32\n\t"
+                "vand.vi v0, v0, 0x3\n\t"
+                "vand.vi v1, v1, 0x3\n\t"
+                "vand.vi v2, v2, 0x3\n\t"
+                "vle8.v v12, (%[tmp])\n\t"
+                "vle8.v v13, (%[t1])\n\t"
+                "addi %[tmp], %[tmp], 32\n\t"
+                "addi %[t1], %[t1], 32\n\t"
+                "vand.vi v3, v3, 0x3\n\t"
+                "vand.vi v4, v4, 0x3\n\t"
+                "vand.vi v5, v5, 0x3\n\t"
+                "vle8.v v14, (%[tmp])\n\t"
+                "vle8.v v15, (%[t1])\n\t"
                 "vwmul.vv v16, v0, v8\n\t"
+                "vwmul.vv v18, v1, v9\n\t"
+                "vwmul.vv v20, v2, v10\n\t"
+                "vwmul.vv v22, v3, v11\n\t"
                 "vwmul.vv v24, v4, v12\n\t"
-                "vsetivli zero, 16, e16, m2\n\t"
+                "vwmul.vv v26, v5, v13\n\t"
+                "vwmul.vv v28, v6, v14\n\t"
+                "vwmul.vv v30, v7, v15\n\t"
+                "vsetivli zero, 8, e16, m1\n\t"
                 "vmv.v.x v0, zero\n\t"
-                "vwredsum.vs v10, v16, v0\n\t"
+                "lbu %[tmp], 0(%[scale])\n\t"
+                "vwredsum.vs v8, v16, v0\n\t"
                 "vwredsum.vs v9, v18, v0\n\t"
-                "vwredsum.vs v8, v20, v0\n\t"
-                "vwredsum.vs v7, v22, v0\n\t"
-                "vwredsum.vs v11, v24, v0\n\t"
-                "vwredsum.vs v12, v26, v0\n\t"
-                "vwredsum.vs v13, v28, v0\n\t"
-                "vwredsum.vs v14, v30, v0\n\t"
+                "lbu %[t1], 1(%[scale])\n\t"
+                "vwredsum.vs v10, v20, v0\n\t"
+                "vwredsum.vs v11, v22, v0\n\t"
+                "lbu %[t2], 2(%[scale])\n\t"
+                "vwredsum.vs v12, v24, v0\n\t"
+                "vwredsum.vs v13, v26, v0\n\t"
+                "lbu %[t3], 3(%[scale])\n\t"
+                "vwredsum.vs v14, v28, v0\n\t"
+                "vwredsum.vs v15, v30, v0\n\t"
+                "lbu %[t4], 4(%[scale])\n\t"
+                "vwredsum.vs v8, v17, v8\n\t"
+                "vwredsum.vs v9, v19, v9\n\t"
+                "lbu %[t5], 5(%[scale])\n\t"
+                "vwredsum.vs v10, v21, v10\n\t"
+                "vwredsum.vs v11, v23, v11\n\t"
+                "lbu %[t6], 6(%[scale])\n\t"
+                "vwredsum.vs v12, v25, v12\n\t"
+                "vwredsum.vs v13, v27, v13\n\t"
+                "lbu %[t7], 7(%[scale])\n\t"
+                "vwredsum.vs v14, v29, v14\n\t"
+                "vwredsum.vs v15, v31, v15\n\t"
                 "vsetivli zero, 4, e32, m1\n\t"
-                "vslideup.vi v10, v9, 1\n\t"
-                "vslideup.vi v8, v7, 1\n\t"
-                "vslideup.vi v11, v12, 1\n\t"
-                "vslideup.vi v13, v14, 1\n\t"
-                "vslideup.vi v10, v8, 2\n\t"
-                "vslideup.vi v11, v13, 2\n\t"
-                "vsetivli zero, 8, e32, m2\n\t"
-                "vle8.v v15, (%[scale])\n\t"
-                "vzext.vf4 v12, v15\n\t"
-                "vmul.vv v10, v10, v12\n\t"
-                "vredsum.vs v0, v10, v0\n\t"
+                "vmul.vx v0, v8, %[tmp]\n\t"
+                "vmul.vx v1, v9, %[t1]\n\t"
+                "vmacc.vx v0, %[t2], v10\n\t"
+                "vmacc.vx v1, %[t3], v11\n\t"
+                "vmacc.vx v0, %[t4], v12\n\t"
+                "vmacc.vx v1, %[t5], v13\n\t"
+                "vmacc.vx v0, %[t6], v14\n\t"
+                "vmacc.vx v1, %[t7], v15\n\t"
                 "vmv.x.s %[tmp], v0\n\t"
-                "add %[isum], %[isum], %[tmp]"
-                : [tmp] "=&r" (tmp), [isum] "+&r" (isum)
+                "vmv.x.s %[t1], v1\n\t"
+                "add %[isum], %[isum], %[tmp]\n\t"
+                "add %[isum], %[isum], %[t1]"
+                : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+                , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
+                , [isum] "+&r" (isum)
                 : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
                 , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
                 : "memory"
                 , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
                 , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
```
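For orientation, the vector code above computes the same per-128-weight accumulation as this scalar sketch (a hedged reference, not the actual fallback in the file; the arrays are assumed to hold already-unpacked values):

```cpp
#include <cstdint>

// Scalar sketch of the q2_K inner accumulation the RVV code implements:
// 128 weights per iteration, as 8 groups of 16; each group g has a 4-bit
// scale sc[g], 2-bit quants q2 and int8 activations q8.
static int q2_K_isum_ref(const uint8_t * sc, const int8_t * q2, const int8_t * q8) {
    int isum = 0;
    for (int g = 0; g < 8; ++g) {
        int gsum = 0;
        for (int i = 0; i < 16; ++i) {
            gsum += q2[16*g + i] * q8[16*g + i]; // widening multiply (vwmul + vwredsum)
        }
        isum += sc[g] * gsum;                    // per-group scale (vmul.vx / vmacc.vx)
    }
    return isum;
}
```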
```diff
@@ -929,7 +975,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const int8_t * restrict q8 = y[i].qs;

         int8_t * scale = (int8_t *)utmp;
-        int tmp;
+        int tmp, t1, t2, t3, t4, t5, t6, t7;
         __asm__ __volatile__(
             "vsetivli zero, 12, e8, m1\n\t"
             "vle8.v v0, (%[s6b])\n\t"
```
```diff
@@ -967,19 +1013,23 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     int isum = 0;
     for (int j = 0; j < QK_K; j += 128) {
         __asm__ __volatile__(
+            "lb zero, 31(%[q3])\n\t"
             "vsetvli zero, %[vl32], e8, m2, ta, mu\n\t"
             "vle8.v v8, (%[q3])\n\t"
             "vsrl.vi v10, v8, 2\n\t"
             "vsrl.vi v12, v8, 4\n\t"
             "vsrl.vi v14, v8, 6\n\t"
+            "lb zero, 64(%[q8])\n\t"
             "vand.vi v8, v8, 3\n\t"
             "vand.vi v10, v10, 3\n\t"
             "vand.vi v12, v12, 3\n\t"
             "vle8.v v2, (%[qh])\n\t"
+            "lb zero, 127(%[q8])\n\t"
             "vand.vx v4, v2, %[m]\n\t"
             "slli %[m], %[m], 1\n\t"
             "vmseq.vx v0, v4, zero\n\t"
             "vadd.vi v8, v8, -4, v0.t\n\t"
+            "lb zero, 0(%[q8])\n\t"
             "vand.vx v4, v2, %[m]\n\t"
             "slli %[m], %[m], 1\n\t"
             "vmseq.vx v0, v4, zero\n\t"
```
```diff
@@ -994,34 +1044,43 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             "vadd.vi v14, v14, -4, v0.t\n\t"
             "vsetvli zero, %[vl128], e8, m8\n\t"
             "vle8.v v0, (%[q8])\n\t"
+            "lb %[tmp], 0(%[scale])\n\t"
+            "lb %[t1], 1(%[scale])\n\t"
+            "lb %[t2], 2(%[scale])\n\t"
+            "lb %[t3], 3(%[scale])\n\t"
             "vsetvli zero, %[vl64], e8, m4\n\t"
             "vwmul.vv v16, v0, v8\n\t"
             "vwmul.vv v24, v4, v12\n\t"
             "vsetivli zero, 16, e16, m2\n\t"
             "vmv.v.x v0, zero\n\t"
-            "vwredsum.vs v10, v16, v0\n\t"
+            "vwredsum.vs v8, v16, v0\n\t"
+            "lb %[t4], 4(%[scale])\n\t"
+            "lb %[t5], 5(%[scale])\n\t"
             "vwredsum.vs v9, v18, v0\n\t"
-            "vwredsum.vs v8, v20, v0\n\t"
-            "vwredsum.vs v7, v22, v0\n\t"
-            "vwredsum.vs v11, v24, v0\n\t"
-            "vwredsum.vs v12, v26, v0\n\t"
-            "vwredsum.vs v13, v28, v0\n\t"
-            "vwredsum.vs v14, v30, v0\n\t"
+            "vwredsum.vs v10, v20, v0\n\t"
+            "vwredsum.vs v11, v22, v0\n\t"
+            "vwredsum.vs v12, v24, v0\n\t"
+            "lb %[t6], 6(%[scale])\n\t"
+            "lb %[t7], 7(%[scale])\n\t"
+            "vwredsum.vs v13, v26, v0\n\t"
+            "vwredsum.vs v14, v28, v0\n\t"
+            "vwredsum.vs v15, v30, v0\n\t"
             "vsetivli zero, 4, e32, m1\n\t"
-            "vslideup.vi v10, v9, 1\n\t"
-            "vslideup.vi v8, v7, 1\n\t"
-            "vslideup.vi v11, v12, 1\n\t"
-            "vslideup.vi v13, v14, 1\n\t"
-            "vslideup.vi v10, v8, 2\n\t"
-            "vslideup.vi v11, v13, 2\n\t"
-            "vsetivli zero, 8, e32, m2\n\t"
-            "vle8.v v15, (%[scale])\n\t"
-            "vsext.vf4 v12, v15\n\t"
-            "vmul.vv v10, v10, v12\n\t"
-            "vredsum.vs v0, v10, v0\n\t"
+            "vmul.vx v0, v8, %[tmp]\n\t"
+            "vmul.vx v1, v9, %[t1]\n\t"
+            "vmacc.vx v0, %[t2], v10\n\t"
+            "vmacc.vx v1, %[t3], v11\n\t"
+            "vmacc.vx v0, %[t4], v12\n\t"
+            "vmacc.vx v1, %[t5], v13\n\t"
+            "vmacc.vx v0, %[t6], v14\n\t"
+            "vmacc.vx v1, %[t7], v15\n\t"
             "vmv.x.s %[tmp], v0\n\t"
-            "add %[isum], %[isum], %[tmp]"
-            : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum)
+            "vmv.x.s %[t1], v1\n\t"
+            "add %[isum], %[isum], %[tmp]\n\t"
+            "add %[isum], %[isum], %[t1]"
+            : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
+            , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
+            , [m] "+&r" (m), [isum] "+&r" (isum)
             : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
             , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
             : "memory"
```
ggml/src/ggml-cuda/cpy.cu (file name inferred from function names):

```diff
@@ -198,7 +198,7 @@ static void ggml_cpy_flt_cuda(
     if (transposed) {
         GGML_ASSERT(ne == ne00*ne01*ne02); // ne[3] is 1 assumed
         int ne00n, ne01n, ne02n;
-        if (nb00 < nb02) {
+        if (nb00 <= nb02) { // most likely safe to handle nb00 = nb02 case here
             ne00n = ne00;
             ne01n = ne01;
             ne02n = ne02;
@@ -206,8 +206,6 @@ static void ggml_cpy_flt_cuda(
             ne00n = ne00;
             ne01n = ne01*ne02;
             ne02n = 1;
-        } else {
-            GGML_ASSERT(false);
         }

         dim3 dimGrid( (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
```
ggml/src/ggml-cuda/ggml-cuda.cu (file name inferred from function names):

```diff
@@ -2113,7 +2113,7 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_f(const ggml_tensor * tensor) {
         src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

     const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-    use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, is_mul_mat_id ? src1->ne[2] : src1->ne[1]);
+    use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, is_mul_mat_id ? src1->ne[2] : src1->ne[1]);

     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) ||
         ggml_backend_buft_is_cuda_split(src1->buffer->buft);
```
```diff
@@ -2207,16 +2207,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             const int cc        = ggml_cuda_info().devices[id].cc;
             const int warp_size = ggml_cuda_info().devices[id].warp_size;
             use_mul_mat_q     = use_mul_mat_q     && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-            use_mul_mat_f     = use_mul_mat_f     && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
-            use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
+            use_mul_mat_f     = use_mul_mat_f     && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
+            use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
         }
     } else {
         const int cc        = ggml_cuda_info().devices[ctx.device].cc;
         const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
         use_mul_mat_q     = use_mul_mat_q     && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        use_mul_mat_f     = use_mul_mat_f     && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
-        use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
+        use_mul_mat_f     = use_mul_mat_f     && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
+        use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
     }
```
```diff
@@ -2287,7 +2287,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
         return;
     }

-    if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2], /*mul_mat_id=*/true)) {
+    if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true)) {
         ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
         return;
     }
```
ggml/src/ggml-cuda/mmf.cu (file name inferred from function names):

```diff
@@ -119,15 +119,27 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
     }
 }

-bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols, bool mul_mat_id) {
-
+bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne,
+        const size_t * src0_nb, const int src1_ncols, bool mul_mat_id) {
     if (ggml_is_quantized(type)) {
         return false;
     }

-    if (src0_ne[0] % (warp_size * (4/ggml_type_size(type))) != 0) {
+    const size_t ts = ggml_type_size(type);
+    if (src0_ne[0] % (warp_size * (4/ts)) != 0) {
         return false;
     }

+    if (src0_nb[0] != ts) {
+        return false;
+    }
+
+    // Pointers not aligned to the size of half2/nv_bfloat162/float2 would result in a crash:
+    for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
+        if (src0_nb[i] % (2*ts) != 0) {
+            return false;
+        }
+    }
+
     if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) {
         return false;
     }
```
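Both this check and the analogous `ggml_cuda_should_use_mmvf` change further below now reject tensors whose strides are not element-sized and 2-element aligned. A hedged standalone sketch of the predicate (same logic, written outside the CUDA build for illustration):

```cpp
#include <cstddef>

// Hedged sketch of the new stride checks: a 4-dim tensor with element size
// ts and byte strides nb[] is usable only if it is contiguous in dim 0 and
// every higher-dim stride keeps rows aligned for 2-element vector loads
// (half2/nv_bfloat162/float2); otherwise the kernel would fault.
static bool strides_ok(size_t ts, const size_t nb[4]) {
    if (nb[0] != ts) {
        return false;           // not contiguous along the innermost dim
    }
    for (int i = 1; i < 4; ++i) {
        if (nb[i] % (2*ts) != 0) {
            return false;       // a row start would break 2-element alignment
        }
    }
    return true;
}
```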
ggml/src/ggml-cuda/mmf.cuh (file name inferred; note the pre-existing "scr0_ne" typo is in the source on both sides of the change):

```diff
@@ -17,7 +17,7 @@ struct mmf_ids_data {

 void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);

-bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols, bool mul_mat_id);
+bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const size_t * src0_nb, const int src1_ncols, bool mul_mat_id);

 template <typename T, int rows_per_block, int cols_per_block, int nwarps, bool has_ids>
 __launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1)
```
ggml/src/ggml-cuda/mmq.cuh (file name inferred) — indexing fix in the stream-k fixup:

```diff
@@ -3494,7 +3494,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
     const int col_diff = col_high - col_low;

     for (int j = threadIdx.y*warp_size + threadIdx.x; j < mmq_x; j += nwarps*warp_size) {
-        ids_dst_shared[j] = ids_dst[col_low + j];
+        ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
     }
     __syncthreads();
```
|
||||
@@ -716,10 +716,23 @@ void ggml_cuda_op_mul_mat_vec_f(
|
||||
GGML_UNUSED_VARS(ctx, src1, dst, src1_ddq_i, src1_ncols, src1_padded_row_size);
|
||||
}
|
||||
|
||||
bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11) {
|
||||
bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, const size_t * src0_nb, int64_t ne11) {
|
||||
if (src0_ne[0] % 2 != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const size_t ts = ggml_type_size(type);
|
||||
if (src0_nb[0] != ts) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Pointers not aligned to the size of half2/nv_bfloat162/float2 would result in a crash:
|
||||
for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
|
||||
if (src0_nb[i] % (2*ts) != 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32:
|
||||
if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
|
||||
|
||||
ggml/src/ggml-cuda/mmvf.cuh (file name inferred):

```diff
@@ -9,4 +9,4 @@ void ggml_cuda_op_mul_mat_vec_f(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, cudaStream_t stream);

-bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11);
+bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, const size_t * src0_nb, int64_t ne11);
```
Metal backend context (struct ggml_metal; file name not preserved in this dump) — the per-context `use_bfloat` flag is dropped:

```diff
@@ -35,7 +35,6 @@ struct ggml_metal {
     // additional, inference-time compiled pipelines
     ggml_metal_pipelines_t pipelines_ext;

-    bool use_bfloat;
     bool use_fusion;
     bool use_concurrency;
     bool use_graph_optimize;
@@ -121,11 +120,10 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
         }
     }

-    const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
+    //const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);

     res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

-    res->use_bfloat      = props_dev->has_bfloat;
     res->use_fusion      = getenv("GGML_METAL_FUSION_DISABLE") == nil;
     res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil;
@@ -147,7 +145,6 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {

     memset(res->fuse_cnt, 0, sizeof(res->fuse_cnt));

-    GGML_LOG_INFO("%s: use bfloat         = %s\n", __func__, res->use_bfloat         ? "true" : "false");
     GGML_LOG_INFO("%s: use fusion         = %s\n", __func__, res->use_fusion         ? "true" : "false");
     GGML_LOG_INFO("%s: use concurrency    = %s\n", __func__, res->use_concurrency    ? "true" : "false");
     GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? "true" : "false");
```
ggml/src/ggml-metal/ggml-metal-device.h (file name inferred):

```diff
@@ -95,7 +95,9 @@ void ggml_metal_encoder_end_encoding(ggml_metal_encoder_t encoder);

 typedef struct ggml_metal_library * ggml_metal_library_t;

-ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev);
+ggml_metal_library_t ggml_metal_library_init            (ggml_metal_device_t dev);
+ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose);

 void ggml_metal_library_free(ggml_metal_library_t lib);

 ggml_metal_pipeline_t ggml_metal_library_get_pipeline (ggml_metal_library_t lib, const char * name);
@@ -193,6 +195,7 @@ struct ggml_metal_device_props {
     bool has_simdgroup_mm;
     bool has_unified_memory;
     bool has_bfloat;
+    bool has_tensor;
     bool use_residency_sets;
     bool use_shared_buffers;
```
ggml/src/ggml-metal/ggml-metal-device.m (file name inferred):

```diff
@@ -21,8 +21,9 @@
 #define GGML_METAL_HAS_RESIDENCY_SETS 1
 #endif

-// overload of MTLGPUFamilyMetal3 (not available in some environments)
+// overload of MTLGPUFamilyMetalX (not available in some environments)
 static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
+static const NSInteger MTLGPUFamilyMetal4_GGML = 5002;

 // virtual address for GPU memory allocations
 static atomic_uintptr_t g_addr_device = 0x000000400ULL;
@@ -261,6 +262,10 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
         [prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"];
     }

+    if (ggml_metal_device_get_props(dev)->has_tensor) {
+        [prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"];
+    }
+
 #if GGML_METAL_EMBED_LIBRARY
     [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
 #endif
```
```diff
@@ -298,6 +303,72 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
     return res;
 }

+ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose) {
+    if (source == NULL) {
+        GGML_LOG_ERROR("%s: source is NULL\n", __func__);
+        return NULL;
+    }
+
+    id<MTLDevice> device = ggml_metal_device_get_obj(dev);
+    id<MTLLibrary> library = nil;
+    NSError * error = nil;
+
+    const int64_t t_start = ggml_time_us();
+
+    NSString * src = [[NSString alloc] initWithBytes:source
+                                              length:strlen(source)
+                                            encoding:NSUTF8StringEncoding];
+    if (!src) {
+        GGML_LOG_ERROR("%s: failed to create NSString from source\n", __func__);
+        return NULL;
+    }
+
+    @autoreleasepool {
+        NSMutableDictionary * prep = [NSMutableDictionary dictionary];
+
+        MTLCompileOptions * options = [MTLCompileOptions new];
+        options.preprocessorMacros = prep;
+
+        library = [device newLibraryWithSource:src options:options error:&error];
+        if (error) {
+            if (verbose) {
+                GGML_LOG_ERROR("%s: error compiling source: %s\n", __func__, [[error description] UTF8String]);
+            } else {
+                GGML_LOG_ERROR("%s: error compiling source\n", __func__);
+            }
+            library = nil;
+        }
+
+        [options release];
+    }
+
+    [src release];
+
+    if (!library) {
+        if (verbose) {
+            GGML_LOG_ERROR("%s: failed to create Metal library from source\n", __func__);
+        }
+
+        return NULL;
+    }
+
+    if (verbose) {
+        GGML_LOG_INFO("%s: compiled in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6);
+    }
+
+    ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
+    if (!res) {
+        GGML_LOG_ERROR("%s: calloc failed\n", __func__);
+        return NULL;
+    }
+
+    res->obj = library;
+    res->device = device;
+    res->pipelines = ggml_metal_pipelines_init();
+
+    return res;
+}
+
 void ggml_metal_library_free(ggml_metal_library_t lib) {
     if (!lib) {
         return;
```
```diff
@@ -345,9 +416,9 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l
         if (!mtl_function) {
             ggml_critical_section_end();

-            GGML_LOG_ERROR("%s: error: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
+            GGML_LOG_ERROR("%s: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
             if (error) {
-                GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
             }

             return nil;
@@ -355,13 +426,21 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l

         res->obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];

-        ggml_metal_pipelines_add(lib->pipelines, name, res);
-
         [mtl_function release];

         GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name, (void *) res->obj,
                 (int) res->obj.maxTotalThreadsPerThreadgroup,
                 (int) res->obj.threadExecutionWidth);
+
+        if (res->obj.maxTotalThreadsPerThreadgroup == 0 || res->obj.threadExecutionWidth == 0) {
+            ggml_critical_section_end();
+
+            GGML_LOG_ERROR("%s: incompatible pipeline %s\n", __func__, name);
+
+            return nil;
+        }
+
+        ggml_metal_pipelines_add(lib->pipelines, name, res);
     }

     ggml_critical_section_end();
```
```diff
@@ -469,6 +548,126 @@ ggml_metal_device_t ggml_metal_device_init(void) {
     dev->props.has_bfloat  = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
     dev->props.has_bfloat |= [dev->mtl_device supportsFamily:MTLGPUFamilyApple6];
     if (getenv("GGML_METAL_BF16_DISABLE") != NULL) {
         dev->props.has_bfloat = false;
     }

+    dev->props.has_tensor = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal4_GGML];
+    if (getenv("GGML_METAL_TENSOR_DISABLE") != NULL) {
+        dev->props.has_tensor = false;
+    }
+
+    // note: disable the tensor API by default for old chips because with the current implementation it is not useful
+    //   - M2 Ultra:   ~5% slower
+    //   - M4, M4 Max: no significant difference
+    //
+    // TODO: try to update the tensor API kernels to at least match the simdgroup performance
+    if (getenv("GGML_METAL_TENSOR_ENABLE") == NULL &&
+        ![[dev->mtl_device name] containsString:@"M5"] &&
+        ![[dev->mtl_device name] containsString:@"M6"]) {
+        GGML_LOG_WARN("%s: tensor API disabled for pre-M5 device\n", __func__);
+        dev->props.has_tensor = false;
+    }
+
+    // double-check that the tensor API compiles
+    if (dev->props.has_tensor) {
+        const char * src_tensor_f16 = "\n"
+            "#include <metal_stdlib>\n"
+            "#include <metal_tensor>\n"
+            "#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>\n"
+            "\n"
+            "using namespace metal;\n"
+            "using namespace mpp::tensor_ops;\n"
+            "\n"
+            "kernel void dummy_kernel(\n"
+            "    tensor<device half, dextents<int32_t, 2>> A [[buffer(0)]],\n"
+            "    tensor<device half, dextents<int32_t, 2>> B [[buffer(1)]],\n"
+            "    device float * C [[buffer(2)]],\n"
+            "    uint2 tgid [[threadgroup_position_in_grid]])\n"
+            "{\n"
+            "    auto tA = A.slice(0, (int)tgid.y);\n"
+            "    auto tB = B.slice((int)tgid.x, 0);\n"
+            "\n"
+            "    matmul2d<\n"
+            "        matmul2d_descriptor(8, 8, dynamic_extent),\n"
+            "        execution_simdgroups<4>> mm;\n"
+            "\n"
+            "    auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>();\n"
+            "\n"
+            "    auto sA = tA.slice(0, 0);\n"
+            "    auto sB = tB.slice(0, 0);\n"
+            "    mm.run(sB, sA, cT);\n"
+            "\n"
+            "    auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4));\n"
+            "\n"
+            "    cT.store(tC);\n"
+            "}";
+
+        GGML_LOG_INFO("%s: testing tensor API for f16 support\n", __func__);
+        ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_f16, false);
+        if (lib == NULL) {
+            GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
+            dev->props.has_tensor = false;
+        } else {
+            ggml_metal_pipeline_t ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
+            if (!ppl) {
+                GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
+                dev->props.has_tensor = false;
+            }
+
+            ggml_metal_library_free(lib);
+        }
+    }
+
+    // try to compile a dummy kernel to determine if the tensor API is supported for bfloat
+    if (dev->props.has_tensor && dev->props.has_bfloat) {
+        const char * src_tensor_bf16 = "\n"
+            "#include <metal_stdlib>\n"
+            "#include <metal_tensor>\n"
+            "#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>\n"
+            "\n"
+            "using namespace metal;\n"
+            "using namespace mpp::tensor_ops;\n"
+            "\n"
+            "kernel void dummy_kernel(\n"
+            "    tensor<device bfloat, dextents<int32_t, 2>> A [[buffer(0)]],\n"
+            "    tensor<device bfloat, dextents<int32_t, 2>> B [[buffer(1)]],\n"
+            "    device float * C [[buffer(2)]],\n"
+            "    uint2 tgid [[threadgroup_position_in_grid]])\n"
+            "{\n"
+            "    auto tA = A.slice(0, (int)tgid.y);\n"
+            "    auto tB = B.slice((int)tgid.x, 0);\n"
+            "\n"
+            "    matmul2d<\n"
+            "        matmul2d_descriptor(8, 8, dynamic_extent),\n"
+            "        execution_simdgroups<4>> mm;\n"
+            "\n"
+            "    auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>();\n"
+            "\n"
+            "    auto sA = tA.slice(0, 0);\n"
+            "    auto sB = tB.slice(0, 0);\n"
+            "    mm.run(sB, sA, cT);\n"
+            "\n"
+            "    auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4));\n"
+            "\n"
+            "    cT.store(tC);\n"
+            "}";
+
+        GGML_LOG_INFO("%s: testing tensor API for bfloat support\n", __func__);
+        ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_bf16, false);
+        if (lib == NULL) {
+            GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
+            dev->props.has_bfloat = false;
+        } else {
+            ggml_metal_pipeline_t ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
+            if (!ppl) {
+                GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
+                dev->props.has_bfloat = false;
+            }
+
+            ggml_metal_library_free(lib);
+        }
+    }
+
     dev->props.use_residency_sets = true;
 #if defined(GGML_METAL_HAS_RESIDENCY_SETS)
@@ -476,7 +675,6 @@ ggml_metal_device_t ggml_metal_device_init(void) {
 #endif

     dev->props.use_shared_buffers = dev->props.has_unified_memory;
-
     if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
         dev->props.use_shared_buffers = false;
     }
@@ -529,6 +727,7 @@ ggml_metal_device_t ggml_metal_device_init(void) {
     GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, dev->props.has_simdgroup_mm ? "true" : "false");
     GGML_LOG_INFO("%s: has unified memory    = %s\n", __func__, dev->props.has_unified_memory ? "true" : "false");
     GGML_LOG_INFO("%s: has bfloat            = %s\n", __func__, dev->props.has_bfloat ? "true" : "false");
+    GGML_LOG_INFO("%s: has tensor            = %s\n", __func__, dev->props.has_tensor ? "true" : "false");
     GGML_LOG_INFO("%s: use residency sets    = %s\n", __func__, dev->props.use_residency_sets ? "true" : "false");
     GGML_LOG_INFO("%s: use shared buffers    = %s\n", __func__, dev->props.use_shared_buffers ? "true" : "false");
```
ggml/src/ggml-metal/ggml-metal.metal (file name inferred from kernel names):

```diff
@@ -9,6 +9,12 @@ __embed_ggml-common.h__

 #include <metal_stdlib>

+#ifdef GGML_METAL_HAS_TENSOR
+#include <metal_tensor>
+
+#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>
+#endif
+
 using namespace metal;

 #define MAX(x, y) ((x) > (y) ? (x) : (y))
```
```diff
@@ -1742,7 +1748,7 @@ kernel void kernel_op_sum_f32(

     float sumf = 0;

-    for (int64_t i0 = tpitg.x; i0 < args.np; i0 += ntg.x) {
+    for (uint64_t i0 = tpitg.x; i0 < args.np; i0 += ntg.x) {
         sumf += src0[i0];
     }
```
```diff
@@ -5467,6 +5473,7 @@ template [[host_name("kernel_flash_attn_ext_q8_0_dk576_dv512")]] kernel flash_at

 #undef FA_TYPES
 #undef FA_TYPES_BF
+#undef FA_TYPES_F32

 constant bool FC_flash_attn_ext_vec_has_mask  [[function_constant(FC_FLASH_ATTN_EXT_VEC + 0)]];
 constant bool FC_flash_attn_ext_vec_has_sinks [[function_constant(FC_FLASH_ATTN_EXT_VEC + 1)]];
@@ -6088,6 +6095,7 @@ template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk576_dv512")]] kernel flas
 template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 576, 512, 2>;

 #undef FA_TYPES
+#undef FA_TYPES_F32

 constant int32_t FC_flash_attn_ext_vec_reduce_DV  [[function_constant(FC_FLASH_ATTN_EXT_VEC_REDUCE + 0)]];
 constant int32_t FC_flash_attn_ext_vec_reduce_NWG [[function_constant(FC_FLASH_ATTN_EXT_VEC_REDUCE + 1)]];
```
```diff
@@ -8141,17 +8149,6 @@ kernel void kernel_set_rows_f(
 constant bool FC_mul_mm_bc_inp [[function_constant(FC_MUL_MM + 0)]];
 constant bool FC_mul_mm_bc_out [[function_constant(FC_MUL_MM + 1)]];

-#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
-#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
-#define BLOCK_SIZE_K 32
-#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
-#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
-#define THREAD_PER_BLOCK 128
-#define THREAD_PER_ROW 2 // 2 thread for each row in matrix A to load numbers
-#define THREAD_PER_COL 4 // 4 thread for each row in matrix B to load numbers
-#define SG_MAT_SIZE 64 // simdgroup matrix is of shape 8x8
-#define SG_MAT_ROW 8
-
 // each block_q contains 16*nl weights
 template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
 kernel void kernel_mul_mm(
```
```diff
@@ -8167,18 +8164,48 @@ kernel void kernel_mul_mm(
     threadgroup S0 * sa = (threadgroup S0 *)(shmem);
     threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096);

-    const int r0 = tgpig.y;
-    const int r1 = tgpig.x;
+    threadgroup float * sc = (threadgroup float *)(shmem);
+
+    constexpr int NR0 = 64;
+    constexpr int NR1 = 32;
+
+    constexpr int NK  = 32;
+    constexpr int NL0 = NK/16;
+    constexpr int NL1 = NK/8;
+
     const int im = tgpig.z;
+    const int r0 = tgpig.y*NR0;
+    const int r1 = tgpig.x*NR1;

     // if this block is of 64x32 shape or smaller
-    const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
-    const short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
+    const short nr0 = (args.ne0 - r0 < NR0) ? (args.ne0 - r0) : NR0;
+    const short nr1 = (args.ne1 - r1 < NR1) ? (args.ne1 - r1) : NR1;

     // a thread shouldn't load data outside of the matrix
-    const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
-    const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
+    const short lr0 = ((short)tiitg/NL0) < nr0 ? ((short)tiitg/NL0) : nr0 - 1; // 0 .. 63
+    const short lr1 = ((short)tiitg/NL1) < nr1 ? ((short)tiitg/NL1) : nr1 - 1; // 0 .. 31
+
+    const short il0 = (tiitg % NL0);
+
+    short il = il0;
+
+    const int i12 = im%args.ne12;
+    const int i13 = im/args.ne12;
+
+    const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const short   offset1 = il0/nl;
+
+    device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0 + lr0) + offset0) + offset1;
+
+    const short iy = 8*(tiitg % NL1);
+
+    device const T1 * y = (device const T1 *)(src1
+        + args.nb13*i13
+        + args.nb12*i12
+        + args.nb11*(r1 + lr1)
+        + args.nb10*iy);

+#ifndef GGML_METAL_HAS_TENSOR
     S0_8x8 ma[4];
     S1_8x8 mb[2];
```
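The rewritten kernel addresses the staging buffers as a grid of 8x8 simdgroup tiles (64 elements each). A hedged sketch of the index arithmetic used by the stores in the hunks below, with tile coordinates `(sx, sy)` and in-tile coordinates `(lx, ly)` as in the kernel:

```cpp
// Illustrative only: linear offset of element (lx, ly) in tile (sx, sy),
// matching the "*(sa + 64*ib + 8*ly + lx)" stores with ib = 8*sx + sy.
inline int tile_offset(int sx, int sy, int lx, int ly) {
    const int ib = 8*sx + sy; // linear tile index (8 tiles along sx per sy step)
    return 64*ib + 8*ly + lx; // 64 elements per 8x8 tile, row-major inside
}
```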
```diff
@@ -8187,36 +8214,36 @@ kernel void kernel_mul_mm(
     for (short i = 0; i < 8; i++){
         mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
     }
+#else
+    auto tA = tensor<threadgroup S0, dextents<int32_t, 2>, tensor_inline>(sa, dextents<int32_t, 2>(NK, NR0));
+    auto tB = tensor<threadgroup S1, dextents<int32_t, 2>, tensor_inline>(sb, dextents<int32_t, 2>(NR1, NK ));

-    short il = (tiitg % THREAD_PER_ROW);
+    mpp::tensor_ops::matmul2d<
+        mpp::tensor_ops::matmul2d_descriptor(NR1, NR0, NK, false, true, false, mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate),
+        execution_simdgroups<4>> mm;

-    const int i12 = im%args.ne12;
-    const int i13 = im/args.ne12;
+    auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>();
+#endif

-    const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
-    const short offset1 = il/nl;
-
-    device const block_q * x = (device const block_q *)(src0
-        + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;
-
-    const short iy = (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL));
-
-    device const T1 * y = (device const T1 *)(src1
-        + args.nb13*i13
-        + args.nb12*i12
-        + args.nb11*(r1*BLOCK_SIZE_N + thread_col)
-        + args.nb10*iy);
-
-    for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
+    for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) {
+#ifndef GGML_METAL_HAS_TENSOR
         // load data and store to threadgroup memory
         if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
             threadgroup_barrier(mem_flags::mem_threadgroup);

             // no need for dequantization
             for (short i = 0; i < 16; i++) {
-                *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
-                + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
-                + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = loop_k + 16*il + i < args.ne00 ? ((device T0 *) x)[i] : 0;
+                const short sx = 2*il0 + i/8;
+                const short sy = (tiitg/NL0)/8;
+
+                //const short lx = i%8;
+                //const short ly = (tiitg/NL0)%8;
+                const short lx = (tiitg/NL0)%8;
+                const short ly = i%8;
+
+                const short ib = 8*sx + sy;
+
+                *(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
             }
         } else {
             S0_4x4 temp_a;
@@ -8225,91 +8252,203 @@ kernel void kernel_mul_mm(
             threadgroup_barrier(mem_flags::mem_threadgroup);

             FOR_UNROLL (short i = 0; i < 16; i++) {
-                *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
-                + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
-                + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4];
+                const short sx = 2*il0 + i/8;
+                const short sy = (tiitg/NL0)/8;
+
+                //const short lx = i%8;
+                //const short ly = (tiitg/NL0)%8;
+                const short lx = (tiitg/NL0)%8;
+                const short ly = i%8;
+
+                const short ib = 8*sx + sy;
+
+                // NOTE: this is massively slower.. WTF?
+                //sa[64*ib + 8*ly + lx] = temp_a[i/4][i%4];
+
+                *(sa + 64*ib + 8*ly + lx) = temp_a[i/4][i%4];
             }
         }

         if (FC_mul_mm_bc_inp) {
             for (short i = 0; i < 8; ++i) {
-                sb[32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL) + i] = loop_k + iy + i < args.ne00 ? (S1) ((device T1 *) y)[i] : 0;
+                const short sx = (tiitg%NL1);
+                const short sy = (tiitg/NL1)/8;
+
+                const short lx = i;
+                const short ly = (tiitg/NL1)%8;
+                //const short lx = (tiitg/NL1)%8;
+                //const short ly = i;
+
+                const short ib = 4*sx + sy;
+
+                *(sb + 64*ib + 8*ly + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
             }
         } else {
-            *(threadgroup S1_2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = (S1_2x4)(*((device T1_2x4 *) y));
+            const short sx = (tiitg%NL1);
+            const short sy = (tiitg/NL1)/8;
+
+            const short dx = sx;
+            const short dy = sy;
+
+            const short ly = (tiitg/NL1)%8;
+
+            const short ib = 4*sx + sy;
+
+            *(threadgroup S1_2x4 *)(sb + 64*ib + 8*ly) = (S1_2x4)(*((device T1_2x4 *) y));
         }
+#else
+        // load data and store to threadgroup memory
+        if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
+            threadgroup_barrier(mem_flags::mem_threadgroup);
+
+            // no need for dequantization
+            for (short i = 0; i < 16; i++) {
+                const short sx = 2*il0 + i/8;
+                const short sy = (tiitg/NL0)/8;
+
+                const short lx = i%8;
+                const short ly = (tiitg/NL0)%8;
+                //const short lx = (tiitg/NL0)%8;
+                //const short ly = i%8;
+
+                *(sa + NK*(8*sy + ly) + 8*sx + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
+            }
+        } else {
+            S0_4x4 temp_a;
+            dequantize_func(x, il, temp_a);
+
+            threadgroup_barrier(mem_flags::mem_threadgroup);
+
+            FOR_UNROLL (short i = 0; i < 16; i++) {
+                const short sx = 2*il0 + i/8;
+                const short sy = (tiitg/NL0)/8;
+
+                const short lx = i%8;
+                const short ly = (tiitg/NL0)%8;
+                //const short lx = (tiitg/NL0)%8;
+                //const short ly = i%8;
+
+                *(sa + NK*(8*sy + ly) + 8*sx + lx) = temp_a[i/4][i%4];
+            }
+        }
+
+        if (FC_mul_mm_bc_inp) {
+            for (short i = 0; i < 8; ++i) {
+                const short sx = (tiitg%NL1);
+                const short sy = (tiitg/NL1)/8;
+
+                const short lx = i;
+                const short ly = (tiitg/NL1)%8;
+                //const short lx = (tiitg/NL1)%8;
+                //const short ly = i;
+
+                *(sb + NK*(8*sy + ly) + 8*sx + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
+            }
+        } else {
+            const short sx = (tiitg%NL1);
+            const short sy = (tiitg/NL1)/8;
+
+            //const short lx = i;
+            const short ly = (tiitg/NL1)%8;
+            //const short lx = (tiitg/NL1)%8;
+            //const short ly = i;
+
+            *(threadgroup S1_2x4 *)(sb + NK*(8*sy + ly) + 8*sx) = (S1_2x4)(*((device T1_2x4 *) y));
        }
+#endif

         il = (il + 2 < nl) ? il + 2 : il % 2;
         x = (il < 2) ? x + (2 + nl - 1)/nl : x;
-        y += BLOCK_SIZE_K;
+
+        y += NK;

         threadgroup_barrier(mem_flags::mem_threadgroup);

+#ifndef GGML_METAL_HAS_TENSOR
         // load matrices from threadgroup memory and conduct outer products
-        threadgroup const S0 * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
-        threadgroup const S1 * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
+        threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2));
+        threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2));

-        #pragma unroll(4)
-        for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) {
+        FOR_UNROLL (short ik = 0; ik < NK/8; ik++) {
             simdgroup_barrier(mem_flags::mem_none);

-            #pragma unroll(4)
-            for (short i = 0; i < 4; i++) {
-                simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
-            }
-
-            #pragma unroll(2)
-            for (short i = 0; i < 2; i++) {
-                simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
+            FOR_UNROLL (short i = 0; i < 4; i++) {
+                simdgroup_load(ma[i], lsma + 64*i, 8, 0, false);
             }

             simdgroup_barrier(mem_flags::mem_none);

-            #pragma unroll(8)
-            for (short i = 0; i < 8; i++){
+            FOR_UNROLL (short i = 0; i < 2; i++) {
+                simdgroup_load(mb[i], lsmb + 64*i, 8, 0, false);
+            }
+
+            simdgroup_barrier(mem_flags::mem_none);
+
+            FOR_UNROLL (short i = 0; i < 8; i++){
                 simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
             }

-            lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE;
-            lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE;
+            lsma += 8*64;
+            lsmb += 4*64;
         }
+#else
+        auto sA = tA.slice(0, 0);
+        auto sB = tB.slice(0, 0);
+
+        mm.run(sB, sA, cT);
+#endif
     }

-    if (!FC_mul_mm_bc_out || ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1)) {
+    if (!FC_mul_mm_bc_out || (r0 + NR0 <= args.ne0 && r1 + NR1 <= args.ne1)) {
         // if no bounds checks on the output are needed, we can directly write to device memory
+#ifdef GGML_METAL_HAS_TENSOR
         device float * C = (device float *) dst +
-            (BLOCK_SIZE_M * r0 + 32*(sgitg &  1)) + \
-            (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
+            r0 + \
+            r1 * args.ne0 + im*args.ne1*args.ne0;
+
+        auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(args.ne0, NR1));
+        cT.store(tC);
+#else
+        device float * C = (device float *) dst +
+            (r0 + 32*(sgitg &  1)) + \
+            (r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;

         for (short i = 0; i < 8; i++) {
-            simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0);
+            simdgroup_store(mc[i], C + 8*(i%4) + 8*args.ne0*(i/4), args.ne0, 0, false);
         }
+#endif
     } else {
         // block is smaller than 64x32, we should avoid writing data outside of the matrix
         threadgroup_barrier(mem_flags::mem_threadgroup);
-        threadgroup float * temp_str = ((threadgroup float *) shmem) \
-                                     + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M;
+
+        threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0;
+
+#ifdef GGML_METAL_HAS_TENSOR
+        auto tC = tensor<threadgroup float, dextents<int32_t, 2>, tensor_inline>(sc, dextents<int32_t, 2>(NR0, NR1));
+        cT.store(tC);
+#else
         for (short i = 0; i < 8; i++) {
-            simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
+            simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*NR0*(i/4), NR0, 0, false);
         }
+#endif

         threadgroup_barrier(mem_flags::mem_threadgroup);

         if (sgitg == 0) {
-            for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
-                device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*args.ne0 + im*args.ne1*args.ne0;
+            for (int j = tiitg; j < nr1; j += NR1) {
+                device float * D = (device float *) dst + r0 + (r1 + j)*args.ne0 + im*args.ne1*args.ne0;
                 device float4 * D4 = (device float4 *) D;

-                threadgroup float  * C  = temp_str + (j*BLOCK_SIZE_M);
+                threadgroup float  * C  = temp_str + (j*NR0);
                 threadgroup float4 * C4 = (threadgroup float4 *) C;

                 int i = 0;
-                for (; i < n_rows/4; i++) {
+                for (; i < nr0/4; i++) {
                     *(D4 + i) = *(C4 + i);
                 }

                 i *= 4;
-                for (; i < n_rows; i++) {
+                for (; i < nr0; i++) {
                     *(D + i) = *(C + i);
                 }
             }
         }
```
@@ -8394,31 +8533,63 @@ kernel void kernel_mul_mm_id(
        ushort tiitg[[thread_index_in_threadgroup]],
        ushort tiisg[[thread_index_in_simdgroup]],
        ushort sgitg[[simdgroup_index_in_threadgroup]]) {

    threadgroup S0 * sa = (threadgroup S0 *)(shmem);
    threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096);

    const int r0 = tgpig.y;
    const int r1 = tgpig.x;
    threadgroup float * sc = (threadgroup float *)(shmem);

    constexpr int NR0 = 64;
    constexpr int NR1 = 32;

    constexpr int NK = 32;
    constexpr int NL0 = NK/16;
    constexpr int NL1 = NK/8;

    const int im = tgpig.z; // expert
    const int r0 = tgpig.y*NR0;
    const int r1 = tgpig.x*NR1;

    device const uint32_t * tpe_u32 = (device const uint32_t *) (htpe);
    device const int32_t * ids_i32 = (device const int32_t *) (hids);

    const int32_t neh1 = tpe_u32[im];

    if (r1*BLOCK_SIZE_N >= neh1) {
    if (r1 >= neh1) {
        return;
    }

    // if this block is of 64x32 shape or smaller
    const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
    const short n_cols = ( neh1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? ( neh1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
    const short nr0 = (args.ne0 - r0 < NR0) ? (args.ne0 - r0) : NR0;
    const short nr1 = ( neh1 - r1 < NR1) ? ( neh1 - r1) : NR1;

    // a thread shouldn't load data outside of the matrix
    const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
    const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
    const short lr0 = ((short)tiitg/NL0) < nr0 ? ((short)tiitg/NL0) : nr0 - 1; // 0 .. 63
    const short lr1 = ((short)tiitg/NL1) < nr1 ? ((short)tiitg/NL1) : nr1 - 1; // 0 .. 31

    const short il0 = (tiitg % NL0);

    short il = il0;

    const int id = ids_i32[im*args.ne21 + r1 + lr1];

    const short i11 = (id % args.ne20) % args.ne11;
    const short i12 = (id / args.ne20);
    const short i13 = 0;
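    // note: this is the MoE indirection - ids_i32 maps output row r1 + lr1 of
    // expert im to its source activation: i11 selects the src1 row and i12 the
    // batch slice it is gathered from.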

    const uint64_t offset0 = im*args.nb02 + i13*args.nb03;
    const short offset1 = il0/nl;

    device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0 + lr0) + offset0) + offset1;

    const short iy = 8*(tiitg % NL1);

    device const T1 * y = (device const T1 *)(src1
        + args.nb13*i13
        + args.nb12*i12
        + args.nb11*i11
        + args.nb10*iy);

#ifndef GGML_METAL_HAS_TENSOR
    S0_8x8 ma[4];
    S1_8x8 mb[2];

@@ -8427,39 +8598,36 @@ kernel void kernel_mul_mm_id(
    for (short i = 0; i < 8; i++){
        mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
    }
#else
    auto tA = tensor<threadgroup S0, dextents<int32_t, 2>, tensor_inline>(sa, dextents<int32_t, 2>(NK, NR0));
    auto tB = tensor<threadgroup S1, dextents<int32_t, 2>, tensor_inline>(sb, dextents<int32_t, 2>(NR1, NK ));

    short il = (tiitg % THREAD_PER_ROW);
    mpp::tensor_ops::matmul2d<
        mpp::tensor_ops::matmul2d_descriptor(NR1, NR0, NK, false, true, false, mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate),
        execution_simdgroups<4>> mm;

    const int id = ids_i32[im*args.ne21 + r1*BLOCK_SIZE_N + thread_col];
    auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>();
#endif

    const short i11 = (id % args.ne20) % args.ne11;
    const short i12 = (id / args.ne20);
    const short i13 = 0;

    const uint64_t offset0 = im*args.nb02 + i13*args.nb03;
    const short offset1 = il/nl;

    device const block_q * x = (device const block_q *)(src0
        + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;

    const short iy = (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL));

    device const T1 * y = (device const T1 *)(src1
        + args.nb13*i13
        + args.nb12*i12
        + args.nb11*i11
        + args.nb10*iy);

    for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
    for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) {
#ifndef GGML_METAL_HAS_TENSOR
        // load data and store to threadgroup memory
        if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
            threadgroup_barrier(mem_flags::mem_threadgroup);

            // no need for dequantization
            for (short i = 0; i < 16; i++) {
                *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
                    + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
                    + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = loop_k + 16*il + i < args.ne00 ? ((device T0 *) x)[i] : 0;
                const short sx = 2*il0 + i/8;
                const short sy = (tiitg/NL0)/8;

                //const short lx = i%8;
                //const short ly = (tiitg/NL0)%8;
                const short lx = (tiitg/NL0)%8;
                const short ly = i%8;

                const short ib = 8*sx + sy;

                *(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
            }
        } else {
            S0_4x4 temp_a;
@@ -8468,85 +8636,188 @@ kernel void kernel_mul_mm_id(
            threadgroup_barrier(mem_flags::mem_threadgroup);

            FOR_UNROLL (short i = 0; i < 16; i++) {
                *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
                    + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
                    + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4];
                const short sx = 2*il0 + i/8;
                const short sy = (tiitg/NL0)/8;

                //const short lx = i%8;
                //const short ly = (tiitg/NL0)%8;
                const short lx = (tiitg/NL0)%8;
                const short ly = i%8;

                const short ib = 8*sx + sy;

                // NOTE: this is massively slower.. WTF?
                //sa[64*ib + 8*ly + lx] = temp_a[i/4][i%4];

                *(sa + 64*ib + 8*ly + lx) = temp_a[i/4][i%4];
            }
        }

        if (FC_mul_mm_bc_inp) {
            for (short i = 0; i < 8; ++i) {
                sb[32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL) + i] = loop_k + iy + i < args.ne00 ? (S1) ((device T1 *) y)[i] : 0;
                const short sx = (tiitg%NL1);
                const short sy = (tiitg/NL1)/8;

                const short lx = i;
                const short ly = (tiitg/NL1)%8;
                //const short lx = (tiitg/NL1)%8;
                //const short ly = i;

                const short ib = 4*sx + sy;

                *(sb + 64*ib + 8*ly + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
            }
        } else {
            *(threadgroup S1_2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = (S1_2x4)(*((device T1_2x4 *) y));
            const short sx = (tiitg%NL1);
            const short sy = (tiitg/NL1)/8;

            const short dx = sx;
            const short dy = sy;

            const short ly = (tiitg/NL1)%8;

            const short ib = 4*sx + sy;

            *(threadgroup S1_2x4 *)(sb + 64*ib + 8*ly) = (S1_2x4)(*((device T1_2x4 *) y));
        }
#else
        // load data and store to threadgroup memory
        if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
            threadgroup_barrier(mem_flags::mem_threadgroup);

            // no need for dequantization
            for (short i = 0; i < 16; i++) {
                const short sx = 2*il0 + i/8;
                const short sy = (tiitg/NL0)/8;

                const short lx = i%8;
                const short ly = (tiitg/NL0)%8;
                //const short lx = (tiitg/NL0)%8;
                //const short ly = i%8;

                *(sa + NK*(8*sy + ly) + 8*sx + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
            }
        } else {
            S0_4x4 temp_a;
            dequantize_func(x, il, temp_a);

            threadgroup_barrier(mem_flags::mem_threadgroup);

            FOR_UNROLL (short i = 0; i < 16; i++) {
                const short sx = 2*il0 + i/8;
                const short sy = (tiitg/NL0)/8;

                const short lx = i%8;
                const short ly = (tiitg/NL0)%8;
                //const short lx = (tiitg/NL0)%8;
                //const short ly = i%8;

                *(sa + NK*(8*sy + ly) + 8*sx + lx) = temp_a[i/4][i%4];
            }
        }

        if (FC_mul_mm_bc_inp) {
            for (short i = 0; i < 8; ++i) {
                const short sx = (tiitg%NL1);
                const short sy = (tiitg/NL1)/8;

                const short lx = i;
                const short ly = (tiitg/NL1)%8;
                //const short lx = (tiitg/NL1)%8;
                //const short ly = i;

                *(sb + NK*(8*sy + ly) + 8*sx + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
            }
        } else {
            const short sx = (tiitg%NL1);
            const short sy = (tiitg/NL1)/8;

            //const short lx = i;
            const short ly = (tiitg/NL1)%8;
            //const short lx = (tiitg/NL1)%8;
            //const short ly = i;

            *(threadgroup S1_2x4 *)(sb + NK*(8*sy + ly) + 8*sx) = (S1_2x4)(*((device T1_2x4 *) y));
        }
#endif

        il = (il + 2 < nl) ? il + 2 : il % 2;
        x = (il < 2) ? x + (2 + nl - 1)/nl : x;
        y += BLOCK_SIZE_K;

        y += NK;

        threadgroup_barrier(mem_flags::mem_threadgroup);

#ifndef GGML_METAL_HAS_TENSOR
        // load matrices from threadgroup memory and conduct outer products
        threadgroup const S0 * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
        threadgroup const S1 * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
        threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2));
        threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2));

#pragma unroll(4)
        for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) {
#pragma unroll(4)
            for (short i = 0; i < 4; i++) {
                simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
        FOR_UNROLL (short ik = 0; ik < NK/8; ik++) {
            simdgroup_barrier(mem_flags::mem_none);

            FOR_UNROLL (short i = 0; i < 4; i++) {
                simdgroup_load(ma[i], lsma + 64*i, 8, 0, false);
            }

            simdgroup_barrier(mem_flags::mem_none);

#pragma unroll(2)
            for (short i = 0; i < 2; i++) {
                simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
            FOR_UNROLL (short i = 0; i < 2; i++) {
                simdgroup_load(mb[i], lsmb + 64*i, 8, 0, false);
            }

#pragma unroll(8)
            for (short i = 0; i < 8; i++){
            simdgroup_barrier(mem_flags::mem_none);

            FOR_UNROLL (short i = 0; i < 8; i++){
                simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
            }

            lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE;
            lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE;
            lsma += 8*64;
            lsmb += 4*64;
        }
#else
        auto sA = tA.slice(0, 0);
        auto sB = tB.slice(0, 0);

        mm.run(sB, sA, cT);
#endif
    }

    // block is smaller than 64x32, we should avoid writing data outside of the matrix
    threadgroup_barrier(mem_flags::mem_threadgroup);

    threadgroup float * temp_str = ((threadgroup float *) shmem) \
        + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M;
#ifdef GGML_METAL_HAS_TENSOR
    auto tC = tensor<threadgroup float, dextents<int32_t, 2>, tensor_inline>(sc, dextents<int32_t, 2>(NR0, NR1));
    cT.store(tC);
#else
    threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0;

#pragma unroll(8)
    for (short i = 0; i < 8; i++) {
        simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
        simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*NR0*(i/4), NR0, 0, false);
    }
#endif

    threadgroup_barrier(mem_flags::mem_threadgroup);

    for (short j = sgitg; j < n_cols; j += 4) {
        const int id = ids_i32[im*args.ne21 + r1*BLOCK_SIZE_N + j];
    for (short j = sgitg; j < nr1; j += 4) {
        const int id = ids_i32[im*args.ne21 + r1 + j];

        const short ide = id % args.ne20;
        const short idt = id / args.ne20;

        device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + ide*args.ne0 + idt*args.ne1*args.ne0;
        device float * D = (device float *) dst + r0 + ide*args.ne0 + idt*args.ne1*args.ne0;
        device float4 * D4 = (device float4 *) D;

        threadgroup float * C = (threadgroup float *) shmem + (j*BLOCK_SIZE_M);
        threadgroup float * C = (threadgroup float *) shmem + j*NR0;
        threadgroup float4 * C4 = (threadgroup float4 *) C;

        int i = tiisg;
        for (; i < n_rows/4; i += 32) {
        for (; i < nr0/4; i += 32) {
            *(D4 + i) = *(C4 + i);
        }

        i = (4*(n_rows/4)) + tiisg;
        for (; i < n_rows; i += 32) {
        i = (4*(nr0/4)) + tiisg;
        for (; i < nr0; i += 32) {
            *(D + i) = *(C + i);
        }
    }

@@ -11,9 +11,13 @@
//

#include "concat.hpp"
#include "common.hpp"

static void concat_f32_dim0(const float *x, const float *y, float *dst,
static inline size_t elem_size(ggml_type t) {
    return ggml_type_size(t) / ggml_blck_size(t);
}
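// elem_size() is the per-element byte size (type size divided by block size);
// for the plain types handled here (F32, I32) the block size is 1, so it is
// simply sizeof(T). It replaces the float-only "/ 4" offset arithmetic below.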

template <typename T>
static void concat_T_dim0(const T *x, const T *y, T *dst,
                          const int ne0, const int ne00,
                          const sycl::nd_item<3> &item_ct1) {
    int nidx = item_ct1.get_local_id(2) +
@@ -36,7 +40,8 @@ static void concat_f32_dim0(const float *x, const float *y, float *dst,
    }
}

static void concat_f32_dim1(const float *x, const float *y, float *dst,
template <typename T>
static void concat_T_dim1(const T *x, const T *y, T *dst,
                          const int ne0, const int ne01,
                          const sycl::nd_item<3> &item_ct1) {
    int nidx = item_ct1.get_local_id(2) +
@@ -59,7 +64,8 @@ static void concat_f32_dim1(const float *x, const float *y, float *dst,
    }
}

static void concat_f32_dim2(const float *x, const float *y, float *dst,
template <typename T>
static void concat_T_dim2(const T *x, const T *y, T *dst,
                          const int ne0, const int ne02,
                          const sycl::nd_item<3> &item_ct1) {
    int nidx = item_ct1.get_local_id(2) +
@@ -82,45 +88,35 @@ static void concat_f32_dim2(const float *x, const float *y, float *dst,
    }
}

static void concat_f32_sycl(const float *x, const float *y, float *dst,
template <typename T>
static void concat_T_sycl(const T *x, const T *y, T *dst,
                          int ne00, int ne01, int ne02, int ne0, int ne1,
                          int ne2, int dim, queue_ptr stream) {
    int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
    sycl::range<3> gridDim(ne2, ne1, num_blocks);
    switch (dim) {
    case 0:
        stream->parallel_for(
            sycl::nd_range<3>(gridDim *
                                  sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
            [=](sycl::nd_item<3> item_ct1) {
                concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1);
            });
        break;
        stream->parallel_for(sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
                                               sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
                             [=](sycl::nd_item<3> item_ct1) { concat_T_dim0<T>(x, y, dst, ne0, ne00, item_ct1); });
        break;
    case 1:
        stream->parallel_for(
            sycl::nd_range<3>(gridDim *
                                  sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
            [=](sycl::nd_item<3> item_ct1) {
                concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
            });
        break;
        stream->parallel_for(sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
                                               sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
                             [=](sycl::nd_item<3> item_ct1) { concat_T_dim1<T>(x, y, dst, ne0, ne01, item_ct1); });
        break;
    // dim >=2 will be dispatched to the default path
    default:
        stream->parallel_for(
            sycl::nd_range<3>(gridDim *
                                  sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
            [=](sycl::nd_item<3> item_ct1) {
                concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1);
            });
        break;
        stream->parallel_for(sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
                                               sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
                             [=](sycl::nd_item<3> item_ct1) { concat_T_dim2<T>(x, y, dst, ne0, ne02, item_ct1); });
        break;
    }
}

// non-contiguous kernel (slow)
static void concat_f32_sycl_non_cont(
template<typename T>
static void concat_T_sycl_non_cont(
    queue_ptr stream, const char *src0, const char *src1, char *dst,
    int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, uint64_t nb00,
    uint64_t nb01, uint64_t nb02, uint64_t nb03, int64_t /*ne10*/,
@@ -137,24 +133,25 @@ static void concat_f32_sycl_non_cont(
        int64_t o[4] = { 0, 0, 0, 0 };
        o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));

        const float * x;
        const T * x;

        for (int i0 = item_ct1.get_local_id(2); i0 < ne0; i0 += item_ct1.get_local_range(2)) {
            if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
                x = (const float *) (src0 + (i3) *nb03 + (i2) *nb02 + (i1) *nb01 + (i0) *nb00);
                x = (const T *) (src0 + (i3) *nb03 + (i2) *nb02 + (i1) *nb01 + (i0) *nb00);
            } else {
                x = (const float *) (src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + (i1 - o[1]) * nb11 +
                x = (const T *) (src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + (i1 - o[1]) * nb11 +
                    (i0 - o[0]) * nb10);
            }

            float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);
            T *y = (T *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);

            *y = *x;
        }
    });
}

void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
template <typename T>
void concat_impl_sycl(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
@@ -163,15 +160,14 @@ void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
    const int32_t dim = ((int32_t *) dst->op_params)[0];

    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
        const float * src0_d = (const float *) src0->data;
        const float * src1_d = (const float *) src1->data;

        float * dst_d = (float *) dst->data;

        const T * src0_d = (const T *) src0->data;
        const T * src1_d = (const T *) src1->data;
        T * dst_d = (T *) dst->data;
        size_t type_size = elem_size(dst->type);
        if (dim != 3) {
            for (int i3 = 0; i3 < dst->ne[3]; i3++) {
                concat_f32_sycl(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
                                dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0],
                concat_T_sycl<T>(src0_d + i3 * (src0->nb[3] / type_size), src1_d + i3 * (src1->nb[3] / type_size),
                                 dst_d + i3 * (dst->nb[3] / type_size), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0],
                                 dst->ne[1], dst->ne[2], dim, stream);
            }
        } else {
@@ -179,13 +175,28 @@ void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
            const size_t size1 = ggml_nbytes(src1);

            SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
            SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
            SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / type_size, src1_d, size1).wait()));
        }
    } else {
        concat_f32_sycl_non_cont(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
        concat_T_sycl_non_cont<T>(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
            src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1],
            src0->nb[2], src0->nb[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
            src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
            dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
    }
}

void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {

    switch (dst->type) {
        case GGML_TYPE_F32:
            concat_impl_sycl<float>(ctx, dst);
            break;
        case GGML_TYPE_I32:
            concat_impl_sycl<int32_t>(ctx, dst);
            break;
        default:
            GGML_ASSERT(false && "ggml_sycl_op_concat: unsupported type");
            break;
    }
}
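// A sketch of how the dispatch above would grow if another plain element type
// were needed; the F16 case is hypothetical and only illustrates the pattern:
//
//     case GGML_TYPE_F16:
//         concat_impl_sycl<sycl::half>(ctx, dst);
//         break;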
@@ -4534,16 +4534,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
            }
            return false;
        }
        case GGML_OP_CONCAT:
            {
                ggml_type src0_type = op->src[0]->type;
                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
            }
        case GGML_OP_REPEAT_BACK:
            {
                ggml_type src0_type = op->src[0]->type;
                return src0_type == GGML_TYPE_F32;
            }
        case GGML_OP_CONCAT:
        case GGML_OP_DUP:
        case GGML_OP_ARGMAX:
        case GGML_OP_NONE:
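// note: GGML_OP_CONCAT moves out of its type-restricted case and into the
// shared group above, matching the typed SYCL implementation (F32/I32).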
@@ -130,9 +130,9 @@ struct vk_pipeline_struct {
    // true if fields have been set by ggml_vk_create_pipeline
    bool initialized {};
    // set to true to request the pipeline is compiled
    bool needed {};
    std::atomic<bool> needed {};
    // set to true when the shader has been compiled
    bool compiled {};
    std::atomic<bool> compiled {};
    // number of registers used, extracted from pipeline executable properties
    uint32_t register_count {};
};
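// note: needed/compiled become std::atomic<bool> because they are now read and
// written concurrently by the shader-compile worker threads; the heavier shared
// state stays under device->mutex (see the following hunks).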
@@ -1842,10 +1842,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
        }
    }

    {
        std::lock_guard<std::recursive_mutex> guard(device->mutex);
        device->all_pipelines.push_back(pipeline);
    }
    device->all_pipelines.push_back(pipeline);

    {
        std::lock_guard<std::mutex> guard(compile_count_mutex);
@@ -2536,6 +2533,7 @@ static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_dev
static void ggml_vk_load_shaders(vk_device& device) {
    VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");

    std::lock_guard<std::recursive_mutex> guard(device->mutex);
    // some shaders have a minimum subgroup size
    const uint32_t subgroup_size_8 = std::max(device->subgroup_size, 8u);
    const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
@@ -2729,6 +2727,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
        if (!pipeline->needed || pipeline->compiled) {
            return;
        }
        // TODO: We're no longer benefitting from the async compiles (shaders are
        // compiled individually, as needed) and this complexity can be removed.
        {
            // wait until fewer than N compiles are in progress
            uint32_t N = std::max(1u, std::thread::hardware_concurrency());
@@ -5387,7 +5387,7 @@ static void ggml_vk_host_free(vk_device& device, void* ptr) {
    device->pinned_memory.erase(device->pinned_memory.begin() + index);
}

static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset) {
static void ggml_vk_host_get(const vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset) {
    std::lock_guard<std::recursive_mutex> guard(device->mutex);
    buf = nullptr;
    buf_offset = 0;
@@ -5402,6 +5402,32 @@ static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf
    }
}

static vk_subbuffer ggml_vk_tensor_subbuffer(
        const ggml_backend_vk_context * ctx, const ggml_tensor * tensor, bool allow_misalign = false) {

    vk_buffer buffer = nullptr;
    size_t offset = 0;
    if (ctx->device->uma) {
        ggml_vk_host_get(ctx->device, tensor->data, buffer, offset);
    }
    if (!buffer) {
        auto buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
        buffer = buf_ctx->dev_buffer;
        offset = vk_tensor_offset(tensor) + tensor->view_offs;
    }
    GGML_ASSERT(buffer != nullptr);

    size_t size = ggml_nbytes(tensor);

    size_t misalign_bytes = offset & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
    // The shader must support misaligned offsets when indexing into the buffer
    GGML_ASSERT(allow_misalign || misalign_bytes == 0);
    offset &= ~misalign_bytes;
    size += misalign_bytes;

    return vk_subbuffer{buffer, offset, size};
}
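// ggml_vk_tensor_subbuffer centralizes the lookup that callers used to inline:
// try the UMA host-pointer path, fall back to the tensor's device buffer, then
// fold any storage-buffer misalignment into the returned (offset, size) pair.
// The hunks below replace the per-tensor d_X/x_buf_offset boilerplate with it.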

static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) {
    vk_submission s;
    s.buffer = ggml_vk_create_cmd_buffer(device, p);
@@ -7888,12 +7914,15 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx

    vk_pipeline pipeline = nullptr;

    auto &pipelines = ctx->device->pipeline_flash_attn_f32_f16[k->type];
    auto it = pipelines.find(fa_pipeline_state);
    if (it != pipelines.end()) {
        pipeline = it->second;
    } else {
        pipelines[fa_pipeline_state] = pipeline = std::make_shared<vk_pipeline_struct>();
    {
        std::lock_guard<std::recursive_mutex> guard(ctx->device->mutex);
        auto &pipelines = ctx->device->pipeline_flash_attn_f32_f16[k->type];
        auto it = pipelines.find(fa_pipeline_state);
        if (it != pipelines.end()) {
            pipeline = it->second;
        } else {
            pipelines[fa_pipeline_state] = pipeline = std::make_shared<vk_pipeline_struct>();
        }
    }

    assert(pipeline);
@@ -7953,72 +7982,12 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr, d_S = nullptr;
    size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0, s_buf_offset = 0;

    bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false, S_uma = false;

    if (ctx->device->uma) {
        ggml_vk_host_get(ctx->device, q->data, d_Q, q_buf_offset);
        ggml_vk_host_get(ctx->device, k->data, d_K, k_buf_offset);
        ggml_vk_host_get(ctx->device, v->data, d_V, v_buf_offset);
        ggml_vk_host_get(ctx->device, dst->data, d_D, d_buf_offset);
        Q_uma = d_Q != nullptr;
        K_uma = d_K != nullptr;
        V_uma = d_V != nullptr;
        D_uma = d_D != nullptr;
        if (mask) {
            ggml_vk_host_get(ctx->device, mask->data, d_M, m_buf_offset);
            M_uma = d_M != nullptr;
        }
        if (sinks) {
            ggml_vk_host_get(ctx->device, sinks->data, d_S, s_buf_offset);
            S_uma = d_S != nullptr;
        }
    }


    ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
    ggml_backend_vk_buffer_context * q_buf_ctx = (ggml_backend_vk_buffer_context *)q->buffer->context;
    ggml_backend_vk_buffer_context * k_buf_ctx = (ggml_backend_vk_buffer_context *)k->buffer->context;
    ggml_backend_vk_buffer_context * v_buf_ctx = (ggml_backend_vk_buffer_context *)v->buffer->context;

    if (!Q_uma) {
        d_Q = q_buf_ctx->dev_buffer;
        q_buf_offset = vk_tensor_offset(q) + q->view_offs;
    }
    if (!K_uma) {
        d_K = k_buf_ctx->dev_buffer;
        k_buf_offset = vk_tensor_offset(k) + k->view_offs;
    }
    if (!V_uma) {
        d_V = v_buf_ctx->dev_buffer;
        v_buf_offset = vk_tensor_offset(v) + v->view_offs;
    }
    if (!D_uma) {
        d_D = d_buf_ctx->dev_buffer;
        d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
    }

    if (!M_uma) {
        d_M = d_Q;
        m_buf_offset = q_buf_offset;
        if (mask) {
            ggml_backend_vk_buffer_context * m_buf_ctx = (ggml_backend_vk_buffer_context*)mask->buffer->context;
            d_M = m_buf_ctx->dev_buffer;
            m_buf_offset = vk_tensor_offset(mask) + mask->view_offs;
        }
    }

    if (!S_uma) {
        d_S = d_Q;
        s_buf_offset = q_buf_offset;
        if (sinks) {
            ggml_backend_vk_buffer_context * s_buf_ctx = (ggml_backend_vk_buffer_context*)sinks->buffer->context;
            d_S = s_buf_ctx->dev_buffer;
            s_buf_offset = vk_tensor_offset(sinks) + sinks->view_offs;
        }
    }
    vk_subbuffer q_buf = ggml_vk_tensor_subbuffer(ctx, q);
    vk_subbuffer k_buf = ggml_vk_tensor_subbuffer(ctx, k);
    vk_subbuffer v_buf = ggml_vk_tensor_subbuffer(ctx, v);
    vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst);
    vk_subbuffer mask_buf = mask ? ggml_vk_tensor_subbuffer(ctx, mask) : q_buf;
    vk_subbuffer sinks_buf = sinks ? ggml_vk_tensor_subbuffer(ctx, sinks) : q_buf;
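    // mask_buf/sinks_buf fall back to q_buf when the optional tensors are absent:
    // a valid buffer must still be bound at those descriptor slots, and the shader
    // learns from mask_n_head_log2 below whether to actually read them.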

    uint32_t mask_n_head_log2 = ((sinks != nullptr) << 24) | ((mask != nullptr) << 16) | n_head_log2;

@@ -8040,15 +8009,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
            ggml_vk_sync_buffers(ctx, subctx);
        }

        vk_subbuffer split_k_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0);
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
            {
                ggml_vk_subbuffer(ctx, d_Q, q_buf_offset),
                ggml_vk_subbuffer(ctx, d_K, k_buf_offset),
                ggml_vk_subbuffer(ctx, d_V, v_buf_offset),
                ggml_vk_subbuffer(ctx, d_M, m_buf_offset),
                ggml_vk_subbuffer(ctx, d_S, s_buf_offset),
                ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0),
            },
            {q_buf, k_buf, v_buf, mask_buf, sinks_buf, split_k_buf},
            // We only use split_k when group query attention is enabled, which means
            // there's no more than one tile of rows (i.e. workgroups_x would have been
            // one). We reuse workgroups_x to mean the number of splits, so we need to
@@ -8058,23 +8021,12 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
        ggml_vk_sync_buffers(ctx, subctx);
        const std::array<uint32_t, 5> pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne3, split_k, (sinks != nullptr) };
        ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
            {
                ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0),
                ggml_vk_subbuffer(ctx, d_S, s_buf_offset),
                ggml_vk_subbuffer(ctx, d_D, d_buf_offset),
            },
            {split_k_buf, sinks_buf, dst_buf},
            pc2, { (uint32_t)ne1, HSV, (uint32_t)ne3 });
        ctx->prealloc_split_k_need_sync = true;
    } else {
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
            {
                ggml_vk_subbuffer(ctx, d_Q, q_buf_offset),
                ggml_vk_subbuffer(ctx, d_K, k_buf_offset),
                ggml_vk_subbuffer(ctx, d_V, v_buf_offset),
                ggml_vk_subbuffer(ctx, d_M, m_buf_offset),
                ggml_vk_subbuffer(ctx, d_S, s_buf_offset),
                ggml_vk_subbuffer(ctx, d_D, d_buf_offset),
            },
            {q_buf, k_buf, v_buf, mask_buf, sinks_buf, dst_buf},
            pc, { workgroups_x, workgroups_y, workgroups_z });
    }
}
@@ -8757,35 +8709,15 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
    const uint64_t ne01 = src0->ne[1];
    const uint64_t ne02 = src0->ne[2];
    const uint64_t ne03 = src0->ne[3];
    const uint64_t ne0 = ne00 * ne01;

    const bool use_src1 = src1 != nullptr;
    const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
    const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
    const uint64_t ne12 = use_src1 ? src1->ne[2] : 0;
    const uint64_t ne13 = use_src1 ? src1->ne[3] : 0;
    const uint64_t ne1 = ne10 * ne11;
    // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;

    const bool use_src2 = src2 != nullptr;
    const uint64_t ne20 = use_src2 ? src2->ne[0] : 0;
    const uint64_t ne21 = use_src2 ? src2->ne[1] : 0;
    const uint64_t ne22 = use_src2 ? src2->ne[2] : 0;
    const uint64_t ne23 = use_src2 ? src2->ne[3] : 0;
    const uint64_t ne2 = ne20 * ne21;

    const bool use_src3 = src3 != nullptr;
    const uint64_t ne30 = use_src3 ? src3->ne[0] : 0;
    const uint64_t ne31 = use_src3 ? src3->ne[1] : 0;
    const uint64_t ne32 = use_src3 ? src3->ne[2] : 0;
    const uint64_t ne33 = use_src3 ? src3->ne[3] : 0;
    const uint64_t ne3 = ne30 * ne31;

    const uint64_t ned0 = dst->ne[0];
    const uint64_t ned1 = dst->ne[1];
    const uint64_t ned2 = dst->ne[2];
    const uint64_t ned3 = dst->ne[3];
    const uint64_t ned = ned0 * ned1;

    init_pushconst_fastdiv(pc);
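// note: the ne0/ne1/ne2/ne3/ned products above are deleted because buffer
// ranges are no longer computed in this function - ggml_vk_tensor_subbuffer
// now derives each (buffer, offset, size) triple directly from the tensor.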
@@ -8804,74 +8736,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co

    const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op);

    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
    ggml_backend_vk_buffer_context * src1_buf_ctx = use_src1 ? (ggml_backend_vk_buffer_context *)src1->buffer->context : nullptr;
    ggml_backend_vk_buffer_context * src2_buf_ctx = use_src2 ? (ggml_backend_vk_buffer_context *)src2->buffer->context : nullptr;
    ggml_backend_vk_buffer_context * src3_buf_ctx = use_src3 ? (ggml_backend_vk_buffer_context *)src3->buffer->context : nullptr;
    vk_subbuffer src0_buf = ggml_vk_tensor_subbuffer(ctx, src0, op_supports_incontiguous);
    vk_subbuffer src1_buf = use_src1 ? ggml_vk_tensor_subbuffer(ctx, src1, op_supports_incontiguous) : vk_subbuffer{};
    vk_subbuffer src2_buf = use_src2 ? ggml_vk_tensor_subbuffer(ctx, src2, op_supports_incontiguous) : vk_subbuffer{};
    vk_subbuffer src3_buf = use_src3 ? ggml_vk_tensor_subbuffer(ctx, src3, op_supports_incontiguous) : vk_subbuffer{};
    vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, op_supports_incontiguous);

    vk_buffer d_X = nullptr;
    size_t x_buf_offset = 0;
    vk_buffer d_Y = nullptr;
    size_t y_buf_offset = 0;
    vk_buffer d_Z = nullptr;
    size_t z_buf_offset = 0;
    vk_buffer d_W = nullptr;
    size_t w_buf_offset = 0;

    bool src0_uma = false;
    bool src1_uma = false;
    bool src2_uma = false;
    bool src3_uma = false;

    if (ctx->device->uma) {
        ggml_vk_host_get(ctx->device, src0->data, d_X, x_buf_offset);
        src0_uma = d_X != nullptr;
        if (use_src1) {
            ggml_vk_host_get(ctx->device, src1->data, d_Y, y_buf_offset);
            src1_uma = d_Y != nullptr;
        }
        if (use_src2) {
            ggml_vk_host_get(ctx->device, src2->data, d_Z, z_buf_offset);
            src2_uma = d_Z != nullptr;
        }
        if (use_src3) {
            ggml_vk_host_get(ctx->device, src3->data, d_W, w_buf_offset);
            src3_uma = d_W != nullptr;
        }
    }

    vk_buffer d_D = dst_buf_ctx->dev_buffer;

    GGML_ASSERT(d_D != nullptr);
    uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
    if(!src0_uma) {
        d_X = src0_buf_ctx->dev_buffer;
        x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
        GGML_ASSERT(d_X != nullptr);
    }
    if (use_src1 && !src1_uma) {
        d_Y = src1_buf_ctx->dev_buffer;
        y_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
        GGML_ASSERT(d_Y != nullptr);
    }
    if (use_src2 && !src2_uma) {
        d_Z = src2_buf_ctx->dev_buffer;
        z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
        GGML_ASSERT(d_Z != nullptr);
    }
    if (use_src3 && !src3_uma) {
        d_W = src3_buf_ctx->dev_buffer;
        w_buf_offset = vk_tensor_offset(src3) + src3->view_offs;
        GGML_ASSERT(d_W != nullptr);
    }
    // Compute misalignment offset for descriptors and store it in push constants, then align the descriptor offsets.
    // Compute misalignment offset for descriptors and store it in push constants.
    init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, src3, dst);
    x_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
    y_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
    z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
    w_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
    d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);

    std::array<uint32_t, 3> elements;

@@ -8955,9 +8827,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
        const uint32_t KH = ne01;
        const uint32_t KW = ne00;

        const uint32_t OD = ned3 / N;
        const uint32_t OH = ned2;
        const uint32_t OW = ned1;
        const uint32_t OD = dst->ne[3] / N;
        const uint32_t OH = dst->ne[2];
        const uint32_t OW = dst->ne[1];

        const uint32_t IC_KD_KH_KW = IC*KD*KH*KW;
        const uint32_t N_OD_OH = N*OD*OH;
@@ -9072,112 +8944,50 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
        break;
    }

    uint64_t x_sz, y_sz, z_sz, w_sz, d_sz;

    if (op_supports_incontiguous) {
        x_sz = ggml_nbytes(src0) + get_misalign_bytes(ctx, src0);
        y_sz = use_src1 ? ggml_nbytes(src1) + get_misalign_bytes(ctx, src1) : 0;
        z_sz = use_src2 ? ggml_nbytes(src2) + get_misalign_bytes(ctx, src2) : 0;
        w_sz = use_src3 ? ggml_nbytes(src3) + get_misalign_bytes(ctx, src3) : 0;
        d_sz = ggml_nbytes(dst) + get_misalign_bytes(ctx, dst);

        if (x_buf_offset + x_sz >= d_X->size) {
            x_sz = ggml_vk_get_max_buffer_range(ctx, d_X, x_buf_offset);
        }
        if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
            y_sz = ggml_vk_get_max_buffer_range(ctx, d_Y, y_buf_offset);
        }
        if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
            z_sz = ggml_vk_get_max_buffer_range(ctx, d_Z, z_buf_offset);
        }
        if (use_src3 && w_buf_offset + w_sz >= d_W->size) {
            w_sz = ggml_vk_get_max_buffer_range(ctx, d_W, w_buf_offset);
        }
        if (d_buf_offset + d_sz >= d_D->size) {
            d_sz = ggml_vk_get_max_buffer_range(ctx, d_D, d_buf_offset);
        }
    } else {
        x_sz = ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0 * ne02 * ne03;
        y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 * ne12 * ne13 : 0;
        z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 * ne22 * ne23 : 0;
        w_sz = use_src3 ? ggml_type_size(src3->type) * ne3 * ne32 * ne33 : 0;
        d_sz = ggml_type_size(dst->type) * ned * ned2 * ned3;
    }

    if (op == GGML_OP_ADD || op == GGML_OP_RMS_NORM) {
        vk_buffer d_A = ctx->do_add_rms_partials ? ctx->prealloc_add_rms_partials : d_X;
        size_t a_buf_offset = ctx->do_add_rms_partials ? ctx->prealloc_size_add_rms_partials_offset : 0;
        vk_subbuffer a_buf = src0_buf;
        if (ctx->do_add_rms_partials) {
            a_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_add_rms_partials, ctx->prealloc_size_add_rms_partials_offset);
        }
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
            { vk_subbuffer{ d_X, x_buf_offset, x_sz },
              vk_subbuffer{ d_Y, y_buf_offset, y_sz },
              vk_subbuffer{ d_D, d_buf_offset, d_sz },
              ggml_vk_subbuffer(ctx, d_A, a_buf_offset),
            }, pc, elements);
            { src0_buf, src1_buf, dst_buf, a_buf }, pc, elements);
    } else if (op == GGML_OP_GLU) {
        // Empty src1 is possible in glu, but the shader needs a buffer
        vk_subbuffer subbuf_y;
        if (use_src1) {
            subbuf_y = { d_Y, y_buf_offset, y_sz };
        } else {
            subbuf_y = { d_X, 0, x_sz };
        }

        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
        vk_subbuffer subbuf1 = use_src1 ? src1_buf : src0_buf;
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, dst_buf }, pc, elements);
    } else if (op == GGML_OP_SOFT_MAX) {
        // Empty src1 and src2 is possible in soft_max, but the shader needs a buffer
        vk_subbuffer subbuf_y;
        if (use_src1) {
            subbuf_y = { d_Y, y_buf_offset, y_sz };
        } else {
            subbuf_y = { d_X, 0, x_sz };
        }

        vk_subbuffer subbuf_z;
        if (use_src2) {
            subbuf_z = { d_Z, z_buf_offset, z_sz };
        } else {
            subbuf_z = { d_X, 0, x_sz };
        }

        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
        vk_subbuffer subbuf1 = use_src1 ? src1_buf : src0_buf;
        vk_subbuffer subbuf2 = use_src2 ? src2_buf : src0_buf;
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, subbuf2, dst_buf }, pc, elements);
    } else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) {
        // Empty src2 is possible in rope, but the shader needs a buffer
        vk_subbuffer subbuf_z, subbuf_w;
        if (use_src2) {
            subbuf_z = { d_Z, z_buf_offset, z_sz };
        } else {
            subbuf_z = { d_X, 0, x_sz };
        }
        if (use_src3) {
            subbuf_w = { d_W, w_buf_offset, w_sz };
        } else {
            subbuf_w = { d_X, 0, x_sz };
        }

        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz }, subbuf_w }, pc, elements);
        // Empty src2 and src3 is possible in rope, but the shader needs a buffer
        vk_subbuffer subbuf2 = use_src2 ? src2_buf : src0_buf;
        vk_subbuffer subbuf3 = use_src3 ? src3_buf : src0_buf;
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, subbuf2, dst_buf, subbuf3 }, pc, elements);
    } else if (op == GGML_OP_IM2COL || op == GGML_OP_IM2COL_3D) {
        if (ctx->device->shader_int64 && ctx->device->buffer_device_address) {
            // buffer device address path doesn't use dst buffer
            d_sz = 1;
            dst_buf.size = 1;
        }
        // im2col uses only src1 and dst buffers
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src1_buf, dst_buf }, pc, elements);
    } else if (op == GGML_OP_COUNT_EQUAL) {
        // count_equal assumes that destination buffer is initialized with zeroes
        ggml_vk_buffer_memset_async(subctx, d_D, d_buf_offset, 0, d_sz);
        ggml_vk_buffer_memset_async(subctx, dst_buf.buffer, dst_buf.offset, 0, dst_buf.size);
        ggml_vk_sync_buffers(ctx, subctx);
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, dst_buf }, pc, elements);
    } else if (op == GGML_OP_OPT_STEP_SGD) {
        // OPT_STEP_SGD works on src0, it does not need dst
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz } }, pc, elements);
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf }, pc, elements);
    } else if (use_src3) {
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_W, w_buf_offset, w_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf, src3_buf, dst_buf }, pc, elements);
    } else if (use_src2) {
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf, dst_buf }, pc, elements);
    } else if (use_src1) {
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, dst_buf }, pc, elements);
    } else {
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, dst_buf }, pc, elements);
    }
}

@@ -9413,39 +9223,10 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx

    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);

    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
    ggml_backend_vk_buffer_context * src_buf_ctxs[7] = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr };
    vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst);
    vk_subbuffer src_buf[7] = {};
    for (int i = 0; i < num_srcs; i++) {
        src_buf_ctxs[i] = (ggml_backend_vk_buffer_context *)dst->src[i]->buffer->context;
    }

    vk_buffer d_D = nullptr, d_srcs[7] = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr };
    size_t dst_offset = 0, src_offsets[7] = { 0, 0, 0, 0, 0, 0, 0 };
    bool dst_uma = false, srcs_uma[7] = { false, false, false, false, false, false, false };

    if (ctx->device->uma) {
        for (int i = 0; i < num_srcs; i++) {
            ggml_vk_host_get(ctx->device, dst->src[i]->data, d_srcs[i], src_offsets[i]);
            srcs_uma[i] = d_srcs[i] != nullptr;
        }

        ggml_vk_host_get(ctx->device, dst->data, d_D, dst_offset);
        dst_uma = d_D != nullptr;
    }

    uint64_t src_sizes[7] = { 0, 0, 0, 0, 0, 0, 0 };
    for (int i = 0; i < num_srcs; i++) {
        src_sizes[i] = ggml_nbytes(dst->src[i]);
        if (!srcs_uma[i]) {
            d_srcs[i] = src_buf_ctxs[i]->dev_buffer;
            src_offsets[i] = vk_tensor_offset(dst->src[i]) + dst->src[i]->view_offs;
        }
    }

    const uint64_t dst_size = ggml_nbytes(dst);
    if (!dst_uma) {
        d_D = dst_buf_ctx->dev_buffer;
        dst_offset = vk_tensor_offset(dst) + dst->view_offs;
        src_buf[i] = ggml_vk_tensor_subbuffer(ctx, dst->src[i]);
    }

    std::array<uint32_t, 3> elements = {
@@ -9455,26 +9236,13 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
    };

    if (version == 6) {
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
            vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] },
            vk_subbuffer{ d_srcs[1], src_offsets[1], src_sizes[1] },
            vk_subbuffer{ d_srcs[2], src_offsets[2], src_sizes[2] },
            vk_subbuffer{ d_srcs[3], src_offsets[3], src_sizes[3] },
            vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] },
            vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
            vk_subbuffer{ d_D, dst_offset, dst_size }
        }, pc, elements);
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
            {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], dst_buf},
            pc, elements);
    } else if (version == 7) {
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
            vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] },
            vk_subbuffer{ d_srcs[1], src_offsets[1], src_sizes[1] },
            vk_subbuffer{ d_srcs[2], src_offsets[2], src_sizes[2] },
            vk_subbuffer{ d_srcs[3], src_offsets[3], src_sizes[3] },
            vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] },
            vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
            vk_subbuffer{ d_srcs[6], src_offsets[6], src_sizes[6] },
            vk_subbuffer{ d_D, dst_offset, dst_size }
        }, pc, elements);
        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
            {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], src_buf[6], dst_buf},
            pc, elements);
    } else {
        // shouldn't happen
        GGML_ASSERT(false);
@@ -9554,40 +9322,10 @@ static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx,
        n_head, head_dim, n_group, n_tok
    };

    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
    ggml_backend_vk_buffer_context * src_buf_ctxs[GGML_MAX_SRC];
    for (int i = 0; i < GGML_MAX_SRC && dst->src[i] != nullptr; i++) {
        src_buf_ctxs[i] = (ggml_backend_vk_buffer_context *)dst->src[i]->buffer->context;
    }

    vk_buffer d_D = nullptr, d_srcs[GGML_MAX_SRC] = { nullptr };
    size_t dst_offset = 0, src_offsets[GGML_MAX_SRC] = { 0 };
    bool dst_uma = false, srcs_uma[GGML_MAX_SRC] = { false };

    if (ctx->device->uma) {
        for (int i = 0; i < GGML_MAX_SRC && dst->src[i] != nullptr; i++) {
            ggml_vk_host_get(ctx->device, dst->src[i]->data, d_srcs[i], src_offsets[i]);
            srcs_uma[i] = d_srcs[i] != nullptr;
        }
        ggml_vk_host_get(ctx->device, dst->data, d_D, dst_offset);
        dst_uma = d_D != nullptr;
    }

    if (!dst_uma) {
        d_D = dst_buf_ctx->dev_buffer;
        dst_offset = vk_tensor_offset(dst) + dst->view_offs;
    }
    for (int i = 0; i < GGML_MAX_SRC && dst->src[i] != nullptr; i++) {
        if (!srcs_uma[i]) {
            d_srcs[i] = src_buf_ctxs[i]->dev_buffer;
            src_offsets[i] = vk_tensor_offset(dst->src[i]) + dst->src[i]->view_offs;
        }
    }

    size_t dst_size = ggml_nbytes(dst);
    size_t src_sizes[GGML_MAX_SRC];
    for (int i = 0; i < GGML_MAX_SRC && dst->src[i] != nullptr; i++) {
        src_sizes[i] = ggml_nbytes(dst->src[i]);
    vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst);
    vk_subbuffer src_buf[7] = {};
    for (int i = 0; i < 7 && dst->src[i] != nullptr; i++) {
        src_buf[i] = ggml_vk_tensor_subbuffer(ctx, dst->src[i]);
    }

    std::array<uint32_t, 3> elements;
@@ -9597,16 +9335,9 @@ static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx,
    const uint32_t num_workgroups_y = n_seq;
    elements = { num_workgroups_x, num_workgroups_y, 1 };

    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
        vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] },
        vk_subbuffer{ d_srcs[1], src_offsets[1], src_sizes[1] },
        vk_subbuffer{ d_srcs[2], src_offsets[2], src_sizes[2] },
        vk_subbuffer{ d_srcs[3], src_offsets[3], src_sizes[3] },
        vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] },
        vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
        vk_subbuffer{ d_srcs[6], src_offsets[6], src_sizes[6] },
        vk_subbuffer{ d_D, dst_offset, dst_size }
    }, pc, elements);
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
        {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], src_buf[6], dst_buf},
        pc, elements);
}

static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
@@ -9653,66 +9384,17 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont

    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);

    ggml_backend_vk_buffer_context * x_buf_ctx = (ggml_backend_vk_buffer_context *)x->buffer->context;
    ggml_backend_vk_buffer_context * g_buf_ctx = (ggml_backend_vk_buffer_context *)g->buffer->context;
    ggml_backend_vk_buffer_context * gm_buf_ctx = (ggml_backend_vk_buffer_context *)gm->buffer->context;
    ggml_backend_vk_buffer_context * gv_buf_ctx = (ggml_backend_vk_buffer_context *)gv->buffer->context;
    ggml_backend_vk_buffer_context * p_buf_ctx = (ggml_backend_vk_buffer_context *)p->buffer->context;

    vk_buffer d_X = nullptr, d_G = nullptr, d_GM = nullptr, d_GV = nullptr, d_P = nullptr;
    size_t x_offset = 0, g_offset = 0, gm_offset = 0, gv_offset = 0, p_offset = 0;
    bool X_uma = false, G_uma = false, GM_uma = false, GV_uma = false, P_uma = false;

    if (ctx->device->uma) {
        ggml_vk_host_get(ctx->device, x->data, d_X, x_offset);
        ggml_vk_host_get(ctx->device, g->data, d_G, g_offset);
        ggml_vk_host_get(ctx->device, gm->data, d_GM, gm_offset);
        ggml_vk_host_get(ctx->device, gv->data, d_GV, gv_offset);
        ggml_vk_host_get(ctx->device, p->data, d_P, p_offset);

        X_uma = d_X != nullptr;
        G_uma = d_G != nullptr;
        GM_uma = d_GM != nullptr;
        GV_uma = d_GV != nullptr;
        P_uma = d_P != nullptr;
    }

    if (!X_uma) {
        d_X = x_buf_ctx->dev_buffer;
        x_offset = vk_tensor_offset(x) + x->view_offs;
    }
    if (!G_uma) {
        d_G = g_buf_ctx->dev_buffer;
        g_offset = vk_tensor_offset(g) + g->view_offs;
    }
    if (!GM_uma) {
        d_GM = gm_buf_ctx->dev_buffer;
        gm_offset = vk_tensor_offset(gm) + gm->view_offs;
    }
    if (!GV_uma) {
        d_GV = gv_buf_ctx->dev_buffer;
        gv_offset = vk_tensor_offset(gv) + gv->view_offs;
    }
    if (!P_uma) {
        d_P = p_buf_ctx->dev_buffer;
        p_offset = vk_tensor_offset(p) + p->view_offs;
    }

    const uint64_t x_size = ggml_nbytes(x);
    const uint64_t g_size = ggml_nbytes(g);
    const uint64_t gm_size = ggml_nbytes(gm);
    const uint64_t gv_size = ggml_nbytes(gv);
    const uint64_t p_size = ggml_nbytes(p);
    vk_subbuffer x_buf = ggml_vk_tensor_subbuffer(ctx, x);
    vk_subbuffer g_buf = ggml_vk_tensor_subbuffer(ctx, g);
    vk_subbuffer gm_buf = ggml_vk_tensor_subbuffer(ctx, gm);
    vk_subbuffer gv_buf = ggml_vk_tensor_subbuffer(ctx, gv);
    vk_subbuffer p_buf = ggml_vk_tensor_subbuffer(ctx, p);

    std::array<uint32_t, 3> elements = { (uint32_t)ggml_nelements(x), 1, 1 };

    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
        vk_subbuffer{ d_X, x_offset, x_size },
        vk_subbuffer{ d_G, g_offset, g_size },
        vk_subbuffer{ d_GM, gm_offset, gm_size },
        vk_subbuffer{ d_GV, gv_offset, gv_size },
        vk_subbuffer{ d_P, p_offset, p_size },
    }, pc, elements);
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
        {x_buf, g_buf, gm_buf, gv_buf, p_buf},
        pc, elements);
}

static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) {
@@ -10044,45 +9726,9 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,

    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);

    ggml_backend_vk_buffer_context * logits_buf_ctx = (ggml_backend_vk_buffer_context *)logits->buffer->context;
    ggml_backend_vk_buffer_context * weights_buf_ctx = (ggml_backend_vk_buffer_context *)weights->buffer->context;
    ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;

    vk_buffer d_logits = nullptr;
    size_t logits_buf_offset = 0;
    vk_buffer d_weights = nullptr;
    size_t weights_buf_offset = 0;
    vk_buffer d_ids = nullptr;
    size_t ids_buf_offset = 0;

    bool logits_uma = false;
    bool weights_uma = false;
    bool ids_uma = false;

    if (ctx->device->uma) {
        ggml_vk_host_get(ctx->device, logits->data, d_logits, logits_buf_offset);
        ggml_vk_host_get(ctx->device, weights->data, d_weights, weights_buf_offset);
        ggml_vk_host_get(ctx->device, ids->data, d_ids, ids_buf_offset);
        logits_uma = d_logits != nullptr;
        weights_uma = d_weights != nullptr;
        ids_uma = d_ids != nullptr;
    }

    if (!logits_uma) {
        d_logits = logits_buf_ctx->dev_buffer;
        logits_buf_offset = vk_tensor_offset(logits) + logits->view_offs;
        GGML_ASSERT(d_logits != nullptr);
    }
    if (!weights_uma) {
        d_weights = weights_buf_ctx->dev_buffer;
        weights_buf_offset = vk_tensor_offset(weights) + weights->view_offs;
        GGML_ASSERT(d_weights != nullptr);
    }
    if (!ids_uma) {
        d_ids = ids_buf_ctx->dev_buffer;
        ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs;
        GGML_ASSERT(d_ids != nullptr);
    }
    vk_subbuffer logits_buf = ggml_vk_tensor_subbuffer(ctx, logits);
    vk_subbuffer weights_buf = ggml_vk_tensor_subbuffer(ctx, weights);
    vk_subbuffer ids_buf = ggml_vk_tensor_subbuffer(ctx, ids);

    vk_op_topk_moe_push_constants pc {};
    pc.n_rows = n_rows;
@@ -10098,12 +9744,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
    const uint32_t rows_per_block = 4;
    std::array<uint32_t, 3> elements = { CEIL_DIV(n_rows, rows_per_block), 1, 1 };

    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
        {
            ggml_vk_subbuffer(ctx, d_logits, logits_buf_offset),
            ggml_vk_subbuffer(ctx, d_weights, weights_buf_offset),
            ggml_vk_subbuffer(ctx, d_ids, ids_buf_offset),
        }, pc, elements);
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {logits_buf, weights_buf, ids_buf}, pc, elements);
}

static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop) {

@@ -15,6 +15,7 @@
#include <condition_variable>
#include <cstring>
#include <iostream>
#include <map>
#include <mutex>
#include <optional>
#include <string>
@@ -73,6 +74,30 @@
// For operations which process a row in parallel, this seems like a reasonable default
#define WEBGPU_ROW_SPLIT_WG_SIZE 64

// Matrix multiplication parameters

// Register tiling parameters
#define WEBGPU_MUL_MAT_TILE_M 8
#define WEBGPU_MUL_MAT_TILE_N 8
#define WEBGPU_MUL_MAT_WG_SIZE_M 8
#define WEBGPU_MUL_MAT_WG_SIZE_N 8
#define WEBGPU_MUL_MAT_TILE_K 32
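// With these defaults a workgroup has 8x8 = 64 threads, and each thread
// accumulates an 8x8 register tile, so one workgroup produces a
// (TILE_M * WG_SIZE_M) x (TILE_N * WG_SIZE_N) = 64x64 output tile,
// streaming K through shared memory in TILE_K = 32 chunks.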

// Subgroup matrix parameters
// The number of subgroups in the M dimension
#define WEBGPU_MUL_MAT_SUBGROUP_M 2
// The number of subgroups in the N dimension
#define WEBGPU_MUL_MAT_SUBGROUP_N 2
// The number of subgroup matrices each subgroup accumulates over
#define WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M 4
#define WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N 2
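// For example, with an 8x8 f16 subgroup matrix config (one of the two shapes
// accepted at device init) this gives a per-workgroup tile of
// 2 * 4 * 8 = 64 rows by 2 * 2 * 8 = 32 columns.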

// Matrix-vector multiplication parameters
#define WEBGPU_MUL_MAT_VEC_WG_SIZE 256
// Must be a multiple of 4 to work with vectorized paths, and must divide the mul_mat_vec workgroup size
#define WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG 64
#define WEBGPU_MUL_MAT_VEC_TILE_K 256
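// With the defaults, 256 threads produce 64 outputs per workgroup, i.e.
// THREADS_PER_OUTPUT = 256 / 64 = 4 threads cooperate on each output row.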

/* End Constants */

// This is a "fake" base pointer, since WebGPU buffers do not have pointers to their locations.
@@ -236,6 +261,10 @@ struct webgpu_context_struct {
    wgpu::Queue queue;
    wgpu::Limits limits;

    bool supports_subgroup_matrix = false;
    uint32_t subgroup_size;
    wgpu::SubgroupMatrixConfig subgroup_matrix_config;

    // Separate this out from limits since on some Metal systems, the limit returned by
    // querying the limits is higher than the actual allowed maximum.
    uint32_t max_wg_size_x;
@@ -247,6 +276,11 @@ struct webgpu_context_struct {
    webgpu_buf_pool set_rows_error_buf_pool;

    webgpu_pipeline memset_pipeline;

    std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> mul_mat_pipelines; // src0_type, src1_type, vectorized
    std::map<int, std::map<int, std::map<int, webgpu_pipeline>>>
        mul_mat_vec_pipelines; // src0_type, src1_type, vectorized

    webgpu_pipeline mul_mat_pipeline[30][2];
    webgpu_pipeline set_rows_pipeline[1][2]; // dst->type, vectorized
    webgpu_pipeline get_rows_pipeline[30];
@@ -321,6 +355,25 @@ struct ggml_backend_webgpu_buffer_context {

/* WebGPU object initializations */

// Process a WGSL shader string, replacing tokens of the form {{KEY}} with
// the corresponding values provided in `repls`.
static std::string ggml_webgpu_process_shader_repls(const char * src,
                                                    const std::map<std::string, std::string> & repls) {
    if (!src) {
        return std::string();
    }
    std::string s = src;
    for (const auto & kv : repls) {
        std::string token = "{{" + kv.first + "}}";
        size_t pos = 0;
        while ((pos = s.find(token, pos)) != std::string::npos) {
            s.replace(pos, token.length(), kv.second);
            pos += kv.second.length();
        }
    }
    return s;
}
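// For instance, repls = {{"WEBGPU_TILE_K", "32"}} rewrites the template line
// "const TILE_K = {{WEBGPU_TILE_K}}u;" to "const TILE_K = 32u;" before the
// shader module is compiled.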

static void ggml_webgpu_create_pipeline(wgpu::Device & device,
                                        webgpu_pipeline & pipeline,
                                        const char * shader_code,
@@ -346,6 +399,30 @@ static void ggml_webgpu_create_pipeline(wgpu::Device &
    pipeline = { device.CreateComputePipeline(&pipeline_desc), label };
}

static webgpu_pipeline ggml_webgpu_create_pipeline2(wgpu::Device & device,
                                                    const char * shader_code,
                                                    const char * label,
                                                    const std::vector<wgpu::ConstantEntry> & constants = {}) {
    wgpu::ShaderSourceWGSL shader_source;
    shader_source.code = shader_code;

    wgpu::ShaderModuleDescriptor shader_desc;
    shader_desc.nextInChain = &shader_source;

    wgpu::ShaderModule shader_module = device.CreateShaderModule(&shader_desc);

    wgpu::ComputePipelineDescriptor pipeline_desc;
    pipeline_desc.label = label;
    pipeline_desc.compute.module = shader_module;
    pipeline_desc.compute.entryPoint = "main"; // Entry point in the WGSL code
    pipeline_desc.layout = nullptr; // nullptr means auto layout
    if (constants.size() > 0) {
        pipeline_desc.compute.constants = constants.data();
        pipeline_desc.compute.constantCount = constants.size();
    }
    return { device.CreateComputePipeline(&pipeline_desc), label };
}

static void ggml_webgpu_create_buffer(wgpu::Device & device,
                                      wgpu::Buffer & buffer,
                                      size_t size,
@@ -512,6 +589,7 @@ static webgpu_command ggml_backend_webgpu_build(webgpu_context &
                                                std::vector<uint32_t> params,
                                                std::vector<wgpu::BindGroupEntry> bind_group_entries,
                                                uint32_t wg_x,
                                                uint32_t wg_y = 1,
                                                std::optional<webgpu_pool_bufs> set_rows_error_bufs = std::nullopt) {
    webgpu_pool_bufs params_bufs = ctx->param_buf_pool.alloc_bufs();

@@ -557,7 +635,7 @@ static webgpu_command ggml_backend_webgpu_build(webgpu_context &
#endif
    pass.SetPipeline(pipeline.pipeline);
    pass.SetBindGroup(0, bind_group);
    pass.DispatchWorkgroups(wg_x, 1, 1);
    pass.DispatchWorkgroups(wg_x, wg_y, 1);
    pass.End();

#ifdef GGML_WEBGPU_GPU_PROFILE
@@ -779,7 +857,7 @@ static std::optional<webgpu_command> ggml_webgpu_set_rows(webgpu_context & ctx,

    uint32_t wg_x = (threads + max_wg_size - 1) / max_wg_size;

    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, error_bufs);
    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, 1, error_bufs);
}

static webgpu_command ggml_webgpu_get_rows(webgpu_context & ctx,
@@ -835,8 +913,8 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
        (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
        (uint32_t) dst->ne[1], // number of rows in result (M)
        (uint32_t) dst->ne[0], // number of columns in result (N)
        (uint32_t) dst->ne[0], // number of rows in result (M, transposed)
        (uint32_t) dst->ne[1], // number of columns in result (N)
        (uint32_t) src0->ne[0], // number of columns in src0/src1 (K)
        (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)), // stride (elements/blocks) of src0 in dimension 1
        (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), // stride (elements/blocks) of src1 in dimension 1
@@ -865,9 +943,67 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
                   .size = ggml_webgpu_tensor_binding_size(ctx, dst) },
    };

    webgpu_pipeline pipeline = ctx->mul_mat_pipeline[src0->type][src1->type];

    uint32_t wg_x =
        (dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE;
    return ggml_backend_webgpu_build(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x);
    uint32_t wg_y = 1;

    bool use_fast = false;
    switch (src1->type) {
        case GGML_TYPE_F16:
            use_fast = (src0->type == GGML_TYPE_F16);
            break;
        case GGML_TYPE_F32:
            switch (src0->type) {
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
                case GGML_TYPE_Q4_0:
                    use_fast = true;
                    break;
                default:
                    break;
            }
            break;
        default:
            break;
    }

    if (use_fast) {
        int vectorized = src0->ne[0] % 4 == 0 && dst->ne[0] % 4 == 0 && dst->ne[1] % 4 == 0;
        if (dst->ne[1] == 1) {
            // We don't support vectorized mul_mat_vec for quantized types
            vectorized = vectorized && (src0->type < 2);
            pipeline = ctx->mul_mat_vec_pipelines[src0->type][src1->type][vectorized];
            uint32_t batches = dst->ne[2] * dst->ne[3];
            uint32_t output_groups =
                (dst->ne[0] + WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG - 1) / WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG;
            uint32_t total_wg = output_groups * batches;
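            // total_wg is a flat workgroup count that can exceed the device's
            // maxComputeWorkgroupsPerDimension, so it is folded into a 2D
            // (wg_x, wg_y) dispatch below; the shader recovers the flat index
            // from the 2D workgroup id.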
            wg_x = total_wg % ctx->limits.maxComputeWorkgroupsPerDimension;
            wg_y = (total_wg + ctx->limits.maxComputeWorkgroupsPerDimension - 1) /
                   ctx->limits.maxComputeWorkgroupsPerDimension;
        } else {
            pipeline = ctx->mul_mat_pipelines[src0->type][src1->type][vectorized];
            uint32_t wg_m;
            uint32_t wg_n;
            if (ctx->supports_subgroup_matrix) {
                // The total number of subgroups/workgroups needed per matrix.
                uint32_t wg_m_sg_tile =
                    WEBGPU_MUL_MAT_SUBGROUP_M * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M * ctx->subgroup_matrix_config.M;
                wg_m = (dst->ne[0] + wg_m_sg_tile - 1) / wg_m_sg_tile;
                uint32_t wg_n_sg_tile =
                    WEBGPU_MUL_MAT_SUBGROUP_N * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N * ctx->subgroup_matrix_config.N;
                wg_n = (dst->ne[1] + wg_n_sg_tile - 1) / wg_n_sg_tile;
            } else {
                uint32_t tile_m_s = WEBGPU_MUL_MAT_TILE_M * WEBGPU_MUL_MAT_WG_SIZE_M;
                uint32_t tile_n_s = WEBGPU_MUL_MAT_TILE_N * WEBGPU_MUL_MAT_WG_SIZE_N;
                wg_m = (dst->ne[0] + tile_m_s - 1) / tile_m_s;
                wg_n = (dst->ne[1] + tile_n_s - 1) / tile_n_s;
            }
            wg_x = wg_m * wg_n * dst->ne[2] * dst->ne[3];
        }
    }
    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
}

static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx,
@@ -1583,12 +1719,6 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) {
}

static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F32][GGML_TYPE_F32],
                                wgsl_mul_mat_f32_f32, "mul_mat_f32_f32");
    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F16],
                                wgsl_mul_mat_f16_f16, "mul_mat_f16_f16");
    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F32],
                                wgsl_mul_mat_f16_f32, "mul_mat_f16_f32");
    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_0][GGML_TYPE_F32],
                                wgsl_mul_mat_q4_0_f32, "mul_mat_q4_0_f32");
    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_1][GGML_TYPE_F32],
@@ -1627,6 +1757,136 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
                                wgsl_mul_mat_iq4_nl_f32, "mul_mat_iq4_nl_f32");
    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32],
                                wgsl_mul_mat_iq4_xs_f32, "mul_mat_iq4_xs_f32");

    if (webgpu_ctx->supports_subgroup_matrix) {
        std::map<std::string, std::string> sg_matrix_repls;
        sg_matrix_repls["WEBGPU_MAX_SUBGROUP_SIZE"] = std::to_string(webgpu_ctx->subgroup_size);
        sg_matrix_repls["WEBGPU_TILE_K"] = std::to_string(WEBGPU_MUL_MAT_TILE_K);
        sg_matrix_repls["WEBGPU_SUBGROUP_M"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_M);
        sg_matrix_repls["WEBGPU_SUBGROUP_N"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_N);
        sg_matrix_repls["WEBGPU_SUBGROUP_MATRIX_M"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M);
        sg_matrix_repls["WEBGPU_SUBGROUP_MATRIX_N"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N);
        sg_matrix_repls["WEBGPU_SG_MAT_M_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.M);
        sg_matrix_repls["WEBGPU_SG_MAT_N_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.N);
        sg_matrix_repls["WEBGPU_SG_MAT_K_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.K);

        std::string proc_mul_mat_subgroup_matrix_f32_f32 =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32, sg_matrix_repls);
        std::string proc_mul_mat_subgroup_matrix_f32_f32_vec =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32_vec, sg_matrix_repls);
        std::string proc_mul_mat_subgroup_matrix_f16_f32 =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32, sg_matrix_repls);
        std::string proc_mul_mat_subgroup_matrix_f16_f32_vec =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32_vec, sg_matrix_repls);
        std::string proc_mul_mat_subgroup_matrix_f16_f16 =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16, sg_matrix_repls);
        std::string proc_mul_mat_subgroup_matrix_f16_f16_vec =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16_vec, sg_matrix_repls);
        std::string proc_mul_mat_subgroup_matrix_q4_0_f32 =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32, sg_matrix_repls);
        std::string proc_mul_mat_subgroup_matrix_q4_0_f32_vec =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32_vec, sg_matrix_repls);

        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32.c_str(), "mul_mat_subgroup_matrix_f32_f32");
        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32_vec.c_str(),
                                         "mul_mat_subgroup_matrix_f32_f32_vec");
        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32.c_str(), "mul_mat_subgroup_matrix_f16_f32");
        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32_vec.c_str(),
                                         "mul_mat_subgroup_matrix_f16_f32_vec");
        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16.c_str(), "mul_mat_subgroup_matrix_f16_f16");
        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16_vec.c_str(),
                                         "mul_mat_subgroup_matrix_f16_f16_vec");
        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32.c_str(), "mul_mat_subgroup_matrix_q4_0_f32");
        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32_vec.c_str(),
                                         "mul_mat_subgroup_matrix_q4_0_f32_vec");
    } else {
        std::vector<wgpu::ConstantEntry> mul_mat_reg_tile_constants(3);
        mul_mat_reg_tile_constants[0].key = "TILE_K";
        mul_mat_reg_tile_constants[0].value = WEBGPU_MUL_MAT_TILE_K;
        mul_mat_reg_tile_constants[1].key = "WORKGROUP_SIZE_M";
        mul_mat_reg_tile_constants[1].value = WEBGPU_MUL_MAT_WG_SIZE_M;
        mul_mat_reg_tile_constants[2].key = "WORKGROUP_SIZE_N";
        mul_mat_reg_tile_constants[2].value = WEBGPU_MUL_MAT_WG_SIZE_N;

        std::map<std::string, std::string> reg_repls;
        reg_repls["WEBGPU_TILE_M"] = std::to_string(WEBGPU_MUL_MAT_TILE_M);
        reg_repls["WEBGPU_TILE_N"] = std::to_string(WEBGPU_MUL_MAT_TILE_N);

        // Process each reg-tile shader with tile replacements.
        // Keep the processed strings in-scope so .c_str() remains valid.
        std::string proc_mul_mat_reg_tile_f32_f32 =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32, reg_repls);
        std::string proc_mul_mat_reg_tile_f32_f32_vec =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32_vec, reg_repls);
        std::string proc_mul_mat_reg_tile_f16_f32 =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32, reg_repls);
        std::string proc_mul_mat_reg_tile_f16_f32_vec =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32_vec, reg_repls);
        std::string proc_mul_mat_reg_tile_f16_f16 =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16, reg_repls);
        std::string proc_mul_mat_reg_tile_f16_f16_vec =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16_vec, reg_repls);
        std::string proc_mul_mat_reg_tile_q4_0_f32 =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32, reg_repls);
        std::string proc_mul_mat_reg_tile_q4_0_f32_vec =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32_vec, reg_repls);

        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] =
            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32.c_str(),
                                         "mul_mat_reg_tile_f32_f32", mul_mat_reg_tile_constants);
        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32_vec.c_str(),
                                         "mul_mat_reg_tile_f32_f32_vec", mul_mat_reg_tile_constants);
        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] =
            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32.c_str(),
                                         "mul_mat_reg_tile_f16_f32", mul_mat_reg_tile_constants);
        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32_vec.c_str(),
                                         "mul_mat_reg_tile_f16_f32_vec", mul_mat_reg_tile_constants);
        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] =
            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16.c_str(),
                                         "mul_mat_reg_tile_f16_f16", mul_mat_reg_tile_constants);
        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16_vec.c_str(),
                                         "mul_mat_reg_tile_f16_f16_vec", mul_mat_reg_tile_constants);
        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] =
            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32.c_str(),
                                         "mul_mat_reg_tile_q4_0_f32", mul_mat_reg_tile_constants);
        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32_vec.c_str(),
                                         "mul_mat_reg_tile_q4_0_f32_vec", mul_mat_reg_tile_constants);
    }

    std::vector<wgpu::ConstantEntry> mul_mat_vec_constants(3);
    mul_mat_vec_constants[0].key = "WORKGROUP_SIZE";
    mul_mat_vec_constants[0].value = WEBGPU_MUL_MAT_VEC_WG_SIZE;
    mul_mat_vec_constants[1].key = "TILE_K";
    mul_mat_vec_constants[1].value = WEBGPU_MUL_MAT_VEC_TILE_K;
    mul_mat_vec_constants[2].key = "OUTPUTS_PER_WG";
    mul_mat_vec_constants[2].value = WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG;

    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
        webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32, "mul_mat_vec_f32_f32", mul_mat_vec_constants);
    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
        webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32_vec, "mul_mat_vec_f32_f32_vec", mul_mat_vec_constants);
    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
        webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32, "mul_mat_vec_f16_f32", mul_mat_vec_constants);
    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
        webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32_vec, "mul_mat_vec_f16_f32_vec", mul_mat_vec_constants);
    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
        webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16, "mul_mat_vec_f16_f16", mul_mat_vec_constants);
    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] = ggml_webgpu_create_pipeline2(
        webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16_vec, "mul_mat_vec_f16_f16_vec", mul_mat_vec_constants);
    webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
        webgpu_ctx->device, wgsl_mul_mat_vec_q4_0_f32, "mul_mat_vec_q4_0_f32", mul_mat_vec_constants);
}

static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) {
@@ -2124,7 +2384,13 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t

    webgpu_context ctx = reg_ctx->webgpu_ctx;

    wgpu::RequestAdapterOptions options = {};
    // TODO: track need for these toggles: https://issues.chromium.org/issues/42251215
    const char * const adapterEnabledToggles[] = { "vulkan_enable_f16_on_nvidia", "use_vulkan_memory_model" };
    wgpu::DawnTogglesDescriptor adapterTogglesDesc;
    adapterTogglesDesc.enabledToggles = adapterEnabledToggles;
    adapterTogglesDesc.enabledToggleCount = 2;
    wgpu::RequestAdapterOptions options = {};
    options.nextInChain = &adapterTogglesDesc;
    ctx->instance.WaitAny(ctx->instance.RequestAdapter(
        &options, wgpu::CallbackMode::AllowSpontaneous,
        [&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
@@ -2140,12 +2406,46 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
    ctx->adapter.GetLimits(&ctx->limits);
    ctx->max_wg_size_x = 288; // default value

    wgpu::AdapterInfo info{};
    wgpu::AdapterInfo info{};
    wgpu::AdapterPropertiesSubgroupMatrixConfigs subgroup_matrix_configs{};
    if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
        info.nextInChain = &subgroup_matrix_configs;
    }
    ctx->adapter.GetInfo(&info);

    wgpu::SupportedFeatures features;
    ctx->adapter.GetFeatures(&features);
    // we require f16 support
    GGML_ASSERT(ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));

    // Only support square f16 matrices of size 8 or 16 for now
    bool valid_subgroup_matrix_config = false;
    if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
        for (size_t i = 0; i < subgroup_matrix_configs.configCount; i++) {
            const wgpu::SubgroupMatrixConfig config = subgroup_matrix_configs.configs[i];
            if (config.M == config.N && config.N == config.K && (config.K == 8 || config.K == 16) &&
                config.componentType == wgpu::SubgroupMatrixComponentType::F16 &&
                config.resultComponentType == wgpu::SubgroupMatrixComponentType::F16) {
                ctx->subgroup_matrix_config = config;
                valid_subgroup_matrix_config = true;
                break;
            }
        }
    }

    // For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate.
    // Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter.
    ctx->subgroup_size = info.subgroupMaxSize;
    ctx->supports_subgroup_matrix = valid_subgroup_matrix_config;

    // Initialize device
    std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16,
                                                         wgpu::FeatureName::ImplicitDeviceSynchronization };
    if (ctx->supports_subgroup_matrix) {
        required_features.push_back(wgpu::FeatureName::Subgroups);
        required_features.push_back(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix);
    }

#ifdef GGML_WEBGPU_GPU_PROFILE
    required_features.push_back(wgpu::FeatureName::TimestampQuery);
#endif

@@ -72,9 +72,12 @@ def generate_variants(fname, input_dir, output_dir, outfile):
    except ValueError:
        decls_map = {}

    with open(os.path.join(input_dir, "common_decls.tmpl"), "r", encoding="utf-8") as f:
        common_decls = f.read()
    decls_map.update(parse_decls(common_decls))
    for fname in sorted(os.listdir(input_dir)):
        if fname.endswith(".tmpl"):
            tmpl_path = os.path.join(input_dir, fname)
            with open(tmpl_path, "r", encoding="utf-8") as f_tmpl:
                decls = f_tmpl.read()
            decls_map.update(parse_decls(decls))

    shader_template = extract_block(text, "SHADER")
    for variant in variants:

@@ -864,8 +864,8 @@ struct MulMatParams {
    broadcast3: u32
};

@group(0) @binding(0) var<storage, read_write> src0: array<{{SRC0_TYPE}}>; // N rows, K columns
@group(0) @binding(1) var<storage, read_write> src1: array<{{SRC1_TYPE}}>; // M rows, K columns (transposed)
@group(0) @binding(0) var<storage, read_write> src0: array<{{SRC0_TYPE}}>; // M rows, K columns
@group(0) @binding(1) var<storage, read_write> src1: array<{{SRC1_TYPE}}>; // K rows, N columns (transposed)
@group(0) @binding(2) var<storage, read_write> dst: array<f32>; // M rows, N columns

@group(0) @binding(3) var<uniform> params: MulMatParams;
@@ -891,8 +891,8 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {

    let dst2_rem = dst3_rem % dst2_stride;

    let row = dst2_rem / params.n; // output row
    let col = dst2_rem % params.n; // output column
    let row = dst2_rem / params.m; // output row
    let col = dst2_rem % params.m; // output column

    let src0_idx_base = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02 + col * params.stride_01;
    let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11;
@@ -901,7 +901,7 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
    for (var i: u32 = 0u; i < params.k/{{BLOCK_SIZE}}; i = i + 1u) {
        sum += multiply_add(src0_idx_base, src1_idx_base, i);
    }
    dst[params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum;
    dst[params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.m + col] = sum;
}

#end(SHADER)

97 ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl Normal file
@@ -0,0 +1,97 @@
#decl(SHMEM_VEC)
fn store_shmem(val: vec4<f16>, idx: u32) {
    shmem[idx] = val.x;
    shmem[idx + 1] = val.y;
    shmem[idx + 2] = val.z;
    shmem[idx + 3] = val.w;
}
#enddecl(SHMEM_VEC)

#decl(SHMEM_SCALAR)
fn store_shmem(val: f16, idx: u32) {
    shmem[idx] = val;
}
#enddecl(SHMEM_SCALAR)

#decl(INIT_SRC0_SHMEM_FLOAT)

fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
    for (var elem_idx = thread_id * {{VEC_SIZE}}; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE * {{VEC_SIZE}}) {
        let tile_m = elem_idx / TILE_K;
        let tile_k = elem_idx % TILE_K;
        let global_m = offset_m + tile_m;
        let global_k = k_outer + tile_k;
        let src0_idx = batch_offset + global_m * params.stride_01 + global_k;
        let src0_val = select( // taking a slight performance hit to avoid oob
            {{SRC0_TYPE}}(0.0),
            src0[src0_idx/{{VEC_SIZE}}],
            global_m < params.m && global_k < params.k);
        store_shmem({{SHMEM_TYPE}}(src0_val), elem_idx);
    }
}

#enddecl(INIT_SRC0_SHMEM_FLOAT)

#decl(INIT_SRC1_SHMEM)

fn init_shmem_src1(thread_id: u32, batch_offset: u32, offset_n: u32, k_outer: u32) {
    for (var elem_idx = thread_id * {{VEC_SIZE}}; elem_idx < TILE_SRC1_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE * {{VEC_SIZE}}) {
        let tile_n = elem_idx / TILE_K;
        let tile_k = elem_idx % TILE_K;
        let global_n = offset_n + tile_n;
        let global_k = k_outer + tile_k;
        let src1_idx = batch_offset + global_n * params.stride_11 + global_k;
        let src1_val = select(
            {{SRC1_TYPE}}(0.0),
            src1[src1_idx/{{VEC_SIZE}}],
            global_n < params.n && global_k < params.k);
        store_shmem({{SHMEM_TYPE}}(src1_val), TILE_SRC0_SHMEM + elem_idx);
    }
}

#enddecl(INIT_SRC1_SHMEM)

#decl(INIT_SRC0_SHMEM_Q4_0)

const BLOCK_SIZE = 32u;
// the number of blocks per k-tile. Note that this currently only works if TILE_K is a multiple of BLOCK_SIZE, which may need to be rethought for larger quantized types.
override BLOCKS_K = TILE_K/BLOCK_SIZE;
const NQ = 16u;
const F16_PER_BLOCK = 9u; // 1 scale + 8x4 packed weights
const WEIGHTS_PER_F16 = 4u; // 4 weights per f16
const F16_PER_THREAD = NQ / WEIGHTS_PER_F16;
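// Layout reminder (q4_0): one block covers 32 weights stored as 9 f16 values,
// the scale d followed by 16 bytes of packed 4-bit weights. Byte j (j < 16)
// holds weight j in its low nibble and weight j + 16 in its high nibble, each
// dequantized as (nibble - 8) * d, matching the unpack loop below.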

fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
    for (var i = thread_id * NQ; i < TILE_SRC0_SHMEM; i += TOTAL_WORKGROUP_SIZE * NQ) {
        let blck_idx = i / BLOCK_SIZE;
        let block_offset = (i % BLOCK_SIZE) / WEIGHTS_PER_F16;
        let shmem_idx = blck_idx * BLOCK_SIZE + block_offset * 2u;

        let tile_m = blck_idx / BLOCKS_K;
        let global_m = offset_m + tile_m;
        let block_k = blck_idx % BLOCKS_K;
        let global_k = k_outer / BLOCK_SIZE + block_k;

        if (global_m < params.m && global_k < params.k / BLOCK_SIZE) {
            let src0_idx = batch_offset + global_m * params.stride_01 + global_k;
            let scale_idx = src0_idx * F16_PER_BLOCK;
            let d = src0[scale_idx];

            for (var j = 0u; j < F16_PER_THREAD; j += 2) {
                let q_0 = src0[scale_idx + 1u + block_offset + j];
                let q_1 = src0[scale_idx + 1u + block_offset + j + 1];

                let q_packed = bitcast<u32>(vec2(q_0, q_1));
                for (var k = 0u; k < 4u; k++) {
                    let q_byte = get_byte(q_packed, k);
                    let q_hi = (f16((q_byte >> 4) & 0xF) - 8.0) * d;
                    let q_lo = (f16(q_byte & 0xF) - 8.0) * d;
                    shmem[shmem_idx + j * 2 + k] = q_lo;
                    shmem[shmem_idx + j * 2 + k + 16u] = q_hi;
                }
            }
        }
    }
}

#enddecl(INIT_SRC0_SHMEM_Q4_0)
247 ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl Normal file
@@ -0,0 +1,247 @@
#define(VARIANTS)
[
  {
    "SHADER_SUFFIX": "f32_f32_vec",
    "REPLS": {
      "SRC0_TYPE" : "vec4<f32>",
      "SRC1_TYPE" : "vec4<f32>",
      "DST_TYPE" : "vec4<f32>",
      "SHMEM_TYPE" : "vec4<f16>",
      "VEC_SIZE" : 4,
    },
    "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
  },
  {
    "SHADER_SUFFIX": "f32_f32",
    "REPLS": {
      "SRC0_TYPE" : "f32",
      "SRC1_TYPE" : "f32",
      "DST_TYPE" : "f32",
      "SHMEM_TYPE" : "f16",
      "VEC_SIZE" : 1,
    },
    "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
  },
  {
    "SHADER_SUFFIX": "f16_f32_vec",
    "REPLS": {
      "SRC0_TYPE" : "vec4<f16>",
      "SRC1_TYPE" : "vec4<f32>",
      "DST_TYPE" : "vec4<f32>",
      "SHMEM_TYPE" : "vec4<f16>",
      "VEC_SIZE" : 4,
    },
    "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
  },
  {
    "SHADER_SUFFIX": "f16_f32",
    "REPLS": {
      "SRC0_TYPE" : "f16",
      "SRC1_TYPE" : "f32",
      "DST_TYPE" : "f32",
      "SHMEM_TYPE" : "f16",
      "VEC_SIZE" : 1,
    },
    "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
  },
  {
    "SHADER_SUFFIX": "f16_f16_vec",
    "REPLS": {
      "SRC0_TYPE" : "vec4<f16>",
      "SRC1_TYPE" : "vec4<f16>",
      "DST_TYPE" : "vec4<f32>",
      "SHMEM_TYPE" : "vec4<f16>",
      "VEC_SIZE" : 4,
    },
    "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
  },
  {
    "SHADER_SUFFIX": "f16_f16",
    "REPLS": {
      "SRC0_TYPE" : "f16",
      "SRC1_TYPE" : "f16",
      "DST_TYPE" : "f32",
      "SHMEM_TYPE" : "f16",
      "VEC_SIZE" : 1,
    },
    "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
  },
  {
    "SHADER_SUFFIX": "q4_0_f32_vec",
    "REPLS": {
      "SRC0_TYPE" : "f16",
      "SRC1_TYPE" : "vec4<f32>",
      "DST_TYPE" : "vec4<f32>",
      "SHMEM_TYPE" : "vec4<f16>",
      "VEC_SIZE" : 4,
    },
    "DECLS": ["BYTE_HELPERS", "VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_Q4_0", "INIT_SRC1_SHMEM"]
  },
  {
    "SHADER_SUFFIX": "q4_0_f32",
    "REPLS": {
      "SRC0_TYPE" : "f16",
      "SRC1_TYPE" : "f32",
      "DST_TYPE" : "f32",
      "SHMEM_TYPE" : "f16",
      "VEC_SIZE" : 1,
    },
    "DECLS": ["BYTE_HELPERS", "SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_Q4_0", "INIT_SRC1_SHMEM"]
  }
]

#end(VARIANTS)

#define(DECLS)

#decl(VEC)
fn store_val(acc: array<array<f16, TILE_N>, TILE_M>, tn: u32, tm: u32) -> vec4<f32> {
    return vec4<f32>(f32(acc[tm][tn]), f32(acc[tm + 1][tn]), f32(acc[tm + 2][tn]), f32(acc[tm + 3][tn]));
}
#enddecl(VEC)

#decl(SCALAR)
fn store_val(acc: array<array<f16, TILE_N>, TILE_M>, tn: u32, tm: u32) -> f32 {
    return f32(acc[tm][tn]);
}
#enddecl(SCALAR)

#end(DECLS)

#define(SHADER)
enable f16;

struct MulMatParams {
    offset_src0: u32,
    offset_src1: u32,
    offset_dst: u32,
    m: u32,
    n: u32,
    k: u32,
    stride_01: u32,
    stride_11: u32,
    stride_02: u32,
    stride_12: u32,
    stride_03: u32,
    stride_13: u32,
    bs02: u32,
    bs03: u32,
    broadcast2: u32,
    broadcast3: u32
};

@group(0) @binding(0) var<storage, read_write> src0: array<{{SRC0_TYPE}}>; // M rows, K columns
@group(0) @binding(1) var<storage, read_write> src1: array<{{SRC1_TYPE}}>; // K rows, N columns (transposed)
@group(0) @binding(2) var<storage, read_write> dst: array<{{DST_TYPE}}>; // M rows, N columns (transposed)

@group(0) @binding(3) var<uniform> params: MulMatParams;

DECLS

fn get_local_n(thread_id: u32) -> u32 {
    return thread_id / WORKGROUP_SIZE_M;
}
fn get_local_m(thread_id: u32) -> u32 {
    return thread_id % WORKGROUP_SIZE_M;
}
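// Threads are laid out column-major over the workgroup tile:
// thread_id = local_n * WORKGROUP_SIZE_M + local_m.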

// TILE_M must be a multiple of 4 for vec4 loads
const TILE_M = {{WEBGPU_TILE_M}}u;
const TILE_N = {{WEBGPU_TILE_N}}u;

override WORKGROUP_SIZE_M: u32;
override WORKGROUP_SIZE_N: u32;
override TILE_K: u32;

override TOTAL_WORKGROUP_SIZE = WORKGROUP_SIZE_M * WORKGROUP_SIZE_N;
override TILE_SRC0_SHMEM = TILE_K * WORKGROUP_SIZE_M * TILE_M;
override TILE_SRC1_SHMEM = TILE_K * WORKGROUP_SIZE_N * TILE_N;
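// With the default pipeline constants (TILE_K = 32, an 8x8 workgroup, and 8x8
// per-thread tiles), each tile is 32 * 8 * 8 = 2048 f16 values (4 KiB), for
// 8 KiB of workgroup memory in total.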

var<workgroup> shmem: array<f16, TILE_SRC0_SHMEM + TILE_SRC1_SHMEM>;

@compute @workgroup_size(TOTAL_WORKGROUP_SIZE)
fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
        @builtin(local_invocation_id) local_id: vec3<u32>) {

    let thread_id = local_id.x;
    let local_m = get_local_m(thread_id);
    let local_n = get_local_n(thread_id);

    let wg_n_count = (params.n + WORKGROUP_SIZE_N * TILE_N - 1u) / (WORKGROUP_SIZE_N * TILE_N);
    let wg_m_count = (params.m + WORKGROUP_SIZE_M * TILE_M - 1u) / (WORKGROUP_SIZE_M * TILE_M);
    let wg_per_matrix = wg_m_count * wg_n_count;

    let batch_idx = wg_id.x / wg_per_matrix;

    let wg_in_batch = wg_id.x % wg_per_matrix;
    let wg_m = wg_in_batch % wg_m_count;
    let wg_n = wg_in_batch / wg_m_count;

    let output_row_base = wg_m * WORKGROUP_SIZE_M * TILE_M + local_m * TILE_M;
    let output_col_base = wg_n * WORKGROUP_SIZE_N * TILE_N + local_n * TILE_N;

    let dst2_stride = params.m * params.n;
    let dst3_stride = dst2_stride * params.bs02 * params.broadcast2;

    let dst3_idx = batch_idx / (params.bs02 * params.broadcast2);
    let src03_idx = dst3_idx / params.broadcast3;
    let src13_idx = dst3_idx;
    let dst2_idx = batch_idx % (params.bs02 * params.broadcast2);
    let src02_idx = dst2_idx / params.broadcast2;
    let src12_idx = dst2_idx;

    let src0_batch_offset = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02;
    let src1_batch_offset = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12;

    let offset_m = wg_m * WORKGROUP_SIZE_M * TILE_M;
    let offset_n = wg_n * WORKGROUP_SIZE_N * TILE_N;

    var acc: array<array<f16, TILE_N>, TILE_M>;

    for (var k_outer = 0u; k_outer < params.k; k_outer += TILE_K) {

        // see mul_mat_decls.tmpl
        init_shmem_src0(thread_id, src0_batch_offset, offset_m, k_outer);
        init_shmem_src1(thread_id, src1_batch_offset, offset_n, k_outer);

        workgroupBarrier();

        let k_end = min(TILE_K, params.k - k_outer);

        for (var k_inner = 0u; k_inner < k_end; k_inner++) {
            var src0_tile: array<f16, TILE_M>;
            for (var tm = 0u; tm < TILE_M; tm++) {
                let src0_m = local_m * TILE_M + tm;
                let src0_idx = k_inner + src0_m * TILE_K;
                src0_tile[tm] = shmem[src0_idx];
            }
            for (var tn = 0u; tn < TILE_N; tn++) {
                let src1_n = local_n * TILE_N + tn;
                let src1_idx = src1_n * TILE_K + k_inner;
                let src1_val = shmem[TILE_SRC0_SHMEM + src1_idx];
                for (var tm = 0u; tm < TILE_M; tm++) {
                    acc[tm][tn] += src0_tile[tm] * src1_val;
                }
            }
        }

        workgroupBarrier();
    }

    let dst_batch_offset = params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride;

    for (var tn = 0u; tn < TILE_N; tn++) {
        let global_col = output_col_base + tn;
        if (global_col < params.n) {
            for (var tm = 0u; tm < TILE_M; tm += {{VEC_SIZE}}) {
                let global_row = output_row_base + tm;
                if (global_row < params.m) {
                    let dst_idx = dst_batch_offset + global_col * params.m + global_row;
                    dst[dst_idx/{{VEC_SIZE}}] = store_val(acc, tn, tm);
                }
            }
        }
    }
}

#end(SHADER)
@@ -0,0 +1,302 @@
#define(VARIANTS)
[
  {
    "SHADER_SUFFIX": "f32_f32_vec",
    "REPLS": {
      "SRC0_TYPE" : "vec4<f32>",
      "SRC1_TYPE" : "vec4<f32>",
      "DST_TYPE" : "vec4<f32>",
      "SHMEM_TYPE" : "vec4<f16>",
      "VEC_SIZE" : 4,
    },
    "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
  },
  {
    "SHADER_SUFFIX": "f32_f32",
    "REPLS": {
      "SRC0_TYPE" : "f32",
      "SRC1_TYPE" : "f32",
      "DST_TYPE" : "f32",
      "SHMEM_TYPE" : "f16",
      "VEC_SIZE" : 1,
    },
    "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
  },
  {
    "SHADER_SUFFIX": "f16_f32_vec",
    "REPLS": {
      "SRC0_TYPE" : "vec4<f16>",
      "SRC1_TYPE" : "vec4<f32>",
      "DST_TYPE" : "vec4<f32>",
      "SHMEM_TYPE" : "vec4<f16>",
      "VEC_SIZE" : 4,
    },
    "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
  },
  {
    "SHADER_SUFFIX": "f16_f32",
    "REPLS": {
      "SRC0_TYPE" : "f16",
      "SRC1_TYPE" : "f32",
      "DST_TYPE" : "f32",
      "SHMEM_TYPE" : "f16",
      "VEC_SIZE" : 1,
    },
    "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
  },
  {
    "SHADER_SUFFIX": "f16_f16_vec",
    "REPLS": {
      "SRC0_TYPE" : "vec4<f16>",
      "SRC1_TYPE" : "vec4<f16>",
      "DST_TYPE" : "vec4<f32>",
      "SHMEM_TYPE" : "vec4<f16>",
      "VEC_SIZE" : 4,
    },
    "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
  },
  {
    "SHADER_SUFFIX": "f16_f16",
    "REPLS": {
      "SRC0_TYPE" : "f16",
      "SRC1_TYPE" : "f16",
      "DST_TYPE" : "f32",
      "SHMEM_TYPE" : "f16",
      "VEC_SIZE" : 1,
    },
    "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"]
  },
  {
    "SHADER_SUFFIX": "q4_0_f32_vec",
    "REPLS": {
      "SRC0_TYPE" : "f16",
      "SRC1_TYPE" : "vec4<f32>",
      "DST_TYPE" : "vec4<f32>",
      "SHMEM_TYPE" : "vec4<f16>",
      "VEC_SIZE" : 4,
    },
    "DECLS": ["BYTE_HELPERS", "VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_Q4_0", "INIT_SRC1_SHMEM"]
  },
  {
    "SHADER_SUFFIX": "q4_0_f32",
    "REPLS": {
      "SRC0_TYPE" : "f16",
      "SRC1_TYPE" : "f32",
      "DST_TYPE" : "f32",
      "SHMEM_TYPE" : "f16",
      "VEC_SIZE" : 1,
    },
    "DECLS": ["BYTE_HELPERS", "SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_Q4_0", "INIT_SRC1_SHMEM"]
  }
]

#end(VARIANTS)

#define(DECLS)

#decl(VEC)
fn store_dst(shmem_idx: u32, dst_idx: u32) {
    dst[dst_idx] = vec4<f32>(
        f32(shmem[shmem_idx]),
        f32(shmem[shmem_idx + 1]),
        f32(shmem[shmem_idx + 2]),
        f32(shmem[shmem_idx + 3])
    );
}
#enddecl(VEC)

#decl(SCALAR)
fn store_dst(shmem_idx: u32, dst_idx: u32) {
    dst[dst_idx] = f32(shmem[shmem_idx]);
}
#enddecl(SCALAR)

#end(DECLS)

#define(SHADER)
diagnostic(off, chromium.subgroup_matrix_uniformity);
enable f16;
enable subgroups;
enable chromium_experimental_subgroup_matrix;

struct MulMatParams {
    offset_src0: u32,
    offset_src1: u32,
    offset_dst: u32,
    m: u32,
    n: u32,
    k: u32,
    stride_01: u32,
    stride_11: u32,
    stride_02: u32,
    stride_12: u32,
    stride_03: u32,
    stride_13: u32,
    bs02: u32,
    bs03: u32,
    broadcast2: u32,
    broadcast3: u32
};

@group(0) @binding(0) var<storage, read_write> src0: array<{{SRC0_TYPE}}>; // M rows, K columns
@group(0) @binding(1) var<storage, read_write> src1: array<{{SRC1_TYPE}}>; // K rows, N columns (transposed)
@group(0) @binding(2) var<storage, read_write> dst: array<{{DST_TYPE}}>; // M rows, N columns (transposed)

@group(0) @binding(3) var<uniform> params: MulMatParams;

DECLS

// Note: these values are string-interpolated at build time; override constants
// cannot be used here because of limitations in the current Dawn version
// (type definitions and subgroup matrix loads require constant memory sizes).
const SUBGROUP_M = {{WEBGPU_SUBGROUP_M}}u;
const SUBGROUP_N = {{WEBGPU_SUBGROUP_N}}u;
// For portability we assume the max subgroup size, meaning some subgroups will be masked out if the
// runtime subgroup size is smaller.
const MAX_SUBGROUP_SIZE = {{WEBGPU_MAX_SUBGROUP_SIZE}}u;

const EXPECTED_SUBGROUPS = SUBGROUP_M * SUBGROUP_N;

const SUBGROUP_MATRIX_M_SIZE = {{WEBGPU_SG_MAT_M_SIZE}}u;
const SUBGROUP_MATRIX_N_SIZE = {{WEBGPU_SG_MAT_N_SIZE}}u;
const SUBGROUP_MATRIX_K_SIZE = {{WEBGPU_SG_MAT_K_SIZE}}u;

const SUBGROUP_MATRIX_M = {{WEBGPU_SUBGROUP_MATRIX_M}}u;
const SUBGROUP_MATRIX_N = {{WEBGPU_SUBGROUP_MATRIX_N}}u;

const TILE_K = {{WEBGPU_TILE_K}}u;

const WG_M_SG_TILE_SIZE = SUBGROUP_M * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE;
const WG_N_SG_TILE_SIZE = SUBGROUP_N * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE;

const TOTAL_WORKGROUP_SIZE = SUBGROUP_M * SUBGROUP_N * MAX_SUBGROUP_SIZE;
const TILE_SRC0_SHMEM = TILE_K * SUBGROUP_M * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE;
const TILE_SRC1_SHMEM = TILE_K * SUBGROUP_N * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE;

const SG_MAT_ACCUM_SHMEM = SUBGROUP_M * SUBGROUP_MATRIX_M * SUBGROUP_N * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_M_SIZE * SUBGROUP_MATRIX_N_SIZE;

// We reuse shmem for accumulation matrices
const SHMEM_SIZE = max(TILE_SRC0_SHMEM + TILE_SRC1_SHMEM, SG_MAT_ACCUM_SHMEM);

var<workgroup> shmem: array<f16, SHMEM_SIZE>;

@compute @workgroup_size(TOTAL_WORKGROUP_SIZE)
fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
        @builtin(local_invocation_id) local_id: vec3<u32>,
        @builtin(subgroup_id) subgroup_id: u32) {

    let thread_id = local_id.x;
    let subgroup_m = subgroup_id % SUBGROUP_M;
    let subgroup_n = subgroup_id / SUBGROUP_M;

    let wg_m_count = (params.m + WG_M_SG_TILE_SIZE - 1) / WG_M_SG_TILE_SIZE;
    let wg_n_count = (params.n + WG_N_SG_TILE_SIZE - 1) / WG_N_SG_TILE_SIZE;
    let wg_per_matrix = wg_m_count * wg_n_count;

    let batch_idx = wg_id.x / wg_per_matrix;

    let wg_in_batch = wg_id.x % wg_per_matrix;
    let wg_m = wg_in_batch % wg_m_count;
    let wg_n = wg_in_batch / wg_m_count;

    let dst2_stride = params.m * params.n;
    let dst3_stride = dst2_stride * params.bs02 * params.broadcast2;

    let dst3_idx = batch_idx / (params.bs02 * params.broadcast2);
    let src03_idx = dst3_idx / params.broadcast3;
    let src13_idx = dst3_idx;
    let dst2_idx = batch_idx % (params.bs02 * params.broadcast2);
    let src02_idx = dst2_idx / params.broadcast2;
    let src12_idx = dst2_idx;

    let src0_batch_offset = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02;
    let src1_batch_offset = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12;

    let offset_m = wg_m * SUBGROUP_M * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE;
    let offset_n = wg_n * SUBGROUP_N * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE;

    var acc_sg_mat : array<array<subgroup_matrix_result<f16, SUBGROUP_MATRIX_N_SIZE, SUBGROUP_MATRIX_M_SIZE>, SUBGROUP_MATRIX_N>, SUBGROUP_MATRIX_M>;

    for (var k_outer = 0u; k_outer < params.k; k_outer += TILE_K) {

        // see mul_mat_decls.tmpl
        init_shmem_src0(thread_id, src0_batch_offset, offset_m, k_outer);
        init_shmem_src1(thread_id, src1_batch_offset, offset_n, k_outer);

        workgroupBarrier();

        if (subgroup_id < EXPECTED_SUBGROUPS) {

            for (var k_inner = 0u; k_inner < TILE_K; k_inner += SUBGROUP_MATRIX_K_SIZE) {

                let src0_shmem_idx_base = subgroup_m * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE * TILE_K + k_inner;
                var src0_sg_mats: array<subgroup_matrix_left<f16, SUBGROUP_MATRIX_K_SIZE, SUBGROUP_MATRIX_M_SIZE>, SUBGROUP_MATRIX_M>;
                for (var m = 0u; m < SUBGROUP_MATRIX_M; m++) {
                    src0_sg_mats[m] = subgroupMatrixLoad<subgroup_matrix_left<f16, SUBGROUP_MATRIX_K_SIZE, SUBGROUP_MATRIX_M_SIZE>>(
                        &shmem,
                        src0_shmem_idx_base + m * SUBGROUP_MATRIX_M_SIZE * TILE_K,
                        false,
                        TILE_K
                    );
                }

                let src1_shmem_idx_base = TILE_SRC0_SHMEM + subgroup_n * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE * TILE_K + k_inner;
                for (var n = 0u; n < SUBGROUP_MATRIX_N; n++) {
                    let src1_sg_mat = subgroupMatrixLoad<subgroup_matrix_right<f16, SUBGROUP_MATRIX_N_SIZE, SUBGROUP_MATRIX_K_SIZE>>(
                        &shmem,
                        src1_shmem_idx_base + n * SUBGROUP_MATRIX_N_SIZE * TILE_K,
                        true,
                        TILE_K
                    );
                    for (var m = 0u; m < SUBGROUP_MATRIX_M; m++) {
                        acc_sg_mat[m][n] = subgroupMatrixMultiplyAccumulate(src0_sg_mats[m], src1_sg_mat, acc_sg_mat[m][n]);
                    }
                }
            }
        }

        workgroupBarrier();
    }

    let dst_batch_offset = params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride;

    // Stage the subgroup matrix tiles into shared memory
    // This uses WG_M_SG_TILE_SIZE as the stride (number of columns in the workgroup tile).
    let WG_TILE_STRIDE = WG_M_SG_TILE_SIZE;
    let tile_row_base_local = subgroup_n * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE;
    let tile_col_base_local = subgroup_m * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE;

    if (subgroup_id < EXPECTED_SUBGROUPS) { // 2-5% performance hit :(
        for (var n = 0u; n < SUBGROUP_MATRIX_N; n++) {
            for (var m = 0u; m < SUBGROUP_MATRIX_M; m++) {
                let local_row = tile_row_base_local + n * SUBGROUP_MATRIX_N_SIZE;
                let local_col = tile_col_base_local + m * SUBGROUP_MATRIX_M_SIZE;
                let out_base = local_row * WG_TILE_STRIDE + local_col;
                subgroupMatrixStore(&shmem, out_base, acc_sg_mat[m][n], true, WG_TILE_STRIDE);
            }
        }
    }

    workgroupBarrier();

    // Cooperative write: iterate over the entire workgroup tile
    let tile_rows = WG_N_SG_TILE_SIZE;
    let tile_cols = WG_M_SG_TILE_SIZE;
    let total_tile_elems = tile_rows * tile_cols;
    let tile_dst_row_base = wg_m * SUBGROUP_M * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE;
    let tile_dst_col_base = wg_n * SUBGROUP_N * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE;

    for (var idx = thread_id * {{VEC_SIZE}}; idx < total_tile_elems; idx += TOTAL_WORKGROUP_SIZE * {{VEC_SIZE}}) {
        let local_row = idx % WG_TILE_STRIDE;
        let local_col = idx / WG_TILE_STRIDE;

        let global_row = tile_dst_row_base + local_row;
        let global_col = tile_dst_col_base + local_col;

        if (global_col < params.n && global_row < params.m) {
            let dst_idx = dst_batch_offset + global_col * params.m + global_row;
            store_dst(idx, dst_idx/{{VEC_SIZE}});
        }
    }
}

#end(SHADER)
267 ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl Normal file
@@ -0,0 +1,267 @@
#define(VARIANTS)

[
  {
    "SHADER_SUFFIX": "f32_f32_vec",
    "REPLS": {
      "SRC0_TYPE" : "vec4<f32>",
      "SRC1_TYPE" : "vec4<f32>",
      "DST_TYPE": "vec4<f32>",
      "VEC_SIZE" : 4,
    },
    "DECLS": ["VEC", "MUL_ACC_FLOAT"]
  },
  {
    "SHADER_SUFFIX": "f32_f32",
    "REPLS": {
      "SRC0_TYPE" : "f32",
      "SRC1_TYPE" : "f32",
      "DST_TYPE": "f32",
      "VEC_SIZE" : 1,
    },
    "DECLS": ["SCALAR", "MUL_ACC_FLOAT"]
  },
  {
    "SHADER_SUFFIX": "f16_f32_vec",
    "REPLS": {
      "SRC0_TYPE" : "vec4<f16>",
      "SRC1_TYPE" : "vec4<f32>",
      "DST_TYPE": "vec4<f32>",
      "VEC_SIZE" : 4,
    },
    "DECLS": ["VEC", "MUL_ACC_FLOAT"]
  },
  {
    "SHADER_SUFFIX": "f16_f32",
    "REPLS": {
      "SRC0_TYPE" : "f16",
      "SRC1_TYPE" : "f32",
      "DST_TYPE": "f32",
      "VEC_SIZE" : 1,
    },
    "DECLS": ["SCALAR", "MUL_ACC_FLOAT"]
  },
  {
    "SHADER_SUFFIX": "f16_f16_vec",
    "REPLS": {
      "SRC0_TYPE" : "vec4<f16>",
      "SRC1_TYPE" : "vec4<f16>",
      "DST_TYPE": "vec4<f32>",
      "VEC_SIZE" : 4,
    },
    "DECLS": ["VEC", "MUL_ACC_FLOAT"]
  },
  {
    "SHADER_SUFFIX": "f16_f16",
    "REPLS": {
      "SRC0_TYPE" : "f16",
      "SRC1_TYPE" : "f16",
      "DST_TYPE": "f32",
      "VEC_SIZE" : 1,
    },
    "DECLS": ["SCALAR", "MUL_ACC_FLOAT"]
  },
  {
    "SHADER_SUFFIX": "q4_0_f32",
    "REPLS": {
      "SRC0_TYPE" : "f16",
      "SRC1_TYPE" : "f32",
      "DST_TYPE": "f32",
      "VEC_SIZE" : 1,
    },
    "DECLS": ["BYTE_HELPERS", "SCALAR", "MUL_ACC_Q4_0"]
  }
]

#end(VARIANTS)
#define(DECLS)

#decl(VEC)
fn inner_dot(src0_val: {{SRC0_TYPE}}, src1_val: {{SRC1_TYPE}}) -> f32 {
    return f32(dot({{SRC1_TYPE}}(src0_val), src1_val));
}

fn store_val(group_base: u32) -> vec4<f32> {
    return vec4<f32>(partial_sums[group_base],
                     partial_sums[group_base + THREADS_PER_OUTPUT],
                     partial_sums[group_base + THREADS_PER_OUTPUT * 2],
                     partial_sums[group_base + THREADS_PER_OUTPUT * 3]);
}
#enddecl(VEC)

#decl(SCALAR)
fn inner_dot(src0_val: {{SRC0_TYPE}}, src1_val: {{SRC1_TYPE}}) -> f32 {
    return f32(src0_val) * f32(src1_val);
}

fn store_val(group_base: u32) -> f32 {
    return partial_sums[group_base];
}
#enddecl(SCALAR)

#decl(MUL_ACC_FLOAT)

fn mul_acc(tig:u32, tile_size: u32, idx_base: u32, k_outer: u32) -> f32 {
    var local_sum = 0.0;
    for (var i = tig * {{VEC_SIZE}}; i < tile_size; i += THREADS_PER_OUTPUT * {{VEC_SIZE}}) {
        let a = src0[(idx_base + k_outer + i) / {{VEC_SIZE}}];
        let b = shared_vector[i / {{VEC_SIZE}}];
        local_sum += inner_dot(a, b);
    }
    return local_sum;
}

#enddecl(MUL_ACC_FLOAT)

#decl(MUL_ACC_Q4_0)

const BLOCK_SIZE = 32;
const NQ = 16u;            // number of weights per thread
const F16_PER_BLOCK = 9u;  // 1 scale + 8x4 packed weights
const WEIGHTS_PER_F16 = 4u; // 4 weights per f16
const F16_PER_THREAD = NQ / WEIGHTS_PER_F16;

fn mul_acc(tig:u32, tile_size: u32, idx_base: u32, k_outer: u32) -> f32 {
    var local_sum = 0.0;
    for (var i = tig * NQ; i < tile_size; i += THREADS_PER_OUTPUT * NQ) {
        let blck_idx = i / BLOCK_SIZE;
        let block_offset = (i % BLOCK_SIZE) / WEIGHTS_PER_F16;
        let scale_idx = (idx_base + k_outer / BLOCK_SIZE + blck_idx) * F16_PER_BLOCK;
        // each f16 contains offsets [block_offset, block_offset + 1] and [block_offset + 16, block_offset + 17]
        let shmem_idx = blck_idx * BLOCK_SIZE + block_offset * 2u;
        let d = f32(src0[scale_idx]);
        for (var j = 0u; j < F16_PER_THREAD; j += 2) {
            let q_0 = src0[scale_idx + 1 + block_offset + j];
            let q_1 = src0[scale_idx + 1 + block_offset + j + 1];
            let q_packed = bitcast<u32>(vec2(q_0, q_1));
            for (var k: u32 = 0; k < 4; k++) {
                let q_byte = get_byte(q_packed, k);
                let q_hi = (f32((q_byte >> 4) & 0xF) - 8.0) * d;
                let q_lo = (f32(q_byte & 0xF) - 8.0) * d;
                local_sum += q_lo * shared_vector[shmem_idx + j * 2 + k];
                local_sum += q_hi * shared_vector[shmem_idx + j * 2 + k + 16];
            }
        }
    }
    return local_sum;
}

#enddecl(MUL_ACC_Q4_0)

#end(DECLS)
#define(SHADER)
enable f16;

DECLS

struct MulMatParams {
    offset_src0: u32,
    offset_src1: u32,
    offset_dst: u32,
    m: u32,
    n: u32,
    k: u32,
    stride_01: u32,
    stride_11: u32,
    stride_02: u32,
    stride_12: u32,
    stride_03: u32,
    stride_13: u32,
    bs02: u32,
    bs03: u32,
    broadcast2: u32,
    broadcast3: u32
};

@group(0) @binding(0) var<storage, read_write> src0: array<{{SRC0_TYPE}}>; // Matrix (M x K)
@group(0) @binding(1) var<storage, read_write> src1: array<{{SRC1_TYPE}}>; // Vector (K x 1, transposed)
@group(0) @binding(2) var<storage, read_write> dst: array<{{DST_TYPE}}>;   // Result vector (transposed)

@group(0) @binding(3) var<uniform> params: MulMatParams;

override WORKGROUP_SIZE: u32;
override TILE_K: u32;
override OUTPUTS_PER_WG: u32;
override THREADS_PER_OUTPUT = WORKGROUP_SIZE / OUTPUTS_PER_WG;

// Shared memory for collaborative loading and reduction
var<workgroup> shared_vector: array<{{SRC1_TYPE}}, TILE_K/{{VEC_SIZE}}>; // Cache vector tile
var<workgroup> partial_sums: array<f32, WORKGROUP_SIZE>;                 // For reduction

@compute @workgroup_size(WORKGROUP_SIZE)
fn main(
    @builtin(local_invocation_id) local_id: vec3<u32>,
    @builtin(workgroup_id) wg_id: vec3<u32>,
    @builtin(num_workgroups) num_wg: vec3<u32>) {
    let thread_id = local_id.x;

    // Handle batch dimensions
    let total_batches = params.bs02 * params.broadcast2 * params.bs03 * params.broadcast3;
    let wg_linear = wg_id.y * num_wg.x + wg_id.x;
    let output_groups = (params.m + OUTPUTS_PER_WG - 1u) / OUTPUTS_PER_WG;
    let batch_idx = wg_linear / output_groups;
    if (batch_idx >= total_batches) {
        return;
    }

    // Which of the outputs does this thread belong to?
    let thread_group = thread_id / THREADS_PER_OUTPUT;
    let thread_in_group = thread_id % THREADS_PER_OUTPUT;

    // Each workgroup computes OUTPUTS_PER_WG consecutive outputs
    let output_row = (wg_linear % output_groups) * OUTPUTS_PER_WG + thread_group;

    let dst2_stride = params.m * params.n;
    let dst2_idx = batch_idx % (params.bs02 * params.broadcast2);
    let dst3_stride = dst2_stride * params.bs02 * params.broadcast2;
    let dst3_idx = batch_idx / (params.bs02 * params.broadcast2);
    let src03_idx = dst3_idx / params.broadcast3;
    let src13_idx = dst3_idx;
    let src02_idx = dst2_idx / params.broadcast2;
    let src12_idx = dst2_idx;

    let src0_idx_base = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02 + output_row * params.stride_01;
    let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12;
    let dst_idx = params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + output_row;

    var local_sum = 0.0;

    // Each thread processes multiple K elements and accumulates
    for (var k_tile = 0u; k_tile < params.k; k_tile += TILE_K) {
        let tile_size = min(TILE_K, params.k - k_tile);

        // Cooperatively load vector tile into shared memory (all threads)
        for (var i = thread_id * {{VEC_SIZE}}; i < tile_size; i += WORKGROUP_SIZE * {{VEC_SIZE}}) {
            shared_vector[i / {{VEC_SIZE}}] = src1[(src1_idx_base + k_tile + i) / {{VEC_SIZE}}];
        }

        workgroupBarrier();

        if (output_row < params.m) {
            local_sum += mul_acc(thread_in_group, tile_size, src0_idx_base, k_tile);
        }

        workgroupBarrier();
    }

    // Store partial sums and reduce within each partition
    partial_sums[thread_id] = local_sum;
    workgroupBarrier();
    let group_base = thread_group * THREADS_PER_OUTPUT;
    let thread_base = group_base + thread_in_group;
    var offset = THREADS_PER_OUTPUT / 2;
    while (offset > 0) {
        if (thread_in_group < offset) {
            partial_sums[thread_base] += partial_sums[thread_base + offset];
        }
        offset = offset / 2;
        workgroupBarrier();
    }

    // Store back to global memory
    if (output_row < params.m && thread_group % {{VEC_SIZE}} == 0 && thread_in_group == 0) {
        dst[dst_idx / {{VEC_SIZE}}] = store_val(group_base);
    }
}
#end(SHADER)
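For reference, the q4_0 layout that the MUL_ACC_Q4_0 declaration above walks through: each 32-weight block occupies 9 f16-sized values, one f16 scale d followed by 16 bytes of packed 4-bit quants, where byte i carries weight i in its low nibble and weight i + 16 in its high nibble, each decoding as (q - 8) * d. A minimal CPU decoder (scale already converted to f32; a sketch, not code from this change):

#include <cstdint>

// Decode one q4_0 block: d is the block scale (converted from f16), qs is the
// 16 packed bytes, out receives the 32 dequantized weights.
void dequant_q4_0_block(float d, const uint8_t qs[16], float out[32]) {
    for (int i = 0; i < 16; ++i) {
        out[i]      = (static_cast<float>(qs[i] & 0x0F) - 8.0f) * d; // low nibble:  weights 0..15
        out[i + 16] = (static_cast<float>(qs[i] >>   4) - 8.0f) * d; // high nibble: weights 16..31
    }
}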
@@ -463,6 +463,7 @@ extern "C" {

    // NOTE: After creating a llama_context, it is recommended to query the actual values using these functions
    // In some cases the requested values via llama_context_params may differ from the actual values used by the context
    // ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
    LLAMA_API uint32_t llama_n_ctx     (const struct llama_context * ctx);
+   LLAMA_API uint32_t llama_n_ctx_seq (const struct llama_context * ctx);
    LLAMA_API uint32_t llama_n_batch   (const struct llama_context * ctx);

@@ -485,6 +486,7 @@ extern "C" {

    LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
+   LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_head_kv  (const struct llama_model * model);
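A usage sketch for the NOTE above (function names from the public llama.h API; the surrounding setup is assumed): create the context, then read back the effective sizes rather than trusting the requested ones.

#include "llama.h"
#include <cstdio>

void print_effective_sizes(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx = 10000; // requested; may be rounded up internally

    llama_context * ctx = llama_init_from_model(model, cparams);
    if (ctx == nullptr) {
        return;
    }

    printf("n_ctx     = %u\n", llama_n_ctx(ctx));     // actual (padded) context size
    printf("n_ctx_seq = %u\n", llama_n_ctx_seq(ctx)); // per-sequence context size
    printf("n_batch   = %u\n", llama_n_batch(ctx));

    llama_free(ctx);
}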
@@ -21,6 +21,8 @@ llama_context::llama_context(
              llama_context_params params) :
    model(model),
    balloc(std::make_unique<llama_batch_allocr>(model.hparams.n_pos_per_embd())) {
+   // TODO warning when creating llama_context with awkward ctx size that is not a power of 2,
+   //      may need to be backend-dependent
    LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);

    t_start_us = model.t_start_us;

@@ -112,10 +114,14 @@ llama_context::llama_context(
            }
        }

+       // ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
+       cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
+
        if (cparams.kv_unified) {
            cparams.n_ctx_seq = cparams.n_ctx;
        } else {
            cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max;
+           cparams.n_ctx_seq = GGML_PAD(cparams.n_ctx_seq, 256);

            if (cparams.n_ctx_seq == 0) {
                throw std::runtime_error("n_ctx_seq == 0");
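Worked example of the rounding above, using GGML_PAD(x, n) == ((x + n - 1) / n) * n:

#include <cassert>
#include <cstdint>

constexpr uint32_t pad(uint32_t x, uint32_t n) { return ((x + n - 1) / n) * n; } // same as GGML_PAD

int main() {
    // requested n_ctx = 10000 with n_seq_max = 3 and kv_unified = false:
    const uint32_t n_ctx     = pad(10000, 256);     // -> 10240
    const uint32_t n_ctx_seq = pad(n_ctx / 3, 256); // 10240 / 3 = 3413 -> 3584
    assert(n_ctx == 10240 && n_ctx_seq == 3584);
}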
@@ -821,7 +827,7 @@ int llama_context::encode(const llama_batch & batch_inp) {

    const auto & hparams = model.hparams;

-   const int64_t n_embd  = hparams.n_embd;
+   const int64_t n_embd  = hparams.n_embd_inp();
    const int64_t n_vocab = model.vocab.n_tokens();

    // note: during encode, we always pass the full sequence starting from pos = 0

@@ -990,7 +996,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
    const auto & hparams = model.hparams;

    const int64_t n_vocab = vocab.n_tokens();
-   const int64_t n_embd  = hparams.n_embd;
+   const int64_t n_embd  = hparams.n_embd_inp();

    // when computing embeddings, all tokens are output
    const bool output_all = cparams.embeddings;

@@ -2148,7 +2154,7 @@ void llama_context::opt_epoch_iter(
            batch.logits [pos_batch] = true;
        }

-       if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
+       if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd_inp(), cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
            LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
            return;
        }
@@ -1142,7 +1142,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(

// input embeddings with optional lora
ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
-   const int64_t n_embd = hparams.n_embd;
+   const int64_t n_embd = hparams.n_embd_inp();

    auto inp = std::make_unique<llm_graph_input_embd>();

@@ -1279,7 +1279,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
    //     return cur;
    //}

-   const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd;
+   const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
    const auto n_enc  = !cross->v_embd.empty() ? cross->n_enc  : hparams.n_ctx_train;

    cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
@@ -60,6 +60,16 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
    return n_head/n_head_kv;
}

+uint32_t llama_hparams::n_embd_inp() const {
+    uint32_t n_embd_inp = n_embd;
+
+    if (n_deepstack_layers > 0) {
+        n_embd_inp += n_embd * n_deepstack_layers;
+    }
+
+    return n_embd_inp;
+}
+
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
    const uint32_t n_head_kv = this->n_head_kv(il);
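For illustration (values assumed, not taken from this diff): a deepstack model with n_embd = 2048 and n_deepstack_layers = 3 now keeps hparams.n_embd = 2048, while n_embd_inp() returns 2048 + 2048 * 3 = 8192, the width of the input embeddings once the auxiliary deepstack features are stacked along the feature dimension. This replaces the previous approach of inflating n_embd itself, which is reverted in the model-loading hunks below.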
@@ -227,6 +227,9 @@ struct llama_hparams {

    uint32_t n_gqa(uint32_t il = 0) const;

+   // dimension of main + auxiliary input embeddings
+   uint32_t n_embd_inp() const;
+
    // dimension of key embeddings across all k-v heads
    uint32_t n_embd_k_gqa(uint32_t il = 0) const;
@@ -45,7 +45,9 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(

    const uint32_t size_base = kv_size;

-   uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad));
+   // note: the SWA cache is always padded to 256 for performance
+   // https://github.com/ggml-org/llama.cpp/issues/17037
+   uint32_t size_swa = GGML_PAD(std::min(size_base, hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch), 256);

    // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
    if (swa_full) {
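Worked example of the new SWA sizing (values assumed): with n_swa = 1000, n_ubatch = 64, a non-unified cache, and size_base >= 1064, the result is size_swa = GGML_PAD(min(size_base, 1000 * 1 + 64), 256) = GGML_PAD(1064, 256) = 1280. Note the pad is now applied after the min rather than inside it, so the final size is always a multiple of 256 regardless of n_pad.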
@@ -276,8 +276,8 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
            } break;
        case GGML_OP_IM2COL:
            {
-               const int n_embd = hparams.n_embd;
-               ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
+               const int n_embd_inp = hparams.n_embd_inp();
+               ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
            } break;
        case GGML_OP_SCALE:
@@ -1039,9 +1039,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    case 64: type = LLM_TYPE_32B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
-               // since vision model stacks deepstack features along feature dim
-               // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
-               hparams.n_embd *= hparams.n_deepstack_layers + 1;
            } break;
        case LLM_ARCH_QWEN3MOE:
            {

@@ -1065,9 +1062,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    case 94: type = LLM_TYPE_235B_A22B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
-               // since vision model stacks deepstack features along feature dim
-               // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
-               hparams.n_embd *= hparams.n_deepstack_layers + 1;
            } break;
        case LLM_ARCH_PHI2:
            {
@@ -3341,10 +3335,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3VL:
            {
-               // for model loading, the weights only have the main embd
-               // so we need to divide by the number of deepstack layers + 1
-               // n_embd is const int so we declare a new variable
-               int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                // output

@@ -3380,10 +3370,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        case LLM_ARCH_QWEN3MOE:
        case LLM_ARCH_QWEN3VLMOE:
            {
-               // for model loading, the weights only have the main embd
-               // so we need to divide by the number of deepstack layers + 1
-               // n_embd is const int so we declare a new variable
-               int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                // output
@@ -6535,6 +6521,7 @@ void llama_model::print_info() const {
    if (!hparams.vocab_only) {
        LLAMA_LOG_INFO("%s: n_ctx_train      = %u\n",     __func__, hparams.n_ctx_train);
        LLAMA_LOG_INFO("%s: n_embd           = %u\n",     __func__, hparams.n_embd);
+       LLAMA_LOG_INFO("%s: n_embd_inp       = %u\n",     __func__, hparams.n_embd_inp());
        LLAMA_LOG_INFO("%s: n_layer          = %u\n",     __func__, hparams.n_layer);
        LLAMA_LOG_INFO("%s: n_head           = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head(il);    }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_head_kv        = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());

@@ -7380,6 +7367,10 @@ int32_t llama_model_n_embd(const llama_model * model) {
    return model->hparams.n_embd;
}

+int32_t llama_model_n_embd_inp(const llama_model * model) {
+    return model->hparams.n_embd_inp();
+}
+
int32_t llama_model_n_layer(const llama_model * model) {
    return model->hparams.n_layer;
}
@@ -1,9 +1,8 @@
#include "models.h"

llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-   const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds
    const size_t n_deepstack_layers = hparams.n_deepstack_layers;
-   const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1);
+   const int64_t n_embd = hparams.n_embd;
    const int64_t n_embd_head = hparams.n_embd_head_v;

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

@@ -1,13 +1,10 @@
#include "models.h"

llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {

-   const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds
-   const size_t n_deepstack_layers = hparams.n_deepstack_layers;
-   const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1);
+   const int64_t n_embd = hparams.n_embd;
    const int64_t n_embd_head = hparams.n_embd_head_v;

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
    GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -3385,11 +3385,11 @@ struct test_mul_mat : public test_case {
    const std::array<int64_t, 2> bs;  // dims 3 and 4
    const std::array<int64_t, 2> nr;  // repeat in dims 3 and 4
    const std::array<int64_t, 4> per; // permutation of dimensions
-   const bool v;      // whether a and b are non-contiguous views
+   const int64_t k_v; // size of k in memory, resulting in a non-contiguous view for k_v > k, no view for k_v == 0
    const uint32_t o;  // number of outputs

    std::string vars() override {
-       return VARS_TO_STR10(type_a, type_b, m, n, k, bs, nr, per, v, o);
+       return VARS_TO_STR10(type_a, type_b, m, n, k, bs, nr, per, k_v, o);
    }

    double max_nmse_err() override {

@@ -3410,8 +3410,8 @@ struct test_mul_mat : public test_case {
            std::array<int64_t, 2> bs = {10, 10},
            std::array<int64_t, 2> nr = {2, 2},
            std::array<int64_t, 4> per = {0, 1, 2, 3},
-           bool v = false, uint32_t o = 1)
-       : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v), o(o) {}
+           int64_t k_v = 0, uint32_t o = 1)
+       : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), k_v(k_v), o(o) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        // C^T = A * B^T: (k, m) * (k, n) => (m, n)
@@ -3421,7 +3421,7 @@ struct test_mul_mat : public test_case {
        const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 2) + (per[3] != 3);
        if (npermuted > 0) {
            GGML_ASSERT(npermuted == 2);
-           GGML_ASSERT(!v); // not handled
+           GGML_ASSERT(k_v == 0); // not handled
            GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0);
            GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0);

@@ -3445,29 +3445,21 @@ struct test_mul_mat : public test_case {
            ggml_set_name(a, "a_permuted");
            ggml_set_name(b, "b_permuted");
        } else {
-           if (v) {
-               a = ggml_new_tensor_4d(ctx, type_a, k*2, m, bs[0], bs[1]);
-               b = ggml_new_tensor_4d(ctx, type_b, k*2, n, bs[0]*nr[0], bs[1]*nr[1]);
+           const int64_t k_physical = k_v == 0 ? k : k_v;
+           a = ggml_new_tensor_4d(ctx, type_a, k_physical, m, bs[0], bs[1]);
+           b = ggml_new_tensor_4d(ctx, type_b, k_physical, n, bs[0]*nr[0], bs[1]*nr[1]);

-               if (!ggml_is_quantized(type_a)) {
-                   if (bs[1] == 1 && nr[1] == 1) {
-                       ggml_set_param(a);
-                   }
-                   ggml_set_param(b);
+           if (!ggml_is_quantized(type_a)) {
+               if (bs[1] == 1 && nr[1] == 1) {
+                   ggml_set_param(a);
                }
+               ggml_set_param(b);
+           }

-               a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0);
-               b = ggml_view_4d(ctx, b, k, n, bs[0]*nr[0], bs[1]*nr[1], b->nb[1], b->nb[2], b->nb[3], 0);
-           } else {
-               a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
-               b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
-
-               if (!ggml_is_quantized(type_a)) {
-                   if (bs[1] == 1 && nr[1] == 1) {
-                       ggml_set_param(a);
-                   }
-                   ggml_set_param(b);
-               }
+           if (k_v != 0) {
+               GGML_ASSERT(k_v > k);
+               a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0);
+               b = ggml_view_4d(ctx, b, k, n, bs[0]*nr[0], bs[1]*nr[1], b->nb[1], b->nb[2], b->nb[3], 0);
            }
            ggml_set_name(a, "a");
            ggml_set_name(b, "b");
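What a non-zero k_v exercises, as a standalone sketch (public ggml API; the sizes are assumed for illustration): rows are allocated k_v elements wide but only the first k are viewed, so nb[1] keeps the physical stride and the operand is non-contiguous.

// k = 32 logical columns inside rows that are physically k_v = 64 floats wide.
ggml_tensor * a_full = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 16, 1, 1);
ggml_tensor * a      = ggml_view_4d(ctx, a_full, 32, 16, 1, 1,
                                    a_full->nb[1], a_full->nb[2], a_full->nb[3], 0);
// a->ne[0] == 32, but a->nb[1] == 64 * sizeof(float): each row skips 32 unused floats.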
@@ -6656,6 +6648,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16,  GGML_TYPE_F16,  {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32,  GGML_TYPE_F32,  {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+   test_cases.emplace_back(new test_cpy(GGML_TYPE_F32,  GGML_TYPE_F32,  {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));

    test_cases.emplace_back(new test_cont());
    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}));
@@ -6901,7 +6894,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3}));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3}));
-   test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1, 1}, {1, 1}, {0, 1, 2, 3}, true, 3));
+   test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1, 1}, {1, 1}, {0, 1, 2, 3}, 64, 3));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 64, 77, 77, {12,1}, {1,1}));

#if 0

@@ -6927,7 +6920,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
            for (uint32_t k = 0; k < 2; ++k) {
                for (ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
                    test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, bs2}, {nr, 1}, {0, 2, 1, 3}));
-                   test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, bs2}, {nr, 1}, {0, 1, 2, 3}, true));
+                   test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, bs2}, {nr, 1}, {0, 1, 2, 3}, 2*1056 + k));
                }
            }
        }
@@ -7432,7 +7425,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
    test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1}));

    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
-   test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, true));
+   test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416));

    for (int bs : {1, 2, 3, 4, 5, 8, 512}) {
        for (ggml_type type_a : all_types) {
@@ -1919,6 +1919,12 @@ struct sql_printer : public printer {
    }
};

+struct ctx_state {
+    int depth = 0; // in tokens
+
+    std::vector<uint8_t> buf; // the llama_context state buffer
+};
+
static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
    llama_set_n_threads(ctx, n_threads, n_threads);

@@ -2051,6 +2057,10 @@ int main(int argc, char ** argv) {
    llama_model * lmodel = nullptr;
    const cmd_params_instance * prev_inst = nullptr;

+   // store the llama_context state at the previous depth that we performed a test
+   // ref: https://github.com/ggml-org/llama.cpp/pull/16944#issuecomment-3478151721
+   ctx_state cstate;
+
    int params_idx = 0;
    auto params_count = params_instances.size();
    for (const auto & inst : params_instances) {
@@ -2134,14 +2144,37 @@ int main(int argc, char ** argv) {
                llama_memory_clear(llama_get_memory(ctx), false);

                if (t.n_depth > 0) {
-                   if (params.progress) {
-                       fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
-                               i + 1, params.reps);
-                   }
-                   bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
-                   if (!res) {
-                       fprintf(stderr, "%s: error: failed to run depth\n", __func__);
-                       exit(1);
-                   }
+                   bool is_cached = t.n_depth == cstate.depth;
+
+                   if (is_cached) {
+                       // if previously we have computed at this depth, just restore the state
+                       const size_t ret = llama_state_seq_set_data(ctx, cstate.buf.data(), cstate.buf.size(), 0);
+                       if (ret == 0) {
+                           // if the old state is incompatible with the current context - reprocess from scratch
+                           is_cached = false;
+                       }
+                   }
+
+                   if (!is_cached) {
+                       if (params.progress) {
+                           fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
+                                   i + 1, params.reps);
+                       }
+                       bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
+                       if (!res) {
+                           fprintf(stderr, "%s: error: failed to run depth\n", __func__);
+                           exit(1);
+                       }
+
+                       // store the context state for reuse in later runs
+                       cstate.depth = t.n_depth;
+                       cstate.buf.resize(llama_state_seq_get_size(ctx, 0));
+                       llama_state_seq_get_data(ctx, cstate.buf.data(), cstate.buf.size(), 0);
+                   } else {
+                       if (params.progress) {
+                           fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d (cached)\n", params_idx, params_count,
+                                   i + 1, params.reps);
+                       }
+                   }
                }
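The caching pattern above in isolation (a sketch: ctx is an existing llama_context *, sequence id 0, and error handling is reduced to the zero-return check the patch relies on):

#include <cstdint>
#include <vector>

// After processing the prompt once, snapshot sequence 0:
std::vector<uint8_t> buf(llama_state_seq_get_size(ctx, 0));
llama_state_seq_get_data(ctx, buf.data(), buf.size(), 0);

// On a later repetition at the same depth, restore instead of recomputing:
const size_t n = llama_state_seq_set_data(ctx, buf.data(), buf.size(), 0);
if (n == 0) {
    // state incompatible with the current context -> reprocess from scratch
}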
@@ -1083,16 +1083,24 @@ struct clip_graph {
    }

    ggml_cgraph * build_minicpmv() {
        const int batch_size = 1;

        GGML_ASSERT(model.class_embedding == nullptr);
        const int n_pos = n_patches;
+       const int n_embd_proj = clip_n_mmproj_embd(ctx);

-       // position embeddings for the projector (not for ViT)
-       int n_output_dim = clip_n_mmproj_embd(ctx);
-       ggml_tensor * pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, n_pos, batch_size);
-       ggml_set_name(pos_embed, "pos_embed");
-       ggml_set_input(pos_embed);
+       // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
+       // base frequency omega
+       ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4);
+       ggml_set_name(omega, "omega");
+       ggml_set_input(omega);
+
+       // 2D input positions (using float for sinusoidal embeddings)
+       ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
+       ggml_set_name(pos_h, "pos_h");
+       ggml_set_input(pos_h);
+       ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
+       ggml_set_name(pos_w, "pos_w");
+       ggml_set_input(pos_w);

        // for selecting learned pos embd, used by ViT
        struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);

@@ -1103,7 +1111,7 @@ struct clip_graph {

        ggml_tensor * inp = build_inp();
        ggml_tensor * embeddings = build_vit(
-                               inp, n_patches,
+                               inp, n_pos,
                                NORM_TYPE_NORMAL,
                                hparams.ffn_op,
                                learned_pos_embd,

@@ -1115,17 +1123,39 @@ struct clip_graph {
        ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);

        // norm
        q = build_norm(q, model.mm_model_ln_q_w,  model.mm_model_ln_q_b,  NORM_TYPE_NORMAL, eps, -1);
        v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);

+       // calculate sinusoidal pos embd
+       ggml_tensor * pos_embed = nullptr;
+       {
+           // outer product
+           ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows
+           ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w);
+           ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h);
+           // sin and cos
+           ggml_tensor * pos_embd_x = ggml_concat(
+               ctx0,
+               ggml_sin(ctx0, theta_x),
+               ggml_cos(ctx0, theta_x),
+               0 // concat on first dim
+           );
+           ggml_tensor * pos_embd_y = ggml_concat(
+               ctx0,
+               ggml_sin(ctx0, theta_y),
+               ggml_cos(ctx0, theta_y),
+               0 // concat on first dim
+           );
+           pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0);
+       }
+
+       // k = v + pos_embed
        ggml_tensor * k = ggml_add(ctx0, v, pos_embed);

        // attention
        {
-           int n_embd = clip_n_mmproj_embd(ctx);
            const int d_head = 128;
-           int n_head = n_embd/d_head;
+           int n_head = n_embd_proj/d_head;
            // Use actual config value if available, otherwise fall back to hardcoded values
            int num_query = ctx->model.hparams.minicpmv_query_num;
            ggml_tensor * Q = ggml_add(ctx0,
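CPU reference for what the graph above now computes with ggml_sin/ggml_cos (a sketch; D is the projector embedding dim, (x, y) the patch column/row position): theta_i = pos * omega_i with omega_i = 1 / 10000^(i / (D/4)), and the final layout is [sin(theta_x), cos(theta_x), sin(theta_y), cos(theta_y)], each segment of length D/4.

#include <cmath>
#include <vector>

std::vector<float> sincos_pos_embed_2d(int D, float x, float y) {
    const int q = D / 4;
    std::vector<float> e(D);
    for (int i = 0; i < q; ++i) {
        const float omega = 1.0f / std::pow(10000.0f, (float) i / q);
        e[i]         = std::sin(x * omega); // sin(theta_x)
        e[i + q]     = std::cos(x * omega); // cos(theta_x)
        e[i + 2 * q] = std::sin(y * omega); // sin(theta_y)
        e[i + 3 * q] = std::cos(y * omega); // cos(theta_y)
    }
    return e;
}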
@@ -4564,92 +4594,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
    return n_patches;
}

-static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
-    assert(embed_dim % 2 == 0);
-    int H = pos.size();
-    int W = pos[0].size();
-
-    std::vector<float> omega(embed_dim / 2);
-    for (int i = 0; i < embed_dim / 2; ++i) {
-        omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2));
-    }
-
-    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
-    for (int h = 0; h < H; ++h) {
-        for (int w = 0; w < W; ++w) {
-            for (int d = 0; d < embed_dim / 2; ++d) {
-                float out_value = pos[h][w] * omega[d];
-                emb[h][w][d] = sin(out_value);
-                emb[h][w][d + embed_dim / 2] = cos(out_value);
-            }
-        }
-    }
-
-    return emb;
-}
-
-static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
-    assert(embed_dim % 2 == 0);
-    std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
-    std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
-
-    int H = emb_h.size();
-    int W = emb_h[0].size();
-    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
-
-    for (int h = 0; h < H; ++h) {
-        for (int w = 0; w < W; ++w) {
-            for (int d = 0; d < embed_dim / 2; ++d) {
-                emb[h][w][d] = emb_h[h][w][d];
-                emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
-            }
-        }
-    }
-    return emb;
-}
-
-static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
-    int grid_h_size = image_size.first;
-    int grid_w_size = image_size.second;
-
-    std::vector<float> grid_h(grid_h_size);
-    std::vector<float> grid_w(grid_w_size);
-
-    for (int i = 0; i < grid_h_size; ++i) {
-        grid_h[i] = static_cast<float>(i);
-    }
-    for (int i = 0; i < grid_w_size; ++i) {
-        grid_w[i] = static_cast<float>(i);
-    }
-
-    std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
-    for (int h = 0; h < grid_h_size; ++h) {
-        for (int w = 0; w < grid_w_size; ++w) {
-            grid[h][w] = grid_w[w];
-        }
-    }
-    std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
-    for (int h = 0; h < grid_h_size; ++h) {
-        for (int w = 0; w < grid_w_size; ++w) {
-            grid_2d[0][h][w] = grid_h[h];
-            grid_2d[1][h][w] = grid_w[w];
-        }
-    }
-
-    std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
-
-    int H = image_size.first;
-    int W = image_size.second;
-    std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
-    for (int h = 0; h < H; ++h) {
-        for (int w = 0; w < W; ++w) {
-            pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
-        }
-    }
-
-    return pos_embed_2d;
-}
-
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
    clip_image_f32_batch imgs;
    clip_image_f32_ptr img_copy(clip_image_f32_init());
@@ -4788,22 +4732,28 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                }
                set_input_i32("positions", positions);

-               // inspired from resampler of Qwen-VL:
-               //    -> https://huggingface.co/Qwen/Qwen-VL/tree/main
-               //    -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
-               int embed_dim = clip_n_mmproj_embd(ctx);
-
-               // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
-               auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
-
-               std::vector<float> pos_embed(embed_dim * pos_w * pos_h);
-               for(int i = 0; i < pos_w * pos_h; ++i){
-                   for(int j = 0; j < embed_dim; ++j){
-                       pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
-                   }
+               // inputs for resampler projector
+               // set the 2D positions (using float for sinusoidal embedding)
+               int n_patches_per_col = image_size_width / patch_size;
+               std::vector<float> pos_data(n_pos);
+               // dimension H
+               for (int i = 0; i < n_pos; i++) {
+                   pos_data[i] = static_cast<float>(i / n_patches_per_col);
                }
-
-               set_input_f32("pos_embed", pos_embed);
+               set_input_f32("pos_h", pos_data);
+               // dimension W
+               for (int i = 0; i < n_pos; i++) {
+                   pos_data[i] = static_cast<float>(i % n_patches_per_col);
+               }
+               set_input_f32("pos_w", pos_data);
+               // base frequency omega
+               const float base_freq = 10000.0f;
+               const int n_embd_proj = clip_n_mmproj_embd(ctx);
+               std::vector<float> omega(n_embd_proj / 4);
+               for (int i = 0; i < n_embd_proj / 4; ++i) {
+                   omega[i] = 1.0f / std::pow(base_freq, static_cast<float>(i) / (n_embd_proj / 4));
+               }
+               set_input_f32("omega", omega);
            } break;
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN3VL:
@@ -163,7 +163,7 @@ struct mtmd_context {
        print_timings(ctx_params.print_timings),
        n_threads   (ctx_params.n_threads),
        media_marker(ctx_params.media_marker),
-       n_embd_text (llama_model_n_embd(text_model))
+       n_embd_text (llama_model_n_embd_inp(text_model))
    {
        if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
            throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
@@ -2823,6 +2823,8 @@ struct server_context {
                send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
                return false;
            }
+
+           SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl).c_str());
        }

        // initialize draft batch

@@ -3832,7 +3834,9 @@ struct server_context {
        // the largest pos_min required for a checkpoint to be useful
        const auto pos_min_thold = std::max(0, n_past - n_swa);

-       if (n_past > 0 && n_past < slot.prompt.n_tokens()) {
+       // note: disallow with mtmd contexts for now
+       // https://github.com/ggml-org/llama.cpp/issues/17043
+       if (!mctx && n_past > 0 && n_past < slot.prompt.n_tokens()) {
            const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
            if (pos_min == -1) {
                SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min);
@@ -77,10 +77,10 @@ def test_different_draft_min_draft_max():

def test_slot_ctx_not_exceeded():
    global server
-   server.n_ctx = 64
+   server.n_ctx = 256
    server.start()
    res = server.make_request("POST", "/completion", data={
-       "prompt": "Hello " * 56,
+       "prompt": "Hello " * 248,
        "temperature": 0.0,
        "top_k": 1,
        "speculative.p_min": 0.0,

@@ -91,19 +91,19 @@ def test_slot_ctx_not_exceeded():

def test_with_ctx_shift():
    global server
-   server.n_ctx = 64
+   server.n_ctx = 256
    server.enable_ctx_shift = True
    server.start()
    res = server.make_request("POST", "/completion", data={
-       "prompt": "Hello " * 56,
+       "prompt": "Hello " * 248,
        "temperature": 0.0,
        "top_k": 1,
-       "n_predict": 64,
+       "n_predict": 256,
        "speculative.p_min": 0.0,
    })
    assert res.status_code == 200
    assert len(res.body["content"]) > 0
-   assert res.body["tokens_predicted"] == 64
+   assert res.body["tokens_predicted"] == 256
    assert res.body["truncated"] == True
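These test constants presumably track the context padding introduced earlier in this change set: a requested n_ctx of 64 is now rounded up to a multiple of 256, so the tests request 256 directly and scale the prompt ("Hello " * 248) and n_predict to match, keeping the context-overflow and context-shift paths exercised.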
|
||||