Compare commits

...

7 Commits
b4196 ... b4203

Author SHA1 Message Date
Chenguang Li
b7420131bf CANN: ROPE operator optimization (#10540)
* [cann] ROPE operator optimization

Co-authored-by: noemotiovon <noemotiovon@gmail.com>
2024-11-28 14:24:46 +08:00
Xuan Son Nguyen
9f912511bc common : fix duplicated file name with hf_repo and hf_file (#10550) 2024-11-27 22:30:52 +01:00
uvos
3ad5451f3b Add some minimal optimizations for CDNA (#10498)
* Add some minimal optimizations for CDNA

* ggml_cuda: set launch bounds also for GCN as it helps there too
2024-11-27 17:10:08 +01:00
Diego Devesa
46c69e0e75 ci : faster CUDA toolkit installation method and use ccache (#10537)
* ci : faster CUDA toolkit installation method and use ccache

* remove fetch-depth

* only pack CUDA runtime on master
2024-11-27 11:03:25 +01:00
Georgi Gerganov
9e2301f4a4 metal : fix group_norm support condition (#0) 2024-11-27 11:22:14 +02:00
Georgi Gerganov
fee824a1a1 sync : ggml 2024-11-27 11:10:42 +02:00
Frankie Robertson
9150f8fef9 Do not include arm_neon.h when compiling CUDA code (ggml/1028) 2024-11-27 11:10:27 +02:00
16 changed files with 364 additions and 196 deletions

View File

@@ -892,12 +892,12 @@ jobs:
cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
cmake --build build
windows-latest-cmake-cuda:
runs-on: windows-latest
windows-2019-cmake-cuda:
runs-on: windows-2019
strategy:
matrix:
cuda: ['12.6.2']
cuda: ['12.4', '11.7']
build: ['cuda']
steps:
@@ -905,13 +905,66 @@ jobs:
id: checkout
uses: actions/checkout@v4
- name: Install CUDA toolkit
id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.19
- name: Install Cuda Toolkit 11.7
if: ${{ matrix.cuda == '11.7' }}
run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
- name: Install Cuda Toolkit 12.4
if: ${{ matrix.cuda == '12.4' }}
run: |
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
choco install unzip -y
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
- name: Install ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
cuda: ${{ matrix.cuda }}
method: 'network'
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
- name: Install Ninja
id: install_ninja
@@ -922,44 +975,12 @@ jobs:
id: cmake_build
shell: cmd
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON -DCMAKE_CUDA_ARCHITECTURES=89-real
cmake --build build --config Release -t ggml-cuda
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
windows-2019-cmake-cuda:
runs-on: windows-2019
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
strategy:
matrix:
cuda: ['12.2.0', '11.7.1']
build: ['cuda']
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Install CUDA toolkit
id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.15
with:
cuda: ${{ matrix.cuda }}
method: 'network'
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Determine tag name
id: tag
shell: bash
@@ -987,10 +1008,12 @@ jobs:
name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
- name: Copy and pack Cuda runtime
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
run: |
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
echo "Cuda install location: ${{ env.CUDA_PATH }}"
$dst='.\build\bin\cudart\'
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
- name: Upload Cuda runtime

View File

@@ -128,7 +128,11 @@ static void common_params_handle_model_default(common_params & params) {
}
params.hf_file = params.model;
} else if (params.model.empty()) {
params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
// this is to avoid different repo having same file name, or same file name in different subdirs
std::string filename = params.hf_repo + "_" + params.hf_file;
// to make sure we don't have any slashes in the filename
string_replace_all(filename, "/", "_");
params.model = fs_get_cache_file(filename);
}
} else if (!params.model_url.empty()) {
if (params.model.empty()) {

View File

@@ -829,9 +829,9 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_model * model = nullptr;
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
} else if (!params.model_url.empty()) {
model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
@@ -1342,17 +1342,17 @@ static bool common_download_file(const std::string & url, const std::string & pa
}
struct llama_model * common_load_model_from_url(
const char * model_url,
const char * path_model,
const char * hf_token,
const std::string & model_url,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params) {
// Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) {
if (model_url.empty()) {
LOG_ERR("%s: invalid model_url\n", __func__);
return NULL;
}
if (!common_download_file(model_url, path_model, hf_token)) {
if (!common_download_file(model_url, local_path, hf_token)) {
return NULL;
}
@@ -1363,9 +1363,9 @@ struct llama_model * common_load_model_from_url(
/*.no_alloc = */ true,
/*.ctx = */ NULL,
};
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
if (!ctx_gguf) {
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
return NULL;
}
@@ -1384,13 +1384,13 @@ struct llama_model * common_load_model_from_url(
// Verify the first split file format
// and extract split URL and PATH prefixes
{
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
return NULL;
}
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
return NULL;
}
}
@@ -1417,14 +1417,14 @@ struct llama_model * common_load_model_from_url(
}
}
return llama_load_model_from_file(path_model, params);
return llama_load_model_from_file(local_path.c_str(), params);
}
struct llama_model * common_load_model_from_hf(
const char * repo,
const char * model,
const char * path_model,
const char * hf_token,
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params) {
// construct hugging face model url:
//
@@ -1438,27 +1438,27 @@ struct llama_model * common_load_model_from_hf(
std::string model_url = "https://huggingface.co/";
model_url += repo;
model_url += "/resolve/main/";
model_url += model;
model_url += remote_path;
return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
return common_load_model_from_url(model_url, local_path, hf_token, params);
}
#else
struct llama_model * common_load_model_from_url(
const char * /*model_url*/,
const char * /*path_model*/,
const char * /*hf_token*/,
const std::string & /*model_url*/,
const std::string & /*local_path*/,
const std::string & /*hf_token*/,
const struct llama_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr;
}
struct llama_model * common_load_model_from_hf(
const char * /*repo*/,
const char * /*model*/,
const char * /*path_model*/,
const char * /*hf_token*/,
const std::string & /*repo*/,
const std::string & /*remote_path*/,
const std::string & /*local_path*/,
const std::string & /*hf_token*/,
const struct llama_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr;

View File

@@ -470,8 +470,17 @@ struct llama_model_params common_model_params_to_llama ( common_params
struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * common_load_model_from_url(
const std::string & model_url,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf(
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
// clear LoRA adapters from context, then apply new list of adapters
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);

View File

@@ -319,7 +319,6 @@ class ServerPreset:
server.model_hf_repo = "ggml-org/models"
server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
server.model_alias = "jina-reranker"
server.model_file = "./tmp/jina-reranker-v1-tiny-en.gguf"
server.n_ctx = 512
server.n_batch = 512
server.n_slots = 1

View File

@@ -21,22 +21,23 @@
*/
#include "aclnn_ops.h"
#include "ggml-impl.h"
#include <aclnnop/aclnn_addcdiv.h>
#include <aclnnop/aclnn_avgpool2d.h>
#include <aclnnop/aclnn_batch_matmul.h>
#include <aclnnop/aclnn_cast.h>
#include <aclnnop/aclnn_constant_pad_nd.h>
#include <aclnnop/aclnn_copy.h>
#include <aclnnop/aclnn_cos.h>
#include <aclnnop/aclnn_div.h>
#include <aclnnop/aclnn_exp.h>
#include <aclnnop/aclnn_fill_scalar.h>
#include <aclnnop/aclnn_group_norm.h>
#include <aclnnop/aclnn_index_fill_tensor.h>
#include <aclnnop/aclnn_layer_norm.h>
#include <aclnnop/aclnn_mm.h>
#include <aclnnop/aclnn_batch_matmul.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/aclnn_max_pool.h>
#include <aclnnop/aclnn_mm.h>
#include <aclnnop/aclnn_permute.h>
#include <aclnnop/aclnn_pow_tensor_tensor.h>
#include <aclnnop/aclnn_reduce_sum.h>
@@ -56,6 +57,7 @@
#include <exception>
#include <vector>
#include "ggml-impl.h"
#include "kernels/ascendc_kernels.h"
#define GGML_COMMON_DECL_C
@@ -1103,9 +1105,9 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
}
/**
* @brief Creates an ACL tensor initialized with ones using a provided buffer.
* @brief Creates an ACL tensor initialized with value using a provided buffer.
*
* This function initializes a tensor with ones using the specified buffer and
* This function initializes a tensor with value using the specified buffer and
* tensor parameters.
*
* @param ctx The context for the CANN backend operations.
@@ -1118,12 +1120,12 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
* @param type_size The size of each element in the tensor data type.
* @param value The value to be used for initializing the tensor (default
* is 1.0).
* @return An ACL tensor initialized with ones.
* @return An ACL tensor initialized with value.
*/
static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, void* buffer,
size_t n_bytes, int64_t* ne, int64_t dims,
aclDataType type, size_t type_size,
float value = 1.0f) {
static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
size_t n_bytes, int64_t* ne, int64_t dims,
aclDataType type, size_t type_size,
float value = 1.0f) {
aclTensor* acl_tensor =
aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
float alpha_host = 1.0f;
@@ -1165,7 +1167,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
aclTensor* acl_gamma = aclnn_ones(
aclTensor* acl_gamma = aclnn_values(
ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1,
ggml_cann_type_mapping(src->type), ggml_element_size(src));
@@ -1209,9 +1211,9 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
aclTensor* mask_tensor =
aclnn_ones(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne,
GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
ggml_element_size(src), value);
aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes,
src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
ggml_element_size(src), value);
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
@@ -1768,6 +1770,92 @@ static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream()));
}
/**
* @brief Performs element-wise division of tensor1 by tensor2 , multiplies the
result by the scalar value and adds it to self .
*
* Performs element-wise division of tensor1 by tensor2,
* multiplies the result by the scalar value and adds it to self .
* The operation is defined as:
* \f[
* \text{out}_i = \text{selft}_i + \text{value} \times
\frac{\text{tensor1}_i}{\text{tensor2}_i}
* \f]
* @param ctx The context for the CANN backend operations.
* @param acl_self The source tensor on which the addcdiv function will be
applied.
* @param tensor1 Numerator tensor.
* @param tensor2 Denominator tensor.
* @param value The value to be used for coefficient.
*/
static void aclnn_inplace_addcdiv(ggml_backend_cann_context& ctx,
aclTensor* acl_self, aclTensor* tensor1,
aclTensor* tensor2, float value) {
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
ACL_CHECK(aclnnInplaceAddcdivGetWorkspaceSize(
acl_self, tensor1, tensor2, acl_value, &workspaceSize, &executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
ACL_CHECK(aclnnInplaceAddcdiv(workspaceAddr, workspaceSize, executor,
ctx.stream()));
}
/**
* @brief Matrix division, optionally in-place.
*
* This function division each element of the source tensor `acl_src` by the
* tensor `acl_other` and stores the result in the destination tensor `acl_dst`.
* If `inplace` is true, `acl_dst` will not be used and the operation is
* performed in-place on `acl_src`. The operation is defined as: \f[
* \text{dst}_i = \frac{\text{acl_src}_i}{\text{acl_other}_i}
* \f]
*
* @param ctx The context for the CANN backend operations.
* @param acl_src Numerator tensor..
* @param acl_other Denominator tensor.
* @param acl_dst The destination tensor where the result will be stored if
* `inplace` is false.
* @param inplace Flag indicating whether to perform the operation in-place on
* `acl_src`.
*/
static void aclnn_div_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_src,
aclTensor* acl_other, aclTensor* acl_dst,
bool inplace) {
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
if (inplace) {
ACL_CHECK(aclnnInplaceDivGetWorkspaceSize(acl_src, acl_other,
&workspaceSize, &executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
ACL_CHECK(aclnnInplaceDiv(workspaceAddr, workspaceSize, executor,
ctx.stream()));
} else {
ACL_CHECK(aclnnDivGetWorkspaceSize(acl_src, acl_other, acl_dst,
&workspaceSize, &executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
ACL_CHECK(
aclnnDiv(workspaceAddr, workspaceSize, executor, ctx.stream()));
}
}
void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
ggml_tensor* dst) {
const ggml_tensor* src = dst->src[0];
@@ -2311,12 +2399,13 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ctx.stream()));
switch (src0->type) {
case GGML_TYPE_F32:
{
case GGML_TYPE_F32: {
#ifdef ASCEND_310P
// Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
// Special operation for get_row_f32 kernel of 310P: clear the
// content of dest data buffer when row is not aligned to 32 bytes
if ((src0->ne[0] % 8) != 0) {
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
}
#endif
@@ -2329,12 +2418,15 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb);
break;
}
case GGML_TYPE_F16:
{
case GGML_TYPE_F16: {
#ifdef ASCEND_310P
// Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
// Special operation for get_row_f16 kernel of 310P: clear the
// content of dest data buffer when row is not aligned to 32 bytes
if ((src0->ne[0] % 16) != 0) {
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16
size_t dst_len =
src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
ggml_type_size(
GGML_TYPE_F32); // out is also f32, even input is f16
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
}
#endif
@@ -2459,8 +2551,9 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
* @param acl_dst The destination tensor where the result of the matrix
* multiplication will be stored.
*/
static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
aclTensor* acl_weight, aclTensor* acl_dst) {
static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx,
aclTensor* acl_input, aclTensor* acl_weight,
aclTensor* acl_dst) {
int8_t cube_math_type = 2;
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
@@ -2475,8 +2568,7 @@ static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_inpu
workspaceAddr = workspace_allocator.get();
}
ACL_CHECK(
aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
ACL_CHECK(aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
}
/**
@@ -2496,8 +2588,9 @@ static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_inpu
* @param acl_dst The destination tensor where the result of the matrix
* multiplication will be stored.
*/
static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
aclTensor* acl_weight, aclTensor* acl_dst) {
static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx,
aclTensor* acl_input, aclTensor* acl_weight,
aclTensor* acl_dst) {
int8_t cube_math_type = 2;
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
@@ -2548,31 +2641,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
aclTensor* acl_input_tensor =
ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
int64_t transpose_ne[] = {
bcast_weight_ne[1], bcast_weight_ne[0],
bcast_weight_ne[2], bcast_weight_ne[3],
bcast_weight_ne[4], bcast_weight_ne[5]
};
size_t transpose_nb[] = {
bcast_weight_nb[1], bcast_weight_nb[0],
bcast_weight_nb[2], bcast_weight_nb[3],
bcast_weight_nb[4], bcast_weight_nb[5]
};
int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
bcast_weight_ne[2], bcast_weight_ne[3],
bcast_weight_ne[4], bcast_weight_ne[5]};
size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
bcast_weight_nb[2], bcast_weight_nb[3],
bcast_weight_nb[4], bcast_weight_nb[5]};
aclTensor* acl_weight_tensor =
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
aclTensor* acl_dst =
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
switch (n_dims) {
case 2:
aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
break;
case 3:
aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
break;
default:
aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
break;
case 2:
aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
break;
case 3:
aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
break;
default:
aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
break;
}
ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
@@ -2594,8 +2683,8 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
* multiplication will be stored.
*/
static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
ggml_tensor* dst,
const enum ggml_type type) {
ggml_tensor* dst,
const enum ggml_type type) {
ggml_tensor* src0 = dst->src[0]; // weight
ggml_tensor* src1 = dst->src[1]; // input
@@ -2617,14 +2706,15 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
// scale stored at the end of weight. Also need transpose.
size_t scale_elem_size = sizeof(uint16_t);
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size};
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
scale_elem_size};
size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
char* scale_offset = (char*)src0->data + weight_size;
// input
size_t input_elem_size = sizeof(uint16_t);
int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
ggml_cann_pool_alloc input_alloctor(ctx.pool());
void* input_buffer = src1->data;
@@ -2632,7 +2722,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
// case in
if (src1->type != GGML_TYPE_F16) {
aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
input_buffer =
input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
int64_t* input_cast_ne = src1->ne;
size_t input_cast_nb[GGML_MAX_DIMS];
@@ -2642,9 +2733,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
}
aclTensor* acl_input_tensor = ggml_cann_create_tensor(
input_buffer,
ACL_FLOAT16,
input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
input_cast_nb, GGML_MAX_DIMS);
aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
@@ -2655,7 +2745,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
size_t output_elem_size = sizeof(uint16_t);
size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
ggml_cann_pool_alloc output_allocator(ctx.pool());
void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
void* output_buffer =
output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
// aclnn
@@ -2679,7 +2770,9 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
// first split
int64_t weight_ne_offset = 0;
int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]};
int64_t weight_ne[2] = {
max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size,
src0->ne[0]};
int64_t scale_ne_offset = 0;
int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
int64_t output_ne_offset = 0;
@@ -2687,24 +2780,21 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
(char*)src0->data + batch0 * weight_stride,
ggml_cann_type_mapping(type),
weight_elem_size, weight_ne, weight_nb, 2,
ACL_FORMAT_ND, weight_ne_offset);
ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
scale_offset + batch0 * scale_stride,
ACL_FLOAT16,
scale_elem_size, scale_ne, scale_nb, 2,
ACL_FORMAT_ND, scale_ne_offset);
scale_offset + batch0 * scale_stride, ACL_FLOAT16,
scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
scale_ne_offset);
aclTensor* acl_output_tensor = ggml_cann_create_tensor(
(char*)output_buffer + batch1 * output_stride,
ACL_FLOAT16,
output_elem_size, output_ne, output_nb, 2,
ACL_FORMAT_ND, output_ne_offset);
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
output_ne_offset);
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
nullptr, nullptr, nullptr, nullptr, QK8_0,
acl_output_tensor, &workspaceSize, &executor));
acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
&workspaceSize, &executor));
if (workspaceAddr == nullptr) {
workspaceAddr = workspace_allocator.alloc(workspaceSize);
}
@@ -2717,28 +2807,29 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
// other splits
for (int64_t split = 1; split < split_size; split++) {
weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
weight_ne_offset +=
weight_elem_size * weight_ne[0] * weight_ne[1];
weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1]
? src0->ne[1] - (max_elem_size * split)
: max_elem_size;
scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
scale_ne[0] = weight_ne[0];
output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
output_ne_offset +=
output_elem_size * output_ne[0] * output_ne[1];
output_ne[0] = weight_ne[0];
acl_weight_tensor = ggml_cann_create_tensor(
(char*)src0->data + batch0 * weight_stride,
ggml_cann_type_mapping(type),
weight_elem_size, weight_ne, weight_nb, 2,
ACL_FORMAT_ND, weight_ne_offset);
ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
acl_scale_tensor = ggml_cann_create_tensor(
scale_offset + batch0 * scale_stride,
ACL_FLOAT16,
scale_elem_size, scale_ne, scale_nb, 2,
ACL_FORMAT_ND, scale_ne_offset);
scale_offset + batch0 * scale_stride, ACL_FLOAT16,
scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
scale_ne_offset);
acl_output_tensor = ggml_cann_create_tensor(
(char*)output_buffer + batch1 * output_stride,
ACL_FLOAT16,
output_elem_size, output_ne, output_nb, 2,
ACL_FORMAT_ND, output_ne_offset);
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
output_ne_offset);
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
@@ -2766,11 +2857,11 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
}
aclTensor* acl_output_tensor = ggml_cann_create_tensor(
output_buffer,
ACL_FLOAT16,
output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne,
output_cast_nb, GGML_MAX_DIMS);
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor,
ggml_cann_type_mapping(dst->type));
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
@@ -2873,12 +2964,14 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
aclTensor* acl_cos_repeat_tensor,
aclTensor* acl_sin_repeat_tensor,
float theta_scale, bool is_neox) {
float theta_scale, float freq_scale,
bool is_neox) {
// int sin/cos cache, cache has different repeat method depond on
// @param.is_neox
ggml_tensor* src0 = dst->src[0]; // input
ggml_tensor* src1 = dst->src[1]; // position
ggml_tensor* src2 = dst->src[2]; // freq_factors
// arange, [0,1,...,ne0/2]
int64_t arange_length = src0->ne[0] / 2;
@@ -2907,11 +3000,25 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(),
arange_length * sizeof(float_t));
void* theta_scale_buffer = theta_scale_allocator.get();
aclTensor* acl_theta_scale_tensor = aclnn_ones(
aclTensor* acl_theta_scale_tensor = aclnn_values(
ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne,
GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale);
aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor);
// freq_scale
if (freq_scale != 1) {
aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
}
// freq_factors
if (src2) {
aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
src2->data, ggml_cann_type_mapping(src2->type),
ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS);
aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor,
nullptr, true);
}
// position
GGML_ASSERT(src1->type == GGML_TYPE_I32);
int64_t position_length = src1->ne[0];
@@ -2940,6 +3047,16 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
acl_theta_tensor);
// // power[] * position[] * freq_scale / freq_factors[]
// ggml_cann_pool_alloc theta_final_allocator(ctx.pool(),
// theta_length *
// sizeof(float_t));
// aclTensor* acl_theat_final_tensor = aclnn_zero(
// ctx, theta_final_allocator.get(), sizeof(float_t) * theta_length,
// theta_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t));
// aclnn_inplace_addcdiv(ctx, acl_theat_final_tensor, acl_theta_tensor,
// acl_freq_factors_tensor, freq_scale);
// permute: [0,1,2,3]->[0,2,1,3]
int64_t permute_ne[] = {arange_length, 1, position_length, 1};
size_t permute_nb[GGML_MAX_DIMS];
@@ -3038,8 +3155,6 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
// TODO: with freq_factors
GGML_ASSERT(src2 == NULL);
// TODO: attn_factor != 1
GGML_ASSERT(attn_factor == 1);
// TODO: n_dims <= ne0
@@ -3047,8 +3162,6 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
GGML_ASSERT(n_dims % 2 == 0);
// TODO: ext_factor != 0
GGML_ASSERT(ext_factor == 0);
// TODO: freq_scale != 1
GGML_ASSERT(freq_scale == 1);
// TODO: type == GGML_TYPE_F16
GGML_ASSERT(src0->type == GGML_TYPE_F32);
@@ -3081,7 +3194,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
theta_scale, is_neox);
theta_scale, freq_scale, is_neox);
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
@@ -3096,7 +3209,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
aclTensor* acl_x = ggml_cann_create_tensor(src0);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor));
acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
acl_dst, &workspaceSize, &executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();

View File

@@ -1738,13 +1738,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
}
case GGML_OP_ROPE: {
// TODO: with ops-test v == 1
float * freq_scale = (float*)((int32_t*)op->op_params + 6);
float * ext_factor = (float*)((int32_t*)op->op_params + 7);
float * attn_factor = (float*)((int32_t*)op->op_params + 8);
// TODO: with freq_factors
if (op->src[2] != NULL) {
return false;
}
// TODO: n_dims <= ne0
if (op->src[0]->ne[0] != op->op_params[1]) {
return false;
@@ -1753,10 +1748,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
if (*ext_factor != 0) {
return false;
}
// TODO: freq_scale != 1
if (*freq_scale != 1) {
return false;
}
// TODO: attn_factor != 1
if (*attn_factor != 1) {
return false;

View File

@@ -47,9 +47,20 @@
#define CC_TURING 750
#define CC_AMPERE 800
#define CC_OFFSET_AMD 1000000
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
// GCN/CNDA, wave size is 64
#define CC_GCN4 (CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
#define CC_VEGA (CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
#define CC_VEGA20 (CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
#define CC_CDNA (CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
#define CC_CDNA2 (CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing
#define CC_CDNA3 (CC_OFFSET_AMD + 942) // MI300
// RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
#define CC_RDNA1 (CC_OFFSET_AMD + 1010) // RX 5000
#define CC_RDNA2 (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
#define CC_RDNA3 (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
#define CC_QY1 210
#define CC_QY2 220

View File

@@ -1107,6 +1107,11 @@ static void ggml_cuda_op_mul_mat_cublas(
const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f;
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
cu_compute_type = CUBLAS_COMPUTE_32F;
}
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
CUBLAS_CHECK(
cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
@@ -1114,7 +1119,7 @@ static void ggml_cuda_op_mul_mat_cublas(
&alpha_f16, src0_ptr, CUDA_R_16F, ne00,
src1_ptr, CUDA_R_16F, ne10,
&beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
CUBLAS_COMPUTE_16F,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
@@ -1607,6 +1612,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
cudaDataType_t cu_data_type = CUDA_R_16F;
if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
cu_compute_type = CUBLAS_COMPUTE_32F;
}
// dst strides
size_t nbd2 = dst->nb[2];
size_t nbd3 = dst->nb[3];

View File

@@ -148,5 +148,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}

View File

@@ -2570,9 +2570,9 @@ static __device__ void mul_mat_q_process_tile(
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
__launch_bounds__(WARP_SIZE*nwarps, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
#else
#if __CUDA_ARCH__ >= CC_VOLTA
__launch_bounds__(WARP_SIZE*nwarps, 1)

View File

@@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda(
int64_t nwarps = 1;
int64_t rows_per_cuda_block = 1;
if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
if (ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
switch(ncols_y) {
case 1:
nwarps = 4;

View File

@@ -95,6 +95,14 @@
#define __CUDA_ARCH__ 1300
#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
#define GCN
#endif
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
#define CDNA
#endif
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3

View File

@@ -14,7 +14,7 @@
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE
#if defined(__ARM_NEON)
#if defined(__ARM_NEON) && !defined(__CUDACC__)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/

View File

@@ -997,9 +997,10 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
return ggml_is_contiguous(op->src[0]);
case GGML_OP_SUM_ROWS:
case GGML_OP_SOFT_MAX:
case GGML_OP_RMS_NORM:
case GGML_OP_GROUP_NORM:
return has_simdgroup_reduction;
case GGML_OP_RMS_NORM:
return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
case GGML_OP_NORM:
case GGML_OP_ROPE:
return true;
@@ -2672,7 +2673,6 @@ static void ggml_metal_encode_node(
} break;
case GGML_OP_GROUP_NORM:
{
GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(ggml_is_contiguous(src0));
float eps;

View File

@@ -1 +1 @@
6fcbd60bc72ac3f7ad43f78c87e535f2e6206f58
c598cbe30621251e80acbcf3b601589a37c17f4d