mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-05 08:34:21 +00:00
Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b7420131bf | ||
|
|
9f912511bc | ||
|
|
3ad5451f3b | ||
|
|
46c69e0e75 | ||
|
|
9e2301f4a4 | ||
|
|
fee824a1a1 | ||
|
|
9150f8fef9 |
117
.github/workflows/build.yml
vendored
117
.github/workflows/build.yml
vendored
@@ -892,12 +892,12 @@ jobs:
|
||||
cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
|
||||
cmake --build build
|
||||
|
||||
windows-latest-cmake-cuda:
|
||||
runs-on: windows-latest
|
||||
windows-2019-cmake-cuda:
|
||||
runs-on: windows-2019
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.6.2']
|
||||
cuda: ['12.4', '11.7']
|
||||
build: ['cuda']
|
||||
|
||||
steps:
|
||||
@@ -905,13 +905,66 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install CUDA toolkit
|
||||
id: cuda-toolkit
|
||||
uses: Jimver/cuda-toolkit@v0.2.19
|
||||
- name: Install Cuda Toolkit 11.7
|
||||
if: ${{ matrix.cuda == '11.7' }}
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Install Cuda Toolkit 12.4
|
||||
if: ${{ matrix.cuda == '12.4' }}
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Install ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2
|
||||
with:
|
||||
cuda: ${{ matrix.cuda }}
|
||||
method: 'network'
|
||||
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
|
||||
key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
@@ -922,44 +975,12 @@ jobs:
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
run: |
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON -DCMAKE_CUDA_ARCHITECTURES=89-real
|
||||
cmake --build build --config Release -t ggml-cuda
|
||||
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
||||
cmake --build build --config Release
|
||||
|
||||
windows-2019-cmake-cuda:
|
||||
runs-on: windows-2019
|
||||
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.2.0', '11.7.1']
|
||||
build: ['cuda']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install CUDA toolkit
|
||||
id: cuda-toolkit
|
||||
uses: Jimver/cuda-toolkit@v0.2.15
|
||||
with:
|
||||
cuda: ${{ matrix.cuda }}
|
||||
method: 'network'
|
||||
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
|
||||
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
|
||||
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
@@ -987,10 +1008,12 @@ jobs:
|
||||
name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
run: |
|
||||
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
||||
echo "Cuda install location: ${{ env.CUDA_PATH }}"
|
||||
$dst='.\build\bin\cudart\'
|
||||
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
|
||||
@@ -128,7 +128,11 @@ static void common_params_handle_model_default(common_params & params) {
|
||||
}
|
||||
params.hf_file = params.model;
|
||||
} else if (params.model.empty()) {
|
||||
params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
|
||||
// this is to avoid different repo having same file name, or same file name in different subdirs
|
||||
std::string filename = params.hf_repo + "_" + params.hf_file;
|
||||
// to make sure we don't have any slashes in the filename
|
||||
string_replace_all(filename, "/", "_");
|
||||
params.model = fs_get_cache_file(filename);
|
||||
}
|
||||
} else if (!params.model_url.empty()) {
|
||||
if (params.model.empty()) {
|
||||
|
||||
@@ -829,9 +829,9 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
llama_model * model = nullptr;
|
||||
|
||||
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
|
||||
model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
||||
model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
|
||||
} else if (!params.model_url.empty()) {
|
||||
model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
|
||||
model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
|
||||
} else {
|
||||
model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||
}
|
||||
@@ -1342,17 +1342,17 @@ static bool common_download_file(const std::string & url, const std::string & pa
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_url(
|
||||
const char * model_url,
|
||||
const char * path_model,
|
||||
const char * hf_token,
|
||||
const std::string & model_url,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params) {
|
||||
// Basic validation of the model_url
|
||||
if (!model_url || strlen(model_url) == 0) {
|
||||
if (model_url.empty()) {
|
||||
LOG_ERR("%s: invalid model_url\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!common_download_file(model_url, path_model, hf_token)) {
|
||||
if (!common_download_file(model_url, local_path, hf_token)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -1363,9 +1363,9 @@ struct llama_model * common_load_model_from_url(
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ NULL,
|
||||
};
|
||||
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
|
||||
auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
|
||||
if (!ctx_gguf) {
|
||||
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
|
||||
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -1384,13 +1384,13 @@ struct llama_model * common_load_model_from_url(
|
||||
// Verify the first split file format
|
||||
// and extract split URL and PATH prefixes
|
||||
{
|
||||
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
|
||||
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
|
||||
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@@ -1417,14 +1417,14 @@ struct llama_model * common_load_model_from_url(
|
||||
}
|
||||
}
|
||||
|
||||
return llama_load_model_from_file(path_model, params);
|
||||
return llama_load_model_from_file(local_path.c_str(), params);
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
const char * repo,
|
||||
const char * model,
|
||||
const char * path_model,
|
||||
const char * hf_token,
|
||||
const std::string & repo,
|
||||
const std::string & remote_path,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params) {
|
||||
// construct hugging face model url:
|
||||
//
|
||||
@@ -1438,27 +1438,27 @@ struct llama_model * common_load_model_from_hf(
|
||||
std::string model_url = "https://huggingface.co/";
|
||||
model_url += repo;
|
||||
model_url += "/resolve/main/";
|
||||
model_url += model;
|
||||
model_url += remote_path;
|
||||
|
||||
return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
|
||||
return common_load_model_from_url(model_url, local_path, hf_token, params);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
struct llama_model * common_load_model_from_url(
|
||||
const char * /*model_url*/,
|
||||
const char * /*path_model*/,
|
||||
const char * /*hf_token*/,
|
||||
const std::string & /*model_url*/,
|
||||
const std::string & /*local_path*/,
|
||||
const std::string & /*hf_token*/,
|
||||
const struct llama_model_params & /*params*/) {
|
||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
const char * /*repo*/,
|
||||
const char * /*model*/,
|
||||
const char * /*path_model*/,
|
||||
const char * /*hf_token*/,
|
||||
const std::string & /*repo*/,
|
||||
const std::string & /*remote_path*/,
|
||||
const std::string & /*local_path*/,
|
||||
const std::string & /*hf_token*/,
|
||||
const struct llama_model_params & /*params*/) {
|
||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
||||
return nullptr;
|
||||
|
||||
@@ -470,8 +470,17 @@ struct llama_model_params common_model_params_to_llama ( common_params
|
||||
struct llama_context_params common_context_params_to_llama(const common_params & params);
|
||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
|
||||
|
||||
struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||
struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||
struct llama_model * common_load_model_from_url(
|
||||
const std::string & model_url,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params);
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
const std::string & repo,
|
||||
const std::string & remote_path,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params);
|
||||
|
||||
// clear LoRA adapters from context, then apply new list of adapters
|
||||
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
|
||||
|
||||
@@ -319,7 +319,6 @@ class ServerPreset:
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
|
||||
server.model_alias = "jina-reranker"
|
||||
server.model_file = "./tmp/jina-reranker-v1-tiny-en.gguf"
|
||||
server.n_ctx = 512
|
||||
server.n_batch = 512
|
||||
server.n_slots = 1
|
||||
|
||||
@@ -21,22 +21,23 @@
|
||||
*/
|
||||
|
||||
#include "aclnn_ops.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include <aclnnop/aclnn_addcdiv.h>
|
||||
#include <aclnnop/aclnn_avgpool2d.h>
|
||||
#include <aclnnop/aclnn_batch_matmul.h>
|
||||
#include <aclnnop/aclnn_cast.h>
|
||||
#include <aclnnop/aclnn_constant_pad_nd.h>
|
||||
#include <aclnnop/aclnn_copy.h>
|
||||
#include <aclnnop/aclnn_cos.h>
|
||||
#include <aclnnop/aclnn_div.h>
|
||||
#include <aclnnop/aclnn_exp.h>
|
||||
#include <aclnnop/aclnn_fill_scalar.h>
|
||||
#include <aclnnop/aclnn_group_norm.h>
|
||||
#include <aclnnop/aclnn_index_fill_tensor.h>
|
||||
#include <aclnnop/aclnn_layer_norm.h>
|
||||
#include <aclnnop/aclnn_mm.h>
|
||||
#include <aclnnop/aclnn_batch_matmul.h>
|
||||
#include <aclnnop/aclnn_matmul.h>
|
||||
#include <aclnnop/aclnn_max_pool.h>
|
||||
#include <aclnnop/aclnn_mm.h>
|
||||
#include <aclnnop/aclnn_permute.h>
|
||||
#include <aclnnop/aclnn_pow_tensor_tensor.h>
|
||||
#include <aclnnop/aclnn_reduce_sum.h>
|
||||
@@ -56,6 +57,7 @@
|
||||
#include <exception>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "kernels/ascendc_kernels.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
@@ -1103,9 +1105,9 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Creates an ACL tensor initialized with ones using a provided buffer.
|
||||
* @brief Creates an ACL tensor initialized with value using a provided buffer.
|
||||
*
|
||||
* This function initializes a tensor with ones using the specified buffer and
|
||||
* This function initializes a tensor with value using the specified buffer and
|
||||
* tensor parameters.
|
||||
*
|
||||
* @param ctx The context for the CANN backend operations.
|
||||
@@ -1118,12 +1120,12 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
|
||||
* @param type_size The size of each element in the tensor data type.
|
||||
* @param value The value to be used for initializing the tensor (default
|
||||
* is 1.0).
|
||||
* @return An ACL tensor initialized with ones.
|
||||
* @return An ACL tensor initialized with value.
|
||||
*/
|
||||
static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, void* buffer,
|
||||
size_t n_bytes, int64_t* ne, int64_t dims,
|
||||
aclDataType type, size_t type_size,
|
||||
float value = 1.0f) {
|
||||
static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
|
||||
size_t n_bytes, int64_t* ne, int64_t dims,
|
||||
aclDataType type, size_t type_size,
|
||||
float value = 1.0f) {
|
||||
aclTensor* acl_tensor =
|
||||
aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
|
||||
float alpha_host = 1.0f;
|
||||
@@ -1165,7 +1167,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
|
||||
ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
|
||||
|
||||
aclTensor* acl_gamma = aclnn_ones(
|
||||
aclTensor* acl_gamma = aclnn_values(
|
||||
ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1,
|
||||
ggml_cann_type_mapping(src->type), ggml_element_size(src));
|
||||
|
||||
@@ -1209,9 +1211,9 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
|
||||
ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
|
||||
|
||||
aclTensor* mask_tensor =
|
||||
aclnn_ones(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne,
|
||||
GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
|
||||
ggml_element_size(src), value);
|
||||
aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes,
|
||||
src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
|
||||
ggml_element_size(src), value);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
@@ -1768,6 +1770,92 @@ static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs element-wise division of tensor1 by tensor2, multiplies the
|
||||
result by the scalar value and adds it to self.
|
||||
*
|
||||
* Performs element-wise division of tensor1 by tensor2,
|
||||
* multiplies the result by the scalar value and adds it to self.
|
||||
* The operation is defined as:
|
||||
* \f[
|
||||
* \text{out}_i = \text{self}_i + \text{value} \times
|
||||
\frac{\text{tensor1}_i}{\text{tensor2}_i}
|
||||
* \f]
|
||||
|
||||
* @param ctx The context for the CANN backend operations.
|
||||
* @param acl_self The source tensor on which the addcdiv function will be
|
||||
applied.
|
||||
* @param tensor1 Numerator tensor.
|
||||
* @param tensor2 Denominator tensor.
|
||||
* @param value The value to be used for coefficient.
|
||||
*/
|
||||
static void aclnn_inplace_addcdiv(ggml_backend_cann_context& ctx,
|
||||
aclTensor* acl_self, aclTensor* tensor1,
|
||||
aclTensor* tensor2, float value) {
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
|
||||
|
||||
ACL_CHECK(aclnnInplaceAddcdivGetWorkspaceSize(
|
||||
acl_self, tensor1, tensor2, acl_value, &workspaceSize, &executor));
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
ACL_CHECK(aclnnInplaceAddcdiv(workspaceAddr, workspaceSize, executor,
|
||||
ctx.stream()));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Element-wise tensor division, optionally in-place.
|
||||
*
|
||||
* This function divides each element of the source tensor `acl_src` by the
|
||||
* tensor `acl_other` and stores the result in the destination tensor `acl_dst`.
|
||||
* If `inplace` is true, `acl_dst` will not be used and the operation is
|
||||
* performed in-place on `acl_src`. The operation is defined as: \f[
|
||||
* \text{dst}_i = \frac{\text{acl_src}_i}{\text{acl_other}_i}
|
||||
* \f]
|
||||
*
|
||||
* @param ctx The context for the CANN backend operations.
|
||||
* @param acl_src Numerator tensor.
|
||||
* @param acl_other Denominator tensor.
|
||||
* @param acl_dst The destination tensor where the result will be stored if
|
||||
* `inplace` is false.
|
||||
* @param inplace Flag indicating whether to perform the operation in-place on
|
||||
* `acl_src`.
|
||||
*/
|
||||
static void aclnn_div_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
aclTensor* acl_other, aclTensor* acl_dst,
|
||||
bool inplace) {
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
|
||||
if (inplace) {
|
||||
ACL_CHECK(aclnnInplaceDivGetWorkspaceSize(acl_src, acl_other,
|
||||
&workspaceSize, &executor));
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
ACL_CHECK(aclnnInplaceDiv(workspaceAddr, workspaceSize, executor,
|
||||
ctx.stream()));
|
||||
} else {
|
||||
ACL_CHECK(aclnnDivGetWorkspaceSize(acl_src, acl_other, acl_dst,
|
||||
&workspaceSize, &executor));
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
ACL_CHECK(
|
||||
aclnnDiv(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
|
||||
ggml_tensor* dst) {
|
||||
const ggml_tensor* src = dst->src[0];
|
||||
@@ -2311,12 +2399,13 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ctx.stream()));
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
case GGML_TYPE_F32: {
|
||||
#ifdef ASCEND_310P
|
||||
// Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
|
||||
// Special operation for get_row_f32 kernel of 310P: clear the
|
||||
// content of dest data buffer when row is not aligned to 32 bytes
|
||||
if ((src0->ne[0] % 8) != 0) {
|
||||
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
|
||||
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
|
||||
src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
|
||||
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
|
||||
}
|
||||
#endif
|
||||
@@ -2329,12 +2418,15 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
case GGML_TYPE_F16: {
|
||||
#ifdef ASCEND_310P
|
||||
// Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
|
||||
// Special operation for get_row_f16 kernel of 310P: clear the
|
||||
// content of dest data buffer when row is not aligned to 32 bytes
|
||||
if ((src0->ne[0] % 16) != 0) {
|
||||
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16
|
||||
size_t dst_len =
|
||||
src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
|
||||
ggml_type_size(
|
||||
GGML_TYPE_F32); // output is also f32, even when the input is f16
|
||||
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
|
||||
}
|
||||
#endif
|
||||
@@ -2459,8 +2551,9 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
|
||||
* @param acl_dst The destination tensor where the result of the matrix
|
||||
* multiplication will be stored.
|
||||
*/
|
||||
static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
|
||||
aclTensor* acl_weight, aclTensor* acl_dst) {
|
||||
static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx,
|
||||
aclTensor* acl_input, aclTensor* acl_weight,
|
||||
aclTensor* acl_dst) {
|
||||
int8_t cube_math_type = 2;
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
@@ -2475,8 +2568,7 @@ static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_inpu
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
ACL_CHECK(
|
||||
aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
ACL_CHECK(aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -2496,8 +2588,9 @@ static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_inpu
|
||||
* @param acl_dst The destination tensor where the result of the matrix
|
||||
* multiplication will be stored.
|
||||
*/
|
||||
static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
|
||||
aclTensor* acl_weight, aclTensor* acl_dst) {
|
||||
static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx,
|
||||
aclTensor* acl_input, aclTensor* acl_weight,
|
||||
aclTensor* acl_dst) {
|
||||
int8_t cube_math_type = 2;
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
@@ -2548,31 +2641,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
|
||||
|
||||
aclTensor* acl_input_tensor =
|
||||
ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
|
||||
int64_t transpose_ne[] = {
|
||||
bcast_weight_ne[1], bcast_weight_ne[0],
|
||||
bcast_weight_ne[2], bcast_weight_ne[3],
|
||||
bcast_weight_ne[4], bcast_weight_ne[5]
|
||||
};
|
||||
size_t transpose_nb[] = {
|
||||
bcast_weight_nb[1], bcast_weight_nb[0],
|
||||
bcast_weight_nb[2], bcast_weight_nb[3],
|
||||
bcast_weight_nb[4], bcast_weight_nb[5]
|
||||
};
|
||||
int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
|
||||
bcast_weight_ne[2], bcast_weight_ne[3],
|
||||
bcast_weight_ne[4], bcast_weight_ne[5]};
|
||||
size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
|
||||
bcast_weight_nb[2], bcast_weight_nb[3],
|
||||
bcast_weight_nb[4], bcast_weight_nb[5]};
|
||||
aclTensor* acl_weight_tensor =
|
||||
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
|
||||
aclTensor* acl_dst =
|
||||
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
|
||||
|
||||
switch (n_dims) {
|
||||
case 2:
|
||||
aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||
break;
|
||||
case 3:
|
||||
aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||
break;
|
||||
default:
|
||||
aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||
break;
|
||||
case 2:
|
||||
aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||
break;
|
||||
case 3:
|
||||
aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||
break;
|
||||
default:
|
||||
aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||
break;
|
||||
}
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
|
||||
@@ -2594,8 +2683,8 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
|
||||
* multiplication will be stored.
|
||||
*/
|
||||
static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
ggml_tensor* dst,
|
||||
const enum ggml_type type) {
|
||||
ggml_tensor* dst,
|
||||
const enum ggml_type type) {
|
||||
ggml_tensor* src0 = dst->src[0]; // weight
|
||||
ggml_tensor* src1 = dst->src[1]; // input
|
||||
|
||||
@@ -2617,14 +2706,15 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
|
||||
// scale stored at the end of weight. Also need transpose.
|
||||
size_t scale_elem_size = sizeof(uint16_t);
|
||||
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size};
|
||||
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
|
||||
scale_elem_size};
|
||||
size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
|
||||
char* scale_offset = (char*)src0->data + weight_size;
|
||||
|
||||
// input
|
||||
size_t input_elem_size = sizeof(uint16_t);
|
||||
int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
|
||||
size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
|
||||
size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
|
||||
size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
|
||||
ggml_cann_pool_alloc input_alloctor(ctx.pool());
|
||||
void* input_buffer = src1->data;
|
||||
@@ -2632,7 +2722,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
// case in
|
||||
if (src1->type != GGML_TYPE_F16) {
|
||||
aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
|
||||
input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
|
||||
input_buffer =
|
||||
input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
|
||||
|
||||
int64_t* input_cast_ne = src1->ne;
|
||||
size_t input_cast_nb[GGML_MAX_DIMS];
|
||||
@@ -2642,9 +2733,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
}
|
||||
|
||||
aclTensor* acl_input_tensor = ggml_cann_create_tensor(
|
||||
input_buffer,
|
||||
ACL_FLOAT16,
|
||||
input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
|
||||
input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
|
||||
input_cast_nb, GGML_MAX_DIMS);
|
||||
aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||
@@ -2655,7 +2745,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
size_t output_elem_size = sizeof(uint16_t);
|
||||
size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
|
||||
ggml_cann_pool_alloc output_allocator(ctx.pool());
|
||||
void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
|
||||
void* output_buffer =
|
||||
output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
|
||||
size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
|
||||
|
||||
// aclnn
|
||||
@@ -2679,7 +2770,9 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
|
||||
// first split
|
||||
int64_t weight_ne_offset = 0;
|
||||
int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]};
|
||||
int64_t weight_ne[2] = {
|
||||
max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size,
|
||||
src0->ne[0]};
|
||||
int64_t scale_ne_offset = 0;
|
||||
int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
|
||||
int64_t output_ne_offset = 0;
|
||||
@@ -2687,24 +2780,21 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
|
||||
aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
|
||||
(char*)src0->data + batch0 * weight_stride,
|
||||
ggml_cann_type_mapping(type),
|
||||
weight_elem_size, weight_ne, weight_nb, 2,
|
||||
ACL_FORMAT_ND, weight_ne_offset);
|
||||
ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
|
||||
weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
|
||||
aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
|
||||
scale_offset + batch0 * scale_stride,
|
||||
ACL_FLOAT16,
|
||||
scale_elem_size, scale_ne, scale_nb, 2,
|
||||
ACL_FORMAT_ND, scale_ne_offset);
|
||||
scale_offset + batch0 * scale_stride, ACL_FLOAT16,
|
||||
scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
|
||||
scale_ne_offset);
|
||||
aclTensor* acl_output_tensor = ggml_cann_create_tensor(
|
||||
(char*)output_buffer + batch1 * output_stride,
|
||||
ACL_FLOAT16,
|
||||
output_elem_size, output_ne, output_nb, 2,
|
||||
ACL_FORMAT_ND, output_ne_offset);
|
||||
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
|
||||
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
|
||||
output_ne_offset);
|
||||
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
|
||||
nullptr, nullptr, nullptr, nullptr, QK8_0,
|
||||
acl_output_tensor, &workspaceSize, &executor));
|
||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
|
||||
nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
|
||||
&workspaceSize, &executor));
|
||||
if (workspaceAddr == nullptr) {
|
||||
workspaceAddr = workspace_allocator.alloc(workspaceSize);
|
||||
}
|
||||
@@ -2717,28 +2807,29 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
|
||||
// other splits
|
||||
for (int64_t split = 1; split < split_size; split++) {
|
||||
weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
|
||||
weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
|
||||
weight_ne_offset +=
|
||||
weight_elem_size * weight_ne[0] * weight_ne[1];
|
||||
weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1]
|
||||
? src0->ne[1] - (max_elem_size * split)
|
||||
: max_elem_size;
|
||||
scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
|
||||
scale_ne[0] = weight_ne[0];
|
||||
output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
|
||||
output_ne_offset +=
|
||||
output_elem_size * output_ne[0] * output_ne[1];
|
||||
output_ne[0] = weight_ne[0];
|
||||
|
||||
acl_weight_tensor = ggml_cann_create_tensor(
|
||||
(char*)src0->data + batch0 * weight_stride,
|
||||
ggml_cann_type_mapping(type),
|
||||
weight_elem_size, weight_ne, weight_nb, 2,
|
||||
ACL_FORMAT_ND, weight_ne_offset);
|
||||
ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
|
||||
weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
|
||||
acl_scale_tensor = ggml_cann_create_tensor(
|
||||
scale_offset + batch0 * scale_stride,
|
||||
ACL_FLOAT16,
|
||||
scale_elem_size, scale_ne, scale_nb, 2,
|
||||
ACL_FORMAT_ND, scale_ne_offset);
|
||||
scale_offset + batch0 * scale_stride, ACL_FLOAT16,
|
||||
scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
|
||||
scale_ne_offset);
|
||||
acl_output_tensor = ggml_cann_create_tensor(
|
||||
(char*)output_buffer + batch1 * output_stride,
|
||||
ACL_FLOAT16,
|
||||
output_elem_size, output_ne, output_nb, 2,
|
||||
ACL_FORMAT_ND, output_ne_offset);
|
||||
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
|
||||
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
|
||||
output_ne_offset);
|
||||
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
|
||||
@@ -2766,11 +2857,11 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
}
|
||||
|
||||
aclTensor* acl_output_tensor = ggml_cann_create_tensor(
|
||||
output_buffer,
|
||||
ACL_FLOAT16,
|
||||
output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
|
||||
output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne,
|
||||
output_cast_nb, GGML_MAX_DIMS);
|
||||
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
|
||||
aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
|
||||
aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor,
|
||||
ggml_cann_type_mapping(dst->type));
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
|
||||
@@ -2873,12 +2964,14 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
|
||||
static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
|
||||
aclTensor* acl_cos_repeat_tensor,
|
||||
aclTensor* acl_sin_repeat_tensor,
|
||||
float theta_scale, bool is_neox) {
|
||||
float theta_scale, float freq_scale,
|
||||
bool is_neox) {
|
||||
// init sin/cos cache; the cache uses a different repeat method depending on
|
||||
// @param.is_neox
|
||||
|
||||
ggml_tensor* src0 = dst->src[0]; // input
|
||||
ggml_tensor* src1 = dst->src[1]; // position
|
||||
ggml_tensor* src2 = dst->src[2]; // freq_factors
|
||||
|
||||
// arange, [0,1,...,ne0/2]
|
||||
int64_t arange_length = src0->ne[0] / 2;
|
||||
@@ -2907,11 +3000,25 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
|
||||
ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(),
|
||||
arange_length * sizeof(float_t));
|
||||
void* theta_scale_buffer = theta_scale_allocator.get();
|
||||
aclTensor* acl_theta_scale_tensor = aclnn_ones(
|
||||
aclTensor* acl_theta_scale_tensor = aclnn_values(
|
||||
ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne,
|
||||
GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale);
|
||||
aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor);
|
||||
|
||||
// freq_scale
|
||||
if (freq_scale != 1) {
|
||||
aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
|
||||
}
|
||||
|
||||
// freq_factors
|
||||
if (src2) {
|
||||
aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
|
||||
src2->data, ggml_cann_type_mapping(src2->type),
|
||||
ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS);
|
||||
aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor,
|
||||
nullptr, true);
|
||||
}
|
||||
|
||||
// position
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
||||
int64_t position_length = src1->ne[0];
|
||||
@@ -2940,6 +3047,16 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
|
||||
aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
|
||||
acl_theta_tensor);
|
||||
|
||||
// // power[] * position[] * freq_scale / freq_factors[]
|
||||
// ggml_cann_pool_alloc theta_final_allocator(ctx.pool(),
|
||||
// theta_length *
|
||||
// sizeof(float_t));
|
||||
// aclTensor* acl_theat_final_tensor = aclnn_zero(
|
||||
// ctx, theta_final_allocator.get(), sizeof(float_t) * theta_length,
|
||||
// theta_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t));
|
||||
// aclnn_inplace_addcdiv(ctx, acl_theat_final_tensor, acl_theta_tensor,
|
||||
// acl_freq_factors_tensor, freq_scale);
|
||||
|
||||
// permute: [0,1,2,3]->[0,2,1,3]
|
||||
int64_t permute_ne[] = {arange_length, 1, position_length, 1};
|
||||
size_t permute_nb[GGML_MAX_DIMS];
|
||||
@@ -3038,8 +3155,6 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
|
||||
memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
|
||||
|
||||
// TODO: with freq_factors
|
||||
GGML_ASSERT(src2 == NULL);
|
||||
// TODO: attn_factor != 1
|
||||
GGML_ASSERT(attn_factor == 1);
|
||||
// TODO: n_dims <= ne0
|
||||
@@ -3047,8 +3162,6 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
GGML_ASSERT(n_dims % 2 == 0);
|
||||
// TODO: ext_factor != 0
|
||||
GGML_ASSERT(ext_factor == 0);
|
||||
// TODO: freq_scale != 1
|
||||
GGML_ASSERT(freq_scale == 1);
|
||||
// TODO: type == GGML_TYPE_F16
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
|
||||
@@ -3081,7 +3194,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
|
||||
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
|
||||
aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
|
||||
theta_scale, is_neox);
|
||||
theta_scale, freq_scale, is_neox);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
@@ -3096,7 +3209,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
aclTensor* acl_x = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
|
||||
acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor));
|
||||
acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
|
||||
acl_dst, &workspaceSize, &executor));
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
|
||||
@@ -1738,13 +1738,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
case GGML_OP_ROPE: {
|
||||
// TODO: with ops-test v == 1
|
||||
float * freq_scale = (float*)((int32_t*)op->op_params + 6);
|
||||
float * ext_factor = (float*)((int32_t*)op->op_params + 7);
|
||||
float * attn_factor = (float*)((int32_t*)op->op_params + 8);
|
||||
// TODO: with freq_factors
|
||||
if (op->src[2] != NULL) {
|
||||
return false;
|
||||
}
|
||||
// TODO: n_dims <= ne0
|
||||
if (op->src[0]->ne[0] != op->op_params[1]) {
|
||||
return false;
|
||||
@@ -1753,10 +1748,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
if (*ext_factor != 0) {
|
||||
return false;
|
||||
}
|
||||
// TODO: freq_scale != 1
|
||||
if (*freq_scale != 1) {
|
||||
return false;
|
||||
}
|
||||
// TODO: attn_factor != 1
|
||||
if (*attn_factor != 1) {
|
||||
return false;
|
||||
|
||||
@@ -47,9 +47,20 @@
|
||||
#define CC_TURING 750
|
||||
#define CC_AMPERE 800
|
||||
#define CC_OFFSET_AMD 1000000
|
||||
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
|
||||
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
|
||||
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
|
||||
|
||||
// GCN/CDNA, wave size is 64
|
||||
#define CC_GCN4 (CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
|
||||
#define CC_VEGA (CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
|
||||
#define CC_VEGA20 (CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
|
||||
#define CC_CDNA (CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
|
||||
#define CC_CDNA2 (CC_OFFSET_AMD + 910) // MI210, minimum acc register renaming
|
||||
#define CC_CDNA3 (CC_OFFSET_AMD + 942) // MI300
|
||||
|
||||
// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
|
||||
#define CC_RDNA1 (CC_OFFSET_AMD + 1010) // RX 5000
|
||||
#define CC_RDNA2 (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
|
||||
#define CC_RDNA3 (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
|
||||
|
||||
#define CC_QY1 210
|
||||
#define CC_QY2 220
|
||||
|
||||
|
||||
@@ -1107,6 +1107,11 @@ static void ggml_cuda_op_mul_mat_cublas(
|
||||
const half alpha_f16 = 1.0f;
|
||||
const half beta_f16 = 0.0f;
|
||||
|
||||
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
|
||||
if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
|
||||
cu_compute_type = CUBLAS_COMPUTE_32F;
|
||||
}
|
||||
|
||||
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
@@ -1114,7 +1119,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
||||
&alpha_f16, src0_ptr, CUDA_R_16F, ne00,
|
||||
src1_ptr, CUDA_R_16F, ne10,
|
||||
&beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
|
||||
CUBLAS_COMPUTE_16F,
|
||||
cu_compute_type,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
|
||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
||||
@@ -1607,6 +1612,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
||||
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
|
||||
cudaDataType_t cu_data_type = CUDA_R_16F;
|
||||
|
||||
if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
|
||||
cu_compute_type = CUBLAS_COMPUTE_32F;
|
||||
}
|
||||
|
||||
// dst strides
|
||||
size_t nbd2 = dst->nb[2];
|
||||
size_t nbd3 = dst->nb[3];
|
||||
|
||||
@@ -148,5 +148,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
|
||||
return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
}
|
||||
|
||||
return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
}
|
||||
|
||||
@@ -2570,9 +2570,9 @@ static __device__ void mul_mat_q_process_tile(
|
||||
|
||||
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(RDNA3) || defined(RDNA2)
|
||||
#if defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
|
||||
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
||||
#endif // defined(RDNA3) || defined(RDNA2)
|
||||
#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
|
||||
#else
|
||||
#if __CUDA_ARCH__ >= CC_VOLTA
|
||||
__launch_bounds__(WARP_SIZE*nwarps, 1)
|
||||
|
||||
@@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda(
|
||||
int64_t nwarps = 1;
|
||||
int64_t rows_per_cuda_block = 1;
|
||||
|
||||
if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
|
||||
if (ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
|
||||
switch(ncols_y) {
|
||||
case 1:
|
||||
nwarps = 4;
|
||||
|
||||
8
ggml/src/ggml-cuda/vendors/hip.h
vendored
8
ggml/src/ggml-cuda/vendors/hip.h
vendored
@@ -95,6 +95,14 @@
|
||||
|
||||
#define __CUDA_ARCH__ 1300
|
||||
|
||||
#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
|
||||
#define GCN
|
||||
#endif
|
||||
|
||||
#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__)
|
||||
#define CDNA
|
||||
#endif
|
||||
|
||||
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
|
||||
defined(__gfx1150__) || defined(__gfx1151__)
|
||||
#define RDNA3
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
#include <arm_sve.h>
|
||||
#endif // __ARM_FEATURE_SVE
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
#if defined(__ARM_NEON) && !defined(__CUDACC__)
|
||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
||||
//
|
||||
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
||||
|
||||
@@ -997,9 +997,10 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
return has_simdgroup_reduction;
|
||||
case GGML_OP_RMS_NORM:
|
||||
return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
|
||||
case GGML_OP_NORM:
|
||||
case GGML_OP_ROPE:
|
||||
return true;
|
||||
@@ -2672,7 +2673,6 @@ static void ggml_metal_encode_node(
|
||||
} break;
|
||||
case GGML_OP_GROUP_NORM:
|
||||
{
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
float eps;
|
||||
|
||||
@@ -1 +1 @@
|
||||
6fcbd60bc72ac3f7ad43f78c87e535f2e6206f58
|
||||
c598cbe30621251e80acbcf3b601589a37c17f4d
|
||||
|
||||
Reference in New Issue
Block a user