mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-17 14:34:06 +00:00
Compare commits
27 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
84e1c33cde | ||
|
|
811872a59d | ||
|
|
9a4b79bcfa | ||
|
|
7066b4cce2 | ||
|
|
0eb4e12bee | ||
|
|
0cc63754b8 | ||
|
|
50d5cecbda | ||
|
|
9fd8c2687f | ||
|
|
47f931c8f9 | ||
|
|
106964e3d2 | ||
|
|
80acb7b430 | ||
|
|
10bce0450f | ||
|
|
1f922254f0 | ||
|
|
a9a678a6b2 | ||
|
|
9ca2e67762 | ||
|
|
5931c1f233 | ||
|
|
f6d12e7df8 | ||
|
|
b756441104 | ||
|
|
5a8987793f | ||
|
|
d9d54e498d | ||
|
|
cce5a90075 | ||
|
|
dc39012cba | ||
|
|
9336db462c | ||
|
|
96fa2c5e2d | ||
|
|
55ed008b2d | ||
|
|
6dfcfef078 | ||
|
|
599b3e0cd4 |
50
.github/ISSUE_TEMPLATE/01-bug-low.yml
vendored
50
.github/ISSUE_TEMPLATE/01-bug-low.yml
vendored
@@ -1,50 +0,0 @@
|
||||
name: Low Severity Bugs
|
||||
description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
|
||||
title: "Bug: "
|
||||
labels: ["bug-unconfirmed", "low severity"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
Please include information about your system, the steps to reproduce the bug,
|
||||
and the version of llama.cpp that you are using.
|
||||
If possible, please provide a minimal code example that reproduces the bug.
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: Tell us what you see!
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./llama-cli --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: What operating system are you seeing the problem on?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
77
.github/ISSUE_TEMPLATE/010-bug-compilation.yml
vendored
Normal file
77
.github/ISSUE_TEMPLATE/010-bug-compilation.yml
vendored
Normal file
@@ -0,0 +1,77 @@
|
||||
name: Bug (compilation)
|
||||
description: Something goes wrong when trying to compile llama.cpp.
|
||||
title: "Compile bug: "
|
||||
labels: ["bug-unconfirmed", "compilation"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
This issue template is intended for bug reports where the compilation of llama.cpp fails.
|
||||
Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
|
||||
If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
|
||||
by clearing `~/.cache/ccache` (on Linux).
|
||||
- type: textarea
|
||||
id: commit
|
||||
attributes:
|
||||
label: Git commit
|
||||
description: Which commit are you trying to compile?
|
||||
placeholder: |
|
||||
$git rev-parse HEAD
|
||||
84a07a17b1b08cf2b9747c633a2372782848a27f
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: Operating systems
|
||||
description: Which operating systems do you know to be affected?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: backends
|
||||
attributes:
|
||||
label: GGML backends
|
||||
description: Which GGML backends do you know to be affected?
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
|
||||
multiple: true
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: info
|
||||
attributes:
|
||||
label: Problem description & steps to reproduce
|
||||
description: >
|
||||
Please give us a summary of the problem and tell us how to reproduce it.
|
||||
If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
|
||||
placeholder: >
|
||||
I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
|
||||
Here are the exact commands that I used: ...
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: first_bad_commit
|
||||
attributes:
|
||||
label: First Bad Commit
|
||||
description: >
|
||||
If the bug was not present on an earlier version: when did it start appearing?
|
||||
If possible, please do a git bisect and identify the exact commit that introduced the bug.
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: >
|
||||
Please copy and paste any relevant log output, including the command that you entered and any generated text.
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
validations:
|
||||
required: true
|
||||
101
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
Normal file
101
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
Normal file
@@ -0,0 +1,101 @@
|
||||
name: Bug (model use)
|
||||
description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
|
||||
title: "Eval bug: "
|
||||
labels: ["bug-unconfirmed", "model evaluation"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
This issue template is intended for bug reports where the model evaluation results
|
||||
(i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
|
||||
If you encountered the issue while using an external UI (e.g. ollama),
|
||||
please reproduce your issue using one of the examples/binaries in this repository.
|
||||
The `llama-cli` binary can be used for simple and reproducible model inference.
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./llama-cli --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: Operating systems
|
||||
description: Which operating systems do you know to be affected?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: backends
|
||||
attributes:
|
||||
label: GGML backends
|
||||
description: Which GGML backends do you know to be affected?
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
|
||||
multiple: true
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: hardware
|
||||
attributes:
|
||||
label: Hardware
|
||||
description: Which CPUs/GPUs are you using?
|
||||
placeholder: >
|
||||
e.g. Ryzen 5950X + 2x RTX 4090
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: model
|
||||
attributes:
|
||||
label: Models
|
||||
description: >
|
||||
Which model(s) at which quantization were you using when encountering the bug?
|
||||
If you downloaded a GGUF file off of Huggingface, please provide a link.
|
||||
placeholder: >
|
||||
e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: info
|
||||
attributes:
|
||||
label: Problem description & steps to reproduce
|
||||
description: >
|
||||
Please give us a summary of the problem and tell us how to reproduce it.
|
||||
If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
|
||||
that information would be very much appreciated by us.
|
||||
placeholder: >
|
||||
e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
|
||||
When I use -ngl 0 it works correctly.
|
||||
Here are the exact commands that I used: ...
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: first_bad_commit
|
||||
attributes:
|
||||
label: First Bad Commit
|
||||
description: >
|
||||
If the bug was not present on an earlier version: when did it start appearing?
|
||||
If possible, please do a git bisect and identify the exact commit that introduced the bug.
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: >
|
||||
Please copy and paste any relevant log output, including the command that you entered and any generated text.
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
validations:
|
||||
required: true
|
||||
81
.github/ISSUE_TEMPLATE/019-bug-misc.yml
vendored
Normal file
81
.github/ISSUE_TEMPLATE/019-bug-misc.yml
vendored
Normal file
@@ -0,0 +1,81 @@
|
||||
name: Bug (misc.)
|
||||
description: Something is not working the way it should (and it's not covered by any of the above cases).
|
||||
title: "Misc. bug: "
|
||||
labels: ["bug-unconfirmed"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
This issue template is intended for miscellaneous bugs that don't fit into any other category.
|
||||
If you encountered the issue while using an external UI (e.g. ollama),
|
||||
please reproduce your issue using one of the examples/binaries in this repository.
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which version of our software is affected? (You can use `--version` to get a version string.)
|
||||
placeholder: |
|
||||
$./llama-cli --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: Operating systems
|
||||
description: Which operating systems do you know to be affected?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: dropdown
|
||||
id: module
|
||||
attributes:
|
||||
label: Which llama.cpp modules do you know to be affected?
|
||||
multiple: true
|
||||
options:
|
||||
- Documentation/Github
|
||||
- libllama (core library)
|
||||
- llama-cli
|
||||
- llama-server
|
||||
- llama-bench
|
||||
- llama-quantize
|
||||
- Python/Bash scripts
|
||||
- Test code
|
||||
- Other (Please specify in the next section)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: info
|
||||
attributes:
|
||||
label: Problem description & steps to reproduce
|
||||
description: >
|
||||
Please give us a summary of the problem and tell us how to reproduce it (if applicable).
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: first_bad_commit
|
||||
attributes:
|
||||
label: First Bad Commit
|
||||
description: >
|
||||
If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
|
||||
If possible, please do a git bisect and identify the exact commit that introduced the bug.
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: >
|
||||
If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text.
|
||||
This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
validations:
|
||||
required: false
|
||||
50
.github/ISSUE_TEMPLATE/02-bug-medium.yml
vendored
50
.github/ISSUE_TEMPLATE/02-bug-medium.yml
vendored
@@ -1,50 +0,0 @@
|
||||
name: Medium Severity Bug
|
||||
description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
|
||||
title: "Bug: "
|
||||
labels: ["bug-unconfirmed", "medium severity"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
Please include information about your system, the steps to reproduce the bug,
|
||||
and the version of llama.cpp that you are using.
|
||||
If possible, please provide a minimal code example that reproduces the bug.
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: Tell us what you see!
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./llama-cli --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: What operating system are you seeing the problem on?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
@@ -1,5 +1,5 @@
|
||||
name: Enhancement
|
||||
description: Used to request enhancements for llama.cpp
|
||||
description: Used to request enhancements for llama.cpp.
|
||||
title: "Feature Request: "
|
||||
labels: ["enhancement"]
|
||||
body:
|
||||
50
.github/ISSUE_TEMPLATE/03-bug-high.yml
vendored
50
.github/ISSUE_TEMPLATE/03-bug-high.yml
vendored
@@ -1,50 +0,0 @@
|
||||
name: High Severity Bug
|
||||
description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
|
||||
title: "Bug: "
|
||||
labels: ["bug-unconfirmed", "high severity"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
Please include information about your system, the steps to reproduce the bug,
|
||||
and the version of llama.cpp that you are using.
|
||||
If possible, please provide a minimal code example that reproduces the bug.
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: Tell us what you see!
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./llama-cli --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: What operating system are you seeing the problem on?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
@@ -1,5 +1,5 @@
|
||||
name: Research
|
||||
description: Track new technical research area
|
||||
description: Track new technical research area.
|
||||
title: "Research: "
|
||||
labels: ["research 🔬"]
|
||||
body:
|
||||
50
.github/ISSUE_TEMPLATE/04-bug-critical.yml
vendored
50
.github/ISSUE_TEMPLATE/04-bug-critical.yml
vendored
@@ -1,50 +0,0 @@
|
||||
name: Critical Severity Bug
|
||||
description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
|
||||
title: "Bug: "
|
||||
labels: ["bug-unconfirmed", "critical severity"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
Please include information about your system, the steps to reproduce the bug,
|
||||
and the version of llama.cpp that you are using.
|
||||
If possible, please provide a minimal code example that reproduces the bug.
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: Tell us what you see!
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./llama-cli --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: What operating system are you seeing the problem on?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
@@ -1,5 +1,5 @@
|
||||
name: Refactor (Maintainers)
|
||||
description: Used to track refactoring opportunities
|
||||
description: Used to track refactoring opportunities.
|
||||
title: "Refactor: "
|
||||
labels: ["refactor"]
|
||||
body:
|
||||
29
.github/workflows/build.yml
vendored
29
.github/workflows/build.yml
vendored
@@ -952,7 +952,7 @@ jobs:
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -962,7 +962,8 @@ jobs:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install
|
||||
run: scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
@@ -981,26 +982,34 @@ jobs:
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
- name: Build the release package
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
|
||||
run: |
|
||||
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
- name: Upload the release package
|
||||
if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
|
||||
|
||||
11
.github/workflows/docker.yml
vendored
11
.github/workflows/docker.yml
vendored
@@ -10,12 +10,10 @@
|
||||
name: Publish Docker image
|
||||
|
||||
on:
|
||||
#pull_request:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
|
||||
workflow_dispatch: # allows manual triggering, useful for debugging
|
||||
workflow_dispatch: # allows manual triggering
|
||||
schedule:
|
||||
# Rebuild daily rather than on every push because it is expensive
|
||||
- cron: '12 4 * * *'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
@@ -29,7 +27,6 @@ permissions:
|
||||
jobs:
|
||||
push_to_registry:
|
||||
name: Push Docker image to Docker Hub
|
||||
#if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
|
||||
@@ -163,8 +163,11 @@ if (GGML_TARGET_DEFINES)
|
||||
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
|
||||
endif()
|
||||
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
|
||||
|
||||
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
|
||||
# all public headers
|
||||
set(LLAMA_PUBLIC_HEADERS
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
|
||||
set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
|
||||
install(TARGETS llama LIBRARY PUBLIC_HEADER)
|
||||
|
||||
configure_package_config_file(
|
||||
|
||||
10
Makefile
10
Makefile
@@ -34,6 +34,7 @@ BUILD_TARGETS = \
|
||||
llama-server \
|
||||
llama-simple \
|
||||
llama-simple-chat \
|
||||
llama-run \
|
||||
llama-speculative \
|
||||
llama-tokenize \
|
||||
llama-vdot \
|
||||
@@ -251,7 +252,7 @@ endif
|
||||
#
|
||||
|
||||
# keep standard at C11 and C++11
|
||||
MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon
|
||||
MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
|
||||
MK_CFLAGS = -std=c11 -fPIC
|
||||
MK_CXXFLAGS = -std=c++11 -fPIC
|
||||
MK_NVCCFLAGS = -std=c++11
|
||||
@@ -290,6 +291,7 @@ endif
|
||||
# some memory allocation are available on Linux through GNU extensions in libc
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
MK_CPPFLAGS += -D_GNU_SOURCE
|
||||
MK_LDFLAGS += -ldl
|
||||
endif
|
||||
|
||||
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
|
||||
@@ -966,6 +968,7 @@ OBJ_COMMON = \
|
||||
$(DIR_COMMON)/console.o \
|
||||
$(DIR_COMMON)/ngram-cache.o \
|
||||
$(DIR_COMMON)/sampling.o \
|
||||
$(DIR_COMMON)/speculative.o \
|
||||
$(DIR_COMMON)/build-info.o \
|
||||
$(DIR_COMMON)/json-schema-to-grammar.o
|
||||
|
||||
@@ -1165,6 +1168,11 @@ llama-infill: examples/infill/infill.cpp \
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-run: examples/run/run.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-simple: examples/simple/simple.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
|
||||
@@ -43,7 +43,8 @@ linkerSettings.append(.linkedFramework("Accelerate"))
|
||||
cSettings.append(
|
||||
contentsOf: [
|
||||
.define("GGML_USE_ACCELERATE"),
|
||||
.define("GGML_USE_METAL")
|
||||
.define("GGML_USE_METAL"),
|
||||
.define("GGML_USE_CPU")
|
||||
]
|
||||
)
|
||||
#endif
|
||||
|
||||
@@ -66,6 +66,8 @@ add_library(${TARGET} STATIC
|
||||
ngram-cache.h
|
||||
sampling.cpp
|
||||
sampling.h
|
||||
speculative.cpp
|
||||
speculative.h
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
|
||||
498
common/arg.cpp
498
common/arg.cpp
@@ -233,10 +233,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
}
|
||||
}
|
||||
|
||||
postprocess_cpu_params(params.cpuparams, nullptr);
|
||||
postprocess_cpu_params(params.cpuparams, nullptr);
|
||||
postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams);
|
||||
postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams);
|
||||
postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch);
|
||||
|
||||
postprocess_cpu_params(params.speculative.cpuparams, ¶ms.cpuparams);
|
||||
postprocess_cpu_params(params.speculative.cpuparams_batch, ¶ms.cpuparams_batch);
|
||||
|
||||
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
|
||||
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
||||
@@ -251,7 +252,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
for (auto & antiprompt : params.antiprompt) {
|
||||
string_process_escapes(antiprompt);
|
||||
}
|
||||
for (auto & seq_breaker : params.sparams.dry_sequence_breakers) {
|
||||
for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
|
||||
string_process_escapes(seq_breaker);
|
||||
}
|
||||
}
|
||||
@@ -297,6 +298,27 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
|
||||
print_options(specific_options);
|
||||
}
|
||||
|
||||
static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
auto dev_names = string_split<std::string>(value, ',');
|
||||
if (dev_names.empty()) {
|
||||
throw std::invalid_argument("no devices specified");
|
||||
}
|
||||
if (dev_names.size() == 1 && dev_names[0] == "none") {
|
||||
devices.push_back(nullptr);
|
||||
} else {
|
||||
for (const auto & device : dev_names) {
|
||||
auto * dev = ggml_backend_dev_by_name(device.c_str());
|
||||
if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
|
||||
throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
|
||||
}
|
||||
devices.push_back(dev);
|
||||
}
|
||||
devices.push_back(nullptr);
|
||||
}
|
||||
return devices;
|
||||
}
|
||||
|
||||
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
||||
auto ctx_arg = common_params_parser_init(params, ex, print_usage);
|
||||
const common_params params_org = ctx_arg.params; // the example can modify the default params
|
||||
@@ -323,13 +345,16 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
|
||||
}
|
||||
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
||||
// load dynamic backends
|
||||
ggml_backend_load_all();
|
||||
|
||||
common_params_context ctx_arg(params);
|
||||
ctx_arg.print_usage = print_usage;
|
||||
ctx_arg.ex = ex;
|
||||
|
||||
std::string sampler_type_chars;
|
||||
std::string sampler_type_names;
|
||||
for (const auto & sampler : params.sparams.samplers) {
|
||||
for (const auto & sampler : params.sampling.samplers) {
|
||||
sampler_type_chars += common_sampler_type_to_chr(sampler);
|
||||
sampler_type_names += common_sampler_type_to_str(sampler) + ";";
|
||||
}
|
||||
@@ -407,26 +432,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"-td", "--threads-draft"}, "N",
|
||||
"number of threads to use during generation (default: same as --threads)",
|
||||
[](common_params & params, int value) {
|
||||
params.draft_cpuparams.n_threads = value;
|
||||
if (params.draft_cpuparams.n_threads <= 0) {
|
||||
params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"-tbd", "--threads-batch-draft"}, "N",
|
||||
"number of threads to use during batch and prompt processing (default: same as --threads-draft)",
|
||||
[](common_params & params, int value) {
|
||||
params.draft_cpuparams_batch.n_threads = value;
|
||||
if (params.draft_cpuparams_batch.n_threads <= 0) {
|
||||
params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"-C", "--cpu-mask"}, "M",
|
||||
"CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
|
||||
@@ -515,108 +520,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.cpuparams_batch.poll = value;
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"-Cd", "--cpu-mask-draft"}, "M",
|
||||
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
||||
[](common_params & params, const std::string & mask) {
|
||||
params.draft_cpuparams.mask_valid = true;
|
||||
if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
|
||||
throw std::invalid_argument("invalid cpumask");
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"-Crd", "--cpu-range-draft"}, "lo-hi",
|
||||
"Ranges of CPUs for affinity. Complements --cpu-mask-draft",
|
||||
[](common_params & params, const std::string & range) {
|
||||
params.draft_cpuparams.mask_valid = true;
|
||||
if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
|
||||
throw std::invalid_argument("invalid range");
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"--cpu-strict-draft"}, "<0|1>",
|
||||
"Use strict CPU placement for draft model (default: same as --cpu-strict)",
|
||||
[](common_params & params, int value) {
|
||||
params.draft_cpuparams.strict_cpu = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"--prio-draft"}, "N",
|
||||
string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
|
||||
[](common_params & params, int prio) {
|
||||
if (prio < 0 || prio > 3) {
|
||||
throw std::invalid_argument("invalid value");
|
||||
}
|
||||
params.draft_cpuparams.priority = (enum ggml_sched_priority) prio;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"--poll-draft"}, "<0|1>",
|
||||
"Use polling to wait for draft model work (default: same as --poll])",
|
||||
[](common_params & params, int value) {
|
||||
params.draft_cpuparams.poll = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"-Cbd", "--cpu-mask-batch-draft"}, "M",
|
||||
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
||||
[](common_params & params, const std::string & mask) {
|
||||
params.draft_cpuparams_batch.mask_valid = true;
|
||||
if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
|
||||
throw std::invalid_argument("invalid cpumask");
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
|
||||
"Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
|
||||
[](common_params & params, const std::string & range) {
|
||||
params.draft_cpuparams_batch.mask_valid = true;
|
||||
if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
|
||||
throw std::invalid_argument("invalid cpumask");
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"--cpu-strict-batch-draft"}, "<0|1>",
|
||||
"Use strict CPU placement for draft model (default: --cpu-strict-draft)",
|
||||
[](common_params & params, int value) {
|
||||
params.draft_cpuparams_batch.strict_cpu = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"--prio-batch-draft"}, "N",
|
||||
string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
|
||||
[](common_params & params, int prio) {
|
||||
if (prio < 0 || prio > 3) {
|
||||
throw std::invalid_argument("invalid value");
|
||||
}
|
||||
params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"--poll-batch-draft"}, "<0|1>",
|
||||
"Use polling to wait for draft model work (default: --poll-draft)",
|
||||
[](common_params & params, int value) {
|
||||
params.draft_cpuparams_batch.poll = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"--draft"}, "N",
|
||||
string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
|
||||
[](common_params & params, int value) {
|
||||
params.n_draft = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
||||
add_opt(common_arg(
|
||||
{"-ps", "--p-split"}, "N",
|
||||
string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.p_split = std::stof(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"-lcs", "--lookup-cache-static"}, "FNAME",
|
||||
"path to static lookup cache to use for lookup decoding (not updated by generation)",
|
||||
@@ -701,7 +604,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.no_perf = true;
|
||||
params.sparams.no_perf = true;
|
||||
params.sampling.no_perf = true;
|
||||
}
|
||||
).set_env("LLAMA_ARG_NO_PERF"));
|
||||
add_opt(common_arg(
|
||||
@@ -883,155 +786,155 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
|
||||
[](common_params & params, const std::string & value) {
|
||||
const auto sampler_names = string_split<std::string>(value, ';');
|
||||
params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
|
||||
params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"-s", "--seed"}, "SEED",
|
||||
string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
|
||||
string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.seed = std::stoul(value);
|
||||
params.sampling.seed = std::stoul(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--sampling-seq"}, "SEQUENCE",
|
||||
string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.samplers = common_sampler_types_from_chars(value);
|
||||
params.sampling.samplers = common_sampler_types_from_chars(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--ignore-eos"},
|
||||
"ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
|
||||
[](common_params & params) {
|
||||
params.sparams.ignore_eos = true;
|
||||
params.sampling.ignore_eos = true;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--penalize-nl"},
|
||||
string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
|
||||
string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"),
|
||||
[](common_params & params) {
|
||||
params.sparams.penalize_nl = true;
|
||||
params.sampling.penalize_nl = true;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--temp"}, "N",
|
||||
string_format("temperature (default: %.1f)", (double)params.sparams.temp),
|
||||
string_format("temperature (default: %.1f)", (double)params.sampling.temp),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.temp = std::stof(value);
|
||||
params.sparams.temp = std::max(params.sparams.temp, 0.0f);
|
||||
params.sampling.temp = std::stof(value);
|
||||
params.sampling.temp = std::max(params.sampling.temp, 0.0f);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--top-k"}, "N",
|
||||
string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
|
||||
string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
|
||||
[](common_params & params, int value) {
|
||||
params.sparams.top_k = value;
|
||||
params.sampling.top_k = value;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--top-p"}, "N",
|
||||
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
|
||||
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.top_p = std::stof(value);
|
||||
params.sampling.top_p = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--min-p"}, "N",
|
||||
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
|
||||
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.min_p = std::stof(value);
|
||||
params.sampling.min_p = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--xtc-probability"}, "N",
|
||||
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
|
||||
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.xtc_probability = std::stof(value);
|
||||
params.sampling.xtc_probability = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--xtc-threshold"}, "N",
|
||||
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
|
||||
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.xtc_threshold = std::stof(value);
|
||||
params.sampling.xtc_threshold = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--typical"}, "N",
|
||||
string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
|
||||
string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.typ_p = std::stof(value);
|
||||
params.sampling.typ_p = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--repeat-last-n"}, "N",
|
||||
string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
|
||||
string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
|
||||
[](common_params & params, int value) {
|
||||
params.sparams.penalty_last_n = value;
|
||||
params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
|
||||
params.sampling.penalty_last_n = value;
|
||||
params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--repeat-penalty"}, "N",
|
||||
string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
|
||||
string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.penalty_repeat = std::stof(value);
|
||||
params.sampling.penalty_repeat = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--presence-penalty"}, "N",
|
||||
string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
|
||||
string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.penalty_present = std::stof(value);
|
||||
params.sampling.penalty_present = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--frequency-penalty"}, "N",
|
||||
string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
|
||||
string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.penalty_freq = std::stof(value);
|
||||
params.sampling.penalty_freq = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--dry-multiplier"}, "N",
|
||||
string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier),
|
||||
string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.dry_multiplier = std::stof(value);
|
||||
params.sampling.dry_multiplier = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--dry-base"}, "N",
|
||||
string_format("set DRY sampling base value (default: %.2f)", (double)params.sparams.dry_base),
|
||||
string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
|
||||
[](common_params & params, const std::string & value) {
|
||||
float potential_base = std::stof(value);
|
||||
if (potential_base >= 1.0f)
|
||||
{
|
||||
params.sparams.dry_base = potential_base;
|
||||
params.sampling.dry_base = potential_base;
|
||||
}
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--dry-allowed-length"}, "N",
|
||||
string_format("set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length),
|
||||
string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
|
||||
[](common_params & params, int value) {
|
||||
params.sparams.dry_allowed_length = value;
|
||||
params.sampling.dry_allowed_length = value;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--dry-penalty-last-n"}, "N",
|
||||
string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sparams.dry_penalty_last_n),
|
||||
string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
|
||||
[](common_params & params, int value) {
|
||||
params.sparams.dry_penalty_last_n = value;
|
||||
params.sampling.dry_penalty_last_n = value;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--dry-sequence-breaker"}, "STRING",
|
||||
string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
|
||||
params.sparams.dry_sequence_breakers.empty() ? "none" :
|
||||
std::accumulate(std::next(params.sparams.dry_sequence_breakers.begin()),
|
||||
params.sparams.dry_sequence_breakers.end(),
|
||||
std::string("'") + (params.sparams.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sparams.dry_sequence_breakers[0]) + "'",
|
||||
params.sampling.dry_sequence_breakers.empty() ? "none" :
|
||||
std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()),
|
||||
params.sampling.dry_sequence_breakers.end(),
|
||||
std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
|
||||
[](const std::string& a, const std::string& b) {
|
||||
std::string formatted_b = (b == "\n") ? "\\n" : b;
|
||||
return a + ", '" + formatted_b + "'";
|
||||
@@ -1040,51 +943,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
static bool defaults_cleared = false;
|
||||
|
||||
if (!defaults_cleared) {
|
||||
params.sparams.dry_sequence_breakers.clear();
|
||||
params.sampling.dry_sequence_breakers.clear();
|
||||
defaults_cleared = true;
|
||||
}
|
||||
|
||||
if (value == "none") {
|
||||
params.sparams.dry_sequence_breakers.clear();
|
||||
params.sampling.dry_sequence_breakers.clear();
|
||||
} else {
|
||||
params.sparams.dry_sequence_breakers.emplace_back(value);
|
||||
params.sampling.dry_sequence_breakers.emplace_back(value);
|
||||
}
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--dynatemp-range"}, "N",
|
||||
string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
|
||||
string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.dynatemp_range = std::stof(value);
|
||||
params.sampling.dynatemp_range = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--dynatemp-exp"}, "N",
|
||||
string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
|
||||
string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.dynatemp_exponent = std::stof(value);
|
||||
params.sampling.dynatemp_exponent = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--mirostat"}, "N",
|
||||
string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
|
||||
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
|
||||
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
|
||||
[](common_params & params, int value) {
|
||||
params.sparams.mirostat = value;
|
||||
params.sampling.mirostat = value;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--mirostat-lr"}, "N",
|
||||
string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
|
||||
string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.mirostat_eta = std::stof(value);
|
||||
params.sampling.mirostat_eta = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--mirostat-ent"}, "N",
|
||||
string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
|
||||
string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.mirostat_tau = std::stof(value);
|
||||
params.sampling.mirostat_tau = std::stof(value);
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
@@ -1100,7 +1003,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
try {
|
||||
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
||||
const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
||||
params.sparams.logit_bias.push_back({key, bias});
|
||||
params.sampling.logit_bias.push_back({key, bias});
|
||||
} else {
|
||||
throw std::invalid_argument("invalid input format");
|
||||
}
|
||||
@@ -1111,9 +1014,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
{"--grammar"}, "GRAMMAR",
|
||||
string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
|
||||
string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.grammar = value;
|
||||
params.sampling.grammar = value;
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
@@ -1127,7 +1030,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
std::copy(
|
||||
std::istreambuf_iterator<char>(file),
|
||||
std::istreambuf_iterator<char>(),
|
||||
std::back_inserter(params.sparams.grammar)
|
||||
std::back_inserter(params.sampling.grammar)
|
||||
);
|
||||
}
|
||||
).set_sparam());
|
||||
@@ -1135,7 +1038,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"-j", "--json-schema"}, "SCHEMA",
|
||||
"JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.sparams.grammar = json_schema_to_grammar(json::parse(value));
|
||||
params.sampling.grammar = json_schema_to_grammar(json::parse(value));
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(common_arg(
|
||||
@@ -1433,6 +1336,30 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
else { throw std::invalid_argument("invalid value"); }
|
||||
}
|
||||
).set_env("LLAMA_ARG_NUMA"));
|
||||
add_opt(common_arg(
|
||||
{"-dev", "--device"}, "<dev1,dev2,..>",
|
||||
"comma-separated list of devices to use for offloading (none = don't offload)\n"
|
||||
"use --list-devices to see a list of available devices",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.devices = parse_device_list(value);
|
||||
}
|
||||
).set_env("LLAMA_ARG_DEVICE"));
|
||||
add_opt(common_arg(
|
||||
{"--list-devices"},
|
||||
"print list of available devices and exit",
|
||||
[](common_params &) {
|
||||
printf("Available devices:\n");
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
auto * dev = ggml_backend_dev_get(i);
|
||||
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
|
||||
}
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
||||
"number of layers to store in VRAM",
|
||||
@@ -1444,17 +1371,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
}
|
||||
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
|
||||
add_opt(common_arg(
|
||||
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
|
||||
"number of layers to store in VRAM for the draft model",
|
||||
[](common_params & params, int value) {
|
||||
params.n_gpu_layers_draft = value;
|
||||
if (!llama_supports_gpu_offload()) {
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"-sm", "--split-mode"}, "{none,layer,row}",
|
||||
"how to split the model across multiple GPUs, one of:\n"
|
||||
@@ -1468,10 +1384,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
} else if (arg_next == "layer") {
|
||||
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
||||
} else if (arg_next == "row") {
|
||||
#ifdef GGML_USE_SYCL
|
||||
fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
|
||||
exit(1);
|
||||
#endif // GGML_USE_SYCL
|
||||
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
||||
} else {
|
||||
throw std::invalid_argument("invalid value");
|
||||
@@ -1593,13 +1505,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.model = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
|
||||
add_opt(common_arg(
|
||||
{"-md", "--model-draft"}, "FNAME",
|
||||
"draft model for speculative decoding (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model_draft = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"-mu", "--model-url"}, "MODEL_URL",
|
||||
"model download url (default: unused)",
|
||||
@@ -2037,5 +1942,176 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_env("LLAMA_LOG_TIMESTAMPS"));
|
||||
|
||||
// speculative parameters
|
||||
add_opt(common_arg(
|
||||
{"-td", "--threads-draft"}, "N",
|
||||
"number of threads to use during generation (default: same as --threads)",
|
||||
[](common_params & params, int value) {
|
||||
params.speculative.cpuparams.n_threads = value;
|
||||
if (params.speculative.cpuparams.n_threads <= 0) {
|
||||
params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"-tbd", "--threads-batch-draft"}, "N",
|
||||
"number of threads to use during batch and prompt processing (default: same as --threads-draft)",
|
||||
[](common_params & params, int value) {
|
||||
params.speculative.cpuparams_batch.n_threads = value;
|
||||
if (params.speculative.cpuparams_batch.n_threads <= 0) {
|
||||
params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"-Cd", "--cpu-mask-draft"}, "M",
|
||||
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
||||
[](common_params & params, const std::string & mask) {
|
||||
params.speculative.cpuparams.mask_valid = true;
|
||||
if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) {
|
||||
throw std::invalid_argument("invalid cpumask");
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"-Crd", "--cpu-range-draft"}, "lo-hi",
|
||||
"Ranges of CPUs for affinity. Complements --cpu-mask-draft",
|
||||
[](common_params & params, const std::string & range) {
|
||||
params.speculative.cpuparams.mask_valid = true;
|
||||
if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) {
|
||||
throw std::invalid_argument("invalid range");
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"--cpu-strict-draft"}, "<0|1>",
|
||||
"Use strict CPU placement for draft model (default: same as --cpu-strict)",
|
||||
[](common_params & params, int value) {
|
||||
params.speculative.cpuparams.strict_cpu = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"--prio-draft"}, "N",
|
||||
string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority),
|
||||
[](common_params & params, int prio) {
|
||||
if (prio < 0 || prio > 3) {
|
||||
throw std::invalid_argument("invalid value");
|
||||
}
|
||||
params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"--poll-draft"}, "<0|1>",
|
||||
"Use polling to wait for draft model work (default: same as --poll])",
|
||||
[](common_params & params, int value) {
|
||||
params.speculative.cpuparams.poll = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"-Cbd", "--cpu-mask-batch-draft"}, "M",
|
||||
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
||||
[](common_params & params, const std::string & mask) {
|
||||
params.speculative.cpuparams_batch.mask_valid = true;
|
||||
if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) {
|
||||
throw std::invalid_argument("invalid cpumask");
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
|
||||
"Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
|
||||
[](common_params & params, const std::string & range) {
|
||||
params.speculative.cpuparams_batch.mask_valid = true;
|
||||
if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) {
|
||||
throw std::invalid_argument("invalid cpumask");
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"--cpu-strict-batch-draft"}, "<0|1>",
|
||||
"Use strict CPU placement for draft model (default: --cpu-strict-draft)",
|
||||
[](common_params & params, int value) {
|
||||
params.speculative.cpuparams_batch.strict_cpu = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"--prio-batch-draft"}, "N",
|
||||
string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority),
|
||||
[](common_params & params, int prio) {
|
||||
if (prio < 0 || prio > 3) {
|
||||
throw std::invalid_argument("invalid value");
|
||||
}
|
||||
params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"--poll-batch-draft"}, "<0|1>",
|
||||
"Use polling to wait for draft model work (default: --poll-draft)",
|
||||
[](common_params & params, int value) {
|
||||
params.speculative.cpuparams_batch.poll = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"--draft-max", "--draft", "--draft-n"}, "N",
|
||||
string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
|
||||
[](common_params & params, int value) {
|
||||
params.speculative.n_max = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--draft-min", "--draft-n-min"}, "N",
|
||||
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
|
||||
[](common_params & params, int value) {
|
||||
params.speculative.n_min = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--draft-p-split"}, "P",
|
||||
string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.speculative.p_split = std::stof(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||
add_opt(common_arg(
|
||||
{"--draft-p-min"}, "P",
|
||||
string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.speculative.p_min = std::stof(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"-cd", "--ctx-size-draft"}, "N",
|
||||
string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
|
||||
[](common_params & params, int value) {
|
||||
params.speculative.n_ctx = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"-devd", "--device-draft"}, "<dev1,dev2,..>",
|
||||
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
|
||||
"use --list-devices to see a list of available devices",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.speculative.devices = parse_device_list(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
|
||||
"number of layers to store in VRAM for the draft model",
|
||||
[](common_params & params, int value) {
|
||||
params.speculative.n_gpu_layers = value;
|
||||
if (!llama_supports_gpu_offload()) {
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"-md", "--model-draft"}, "FNAME",
|
||||
"draft model for speculative decoding (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.speculative.model = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||
|
||||
return ctx_arg;
|
||||
}
|
||||
|
||||
@@ -536,12 +536,12 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
|
||||
[](const unsigned char c) { return !std::isprint(c); }),
|
||||
detokenized.end());
|
||||
|
||||
buf << "\n" << std::to_string(i)
|
||||
<< ":token '" << detokenized << "'"
|
||||
<< ":pos " << std::to_string(batch.pos[i])
|
||||
<< ":n_seq_id " << std::to_string(batch.n_seq_id[i])
|
||||
<< ":seq_id " << std::to_string(batch.seq_id[i][0])
|
||||
<< ":logits " << std::to_string(batch.logits[i]);
|
||||
buf << "\n" << std::to_string(i)
|
||||
<< ", token '" << detokenized << "'"
|
||||
<< ", pos " << std::to_string(batch.pos[i])
|
||||
<< ", n_seq_id " << std::to_string(batch.n_seq_id[i])
|
||||
<< ", seq_id " << std::to_string(batch.seq_id[i][0])
|
||||
<< ", logits " << std::to_string(batch.logits[i]);
|
||||
}
|
||||
|
||||
buf << " ]";
|
||||
@@ -925,9 +925,9 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
common_lora_adapters_apply(lctx, iparams.lora_adapters);
|
||||
}
|
||||
|
||||
if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
||||
if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
||||
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
||||
params.sparams.ignore_eos = false;
|
||||
params.sampling.ignore_eos = false;
|
||||
}
|
||||
|
||||
if (params.warmup) {
|
||||
@@ -979,9 +979,12 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_l
|
||||
}
|
||||
}
|
||||
|
||||
struct llama_model_params common_model_params_to_llama(const common_params & params) {
|
||||
struct llama_model_params common_model_params_to_llama(common_params & params) {
|
||||
auto mparams = llama_model_default_params();
|
||||
|
||||
if (!params.devices.empty()) {
|
||||
mparams.devices = params.devices.data();
|
||||
}
|
||||
if (params.n_gpu_layers != -1) {
|
||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||
}
|
||||
@@ -1490,6 +1493,66 @@ void common_batch_add(
|
||||
batch.n_tokens++;
|
||||
}
|
||||
|
||||
//
|
||||
// Token utils
|
||||
//
|
||||
|
||||
size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
|
||||
size_t i;
|
||||
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
|
||||
// check for empty sequences
|
||||
if (a.empty() || b.empty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// get the lengths of the input sequences
|
||||
size_t a_len = a.size();
|
||||
size_t b_len = b.size();
|
||||
|
||||
// initialize the maximum length of the longest common subsequence (LCS)
|
||||
size_t max_length = 0;
|
||||
|
||||
// use two rows instead of a 2D matrix to optimize space
|
||||
std::vector<size_t> prev_row(b_len + 1, 0);
|
||||
std::vector<size_t> curr_row(b_len + 1, 0);
|
||||
|
||||
// iterate through the elements of a
|
||||
for (size_t i = 1; i <= a_len; i++) {
|
||||
// iterate through the elements of b
|
||||
for (size_t j = 1; j <= b_len; j++) {
|
||||
// if elements at the current positions match
|
||||
if (a[i - 1] == b[j - 1]) {
|
||||
// if it's the first element of either sequences, set LCS length to 1
|
||||
if (i == 1 || j == 1) {
|
||||
curr_row[j] = 1;
|
||||
} else {
|
||||
// increment LCS length by 1 compared to the previous element
|
||||
curr_row[j] = prev_row[j - 1] + 1;
|
||||
}
|
||||
|
||||
// update max_length if necessary
|
||||
if (curr_row[j] > max_length) {
|
||||
max_length = curr_row[j];
|
||||
}
|
||||
} else {
|
||||
// reset LCS length if elements don't match
|
||||
curr_row[j] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// update the previous row for the next iteration
|
||||
prev_row = curr_row;
|
||||
}
|
||||
|
||||
// return the maximum length of the LCS
|
||||
return max_length;
|
||||
}
|
||||
|
||||
//
|
||||
// Vocab utils
|
||||
//
|
||||
|
||||
@@ -33,6 +33,8 @@ struct common_lora_adapter_container : common_lora_adapter_info {
|
||||
struct llama_lora_adapter * adapter;
|
||||
};
|
||||
|
||||
using llama_tokens = std::vector<llama_token>;
|
||||
|
||||
// build info
|
||||
extern int LLAMA_BUILD_NUMBER;
|
||||
extern char const * LLAMA_COMMIT;
|
||||
@@ -101,8 +103,8 @@ enum dimre_method {
|
||||
DIMRE_METHOD_MEAN,
|
||||
};
|
||||
|
||||
// sampler parameters
|
||||
struct common_sampler_params {
|
||||
// sampling parameters
|
||||
struct common_params_sampling {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
@@ -153,21 +155,30 @@ struct common_sampler_params {
|
||||
std::string print() const;
|
||||
};
|
||||
|
||||
struct common_params_speculative {
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
int32_t n_ctx = 0; // draft context size
|
||||
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
|
||||
int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.9f; // minimum speculative decoding probability (greedy)
|
||||
|
||||
struct cpu_params cpuparams;
|
||||
struct cpu_params cpuparams_batch;
|
||||
|
||||
std::string model = ""; // draft model for speculative decoding // NOLINT
|
||||
};
|
||||
|
||||
struct common_params {
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 4096; // context size
|
||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_draft = 5; // number of tokens to draft during speculative decoding
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_parallel = 1; // number of parallel sequences to decode
|
||||
int32_t n_sequences = 1; // number of sequences to decode
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t grp_attn_n = 1; // group-attention factor
|
||||
int32_t grp_attn_w = 512; // group-attention width
|
||||
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
|
||||
@@ -180,25 +191,29 @@ struct common_params {
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
float defrag_thold = 0.1f; // KV cache defragmentation threshold
|
||||
|
||||
// offload params
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
|
||||
struct cpu_params cpuparams;
|
||||
struct cpu_params cpuparams_batch;
|
||||
struct cpu_params draft_cpuparams;
|
||||
struct cpu_params draft_cpuparams_batch;
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
||||
void * cb_eval_user_data = nullptr;
|
||||
|
||||
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||
|
||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
||||
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
||||
|
||||
struct common_sampler_params sparams;
|
||||
struct common_params_sampling sampling;
|
||||
struct common_params_speculative speculative;
|
||||
|
||||
std::string model = ""; // model path // NOLINT
|
||||
std::string model_draft = ""; // draft model for speculative decoding // NOLINT
|
||||
std::string model_alias = "unknown"; // model alias // NOLINT
|
||||
std::string model_url = ""; // model url to download // NOLINT
|
||||
std::string hf_token = ""; // HF token // NOLINT
|
||||
@@ -451,7 +466,7 @@ struct common_init_result {
|
||||
|
||||
struct common_init_result common_init_from_params(common_params & params);
|
||||
|
||||
struct llama_model_params common_model_params_to_llama (const common_params & params);
|
||||
struct llama_model_params common_model_params_to_llama ( common_params & params);
|
||||
struct llama_context_params common_context_params_to_llama(const common_params & params);
|
||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
|
||||
|
||||
@@ -461,7 +476,9 @@ struct llama_model * common_load_model_from_hf(const char * repo, const char * f
|
||||
// clear LoRA adapters from context, then apply new list of adapters
|
||||
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
|
||||
|
||||
//
|
||||
// Batch utils
|
||||
//
|
||||
|
||||
void common_batch_clear(struct llama_batch & batch);
|
||||
|
||||
@@ -472,6 +489,16 @@ void common_batch_add(
|
||||
const std::vector<llama_seq_id> & seq_ids,
|
||||
bool logits);
|
||||
|
||||
//
|
||||
// Token utils
|
||||
//
|
||||
|
||||
// longest common prefix
|
||||
size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
|
||||
|
||||
// longet common subsequence
|
||||
size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
|
||||
|
||||
//
|
||||
// Vocab utils
|
||||
//
|
||||
|
||||
@@ -99,7 +99,7 @@ struct ring_buffer {
|
||||
};
|
||||
|
||||
struct common_sampler {
|
||||
common_sampler_params params;
|
||||
common_params_sampling params;
|
||||
|
||||
struct llama_sampler * grmr;
|
||||
struct llama_sampler * chain;
|
||||
@@ -125,7 +125,7 @@ struct common_sampler {
|
||||
}
|
||||
};
|
||||
|
||||
std::string common_sampler_params::print() const {
|
||||
std::string common_params_sampling::print() const {
|
||||
char result[1024];
|
||||
|
||||
snprintf(result, sizeof(result),
|
||||
@@ -141,7 +141,7 @@ std::string common_sampler_params::print() const {
|
||||
return std::string(result);
|
||||
}
|
||||
|
||||
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
|
||||
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
|
||||
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
|
||||
|
||||
lparams.no_perf = params.no_perf;
|
||||
@@ -320,6 +320,45 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||
return cur_p.data[cur_p.selected].id;
|
||||
}
|
||||
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
|
||||
GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
|
||||
|
||||
std::vector<llama_token> result;
|
||||
result.reserve(idxs.size());
|
||||
|
||||
size_t i = 0;
|
||||
for (; i < draft.size(); i++) {
|
||||
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
|
||||
|
||||
common_sampler_accept(gsmpl, id, true);
|
||||
|
||||
result.push_back(id);
|
||||
|
||||
if (draft[i] != id) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (i == draft.size()) {
|
||||
const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
|
||||
|
||||
common_sampler_accept(gsmpl, id, true);
|
||||
|
||||
result.push_back(id);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
|
||||
std::vector<int> idxs(draft.size() + 1);
|
||||
for (size_t i = 0; i < idxs.size(); ++i) {
|
||||
idxs[i] = i;
|
||||
}
|
||||
|
||||
return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
|
||||
}
|
||||
|
||||
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
|
||||
return llama_sampler_get_seed(gsmpl->chain);
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ struct common_sampler;
|
||||
|
||||
// llama_sampler API overloads
|
||||
|
||||
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
|
||||
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
|
||||
|
||||
void common_sampler_free(struct common_sampler * gsmpl);
|
||||
|
||||
@@ -60,6 +60,27 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
|
||||
//
|
||||
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
|
||||
|
||||
// generalized version of common_sampler_sample
|
||||
//
|
||||
// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
|
||||
// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
|
||||
//
|
||||
// common_sampler_sample_n(gsmpl, ctx, { idx }, {});
|
||||
//
|
||||
// is equivalent to
|
||||
//
|
||||
// common_sampler_sample(gsmpl, ctx, idx);
|
||||
// common_sampler_accept(gsmpl, token, true);
|
||||
//
|
||||
// requires: idxs.size() == draft.size() + 1
|
||||
//
|
||||
// returns at least 1 token, up to idxs.size()
|
||||
//
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
|
||||
|
||||
// assume idxs == [ 0, 1, 2, ..., draft.size() ]
|
||||
std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
|
||||
|
||||
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
|
||||
|
||||
// helpers
|
||||
|
||||
270
common/speculative.cpp
Normal file
270
common/speculative.cpp
Normal file
@@ -0,0 +1,270 @@
|
||||
#include "speculative.h"
|
||||
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
|
||||
#include <cstring>
|
||||
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
|
||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||
|
||||
struct common_speculative {
|
||||
struct llama_context * ctx;
|
||||
struct common_sampler * smpl;
|
||||
|
||||
llama_batch batch;
|
||||
llama_tokens prompt;
|
||||
};
|
||||
|
||||
struct common_speculative * common_speculative_init(
|
||||
struct llama_context * ctx_dft) {
|
||||
auto * result = new common_speculative {
|
||||
/* .ctx = */ ctx_dft,
|
||||
/* .smpl = */ nullptr,
|
||||
/* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
|
||||
/* .prompt = */ {},
|
||||
};
|
||||
|
||||
// TODO: optimize or pass from outside?
|
||||
#if 0
|
||||
{
|
||||
common_params_sampling params;
|
||||
params.no_perf = false;
|
||||
|
||||
params.top_k = 40;
|
||||
params.top_p = 0.9;
|
||||
|
||||
params.samplers = {
|
||||
COMMON_SAMPLER_TYPE_TOP_K,
|
||||
COMMON_SAMPLER_TYPE_TOP_P,
|
||||
COMMON_SAMPLER_TYPE_INFILL,
|
||||
};
|
||||
|
||||
result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
|
||||
}
|
||||
#else
|
||||
{
|
||||
common_params_sampling params;
|
||||
params.no_perf = false;
|
||||
|
||||
params.top_k = 10;
|
||||
|
||||
params.samplers = {
|
||||
COMMON_SAMPLER_TYPE_TOP_K,
|
||||
};
|
||||
|
||||
result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
|
||||
}
|
||||
#endif
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void common_speculative_free(struct common_speculative * spec) {
|
||||
common_sampler_free(spec->smpl);
|
||||
|
||||
llama_batch_free(spec->batch);
|
||||
|
||||
delete spec;
|
||||
}
|
||||
|
||||
bool common_speculative_are_compatible(
|
||||
const struct llama_context * ctx_tgt,
|
||||
const struct llama_context * ctx_dft) {
|
||||
const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
|
||||
const struct llama_model * model_dft = llama_get_model(ctx_dft);
|
||||
|
||||
const bool vocab_type_tgt = llama_vocab_type(model_tgt);
|
||||
LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
|
||||
|
||||
const bool vocab_type_dft = llama_vocab_type(model_dft);
|
||||
LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
|
||||
|
||||
if (vocab_type_tgt != vocab_type_dft) {
|
||||
LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
|
||||
"vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
|
||||
llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
|
||||
llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
|
||||
llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
|
||||
LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
|
||||
LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
|
||||
LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
|
||||
return false;
|
||||
}
|
||||
|
||||
{
|
||||
const int n_vocab_tgt = llama_n_vocab(model_tgt);
|
||||
const int n_vocab_dft = llama_n_vocab(model_dft);
|
||||
|
||||
const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
|
||||
|
||||
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
|
||||
LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
|
||||
"target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
|
||||
__func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
|
||||
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
|
||||
const char * token_text_dft = llama_token_get_text(model_dft, i);
|
||||
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
|
||||
LOG_ERR("%s: draft model vocab must match target model to use speculation but "
|
||||
"token %d content differs - target '%s', draft '%s'\n", __func__, i,
|
||||
common_token_to_piece(ctx_tgt, i).c_str(),
|
||||
common_token_to_piece(ctx_dft, i).c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
llama_tokens common_speculative_gen_draft(
|
||||
struct common_speculative * spec,
|
||||
struct common_speculative_params params,
|
||||
const llama_tokens & prompt_tgt,
|
||||
llama_token id_last) {
|
||||
auto & batch = spec->batch;
|
||||
auto & ctx = spec->ctx;
|
||||
auto & smpl = spec->smpl;
|
||||
auto & prompt = spec->prompt;
|
||||
|
||||
int reuse_i = 0;
|
||||
int reuse_n = 0;
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx) - params.n_draft;
|
||||
|
||||
const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
|
||||
|
||||
// reuse as much as possible from the old draft context
|
||||
// ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
|
||||
for (int i = 0; i < (int) prompt.size(); ++i) {
|
||||
int cur = 0;
|
||||
while (i_start + cur < (int) prompt_tgt.size() &&
|
||||
i + cur < (int) prompt.size() &&
|
||||
prompt_tgt[i_start + cur] == prompt[i + cur]) {
|
||||
cur++;
|
||||
}
|
||||
|
||||
if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) {
|
||||
reuse_i = i;
|
||||
reuse_n = cur;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
|
||||
|
||||
llama_tokens result;
|
||||
result.reserve(params.n_draft);
|
||||
|
||||
if (reuse_n == 0) {
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
prompt.clear();
|
||||
} else {
|
||||
// this happens when a previous draft has been discarded (for example, due to being too small), but the
|
||||
// target model agreed with it. in this case, we simply pass back the previous results to save compute
|
||||
if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
|
||||
for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
|
||||
result.push_back(prompt[i]);
|
||||
|
||||
if (params.n_draft <= (int) result.size()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
if (reuse_i > 0) {
|
||||
llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
|
||||
llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
|
||||
|
||||
prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
|
||||
}
|
||||
|
||||
if (reuse_n < (int) prompt.size()) {
|
||||
llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
|
||||
|
||||
prompt.erase(prompt.begin() + reuse_n, prompt.end());
|
||||
}
|
||||
}
|
||||
|
||||
// prepare a batch to evaluate any new tokens in the prompt
|
||||
common_batch_clear(batch);
|
||||
|
||||
for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
|
||||
//LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
|
||||
common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
|
||||
|
||||
prompt.push_back(prompt_tgt[i]);
|
||||
}
|
||||
|
||||
// we should rarely end-up here during normal decoding
|
||||
if (batch.n_tokens > 0) {
|
||||
//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
|
||||
|
||||
llama_decode(ctx, batch);
|
||||
}
|
||||
|
||||
const llama_pos n_past = prompt.size();
|
||||
|
||||
LOG_DBG("%s: n_past = %d\n", __func__, n_past);
|
||||
|
||||
common_batch_clear(batch);
|
||||
common_batch_add (batch, id_last, n_past, { 0 }, true);
|
||||
|
||||
prompt.push_back(id_last);
|
||||
|
||||
//LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
|
||||
|
||||
llama_decode(ctx, batch);
|
||||
|
||||
common_sampler_reset(smpl);
|
||||
|
||||
// sample n_draft tokens from the draft model
|
||||
for (int i = 0; i < params.n_draft; ++i) {
|
||||
common_batch_clear(batch);
|
||||
|
||||
common_sampler_sample(smpl, ctx, 0, true);
|
||||
|
||||
const auto * cur_p = common_sampler_get_candidates(smpl);
|
||||
|
||||
for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
|
||||
LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||
k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
|
||||
}
|
||||
|
||||
// add drafted token for each sequence
|
||||
const llama_token id = cur_p->data[0].id;
|
||||
|
||||
// only collect very high-confidence draft tokens
|
||||
if (cur_p->data[0].p < params.p_min) {
|
||||
break;
|
||||
}
|
||||
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
result.push_back(id);
|
||||
|
||||
if (params.n_draft <= (int) result.size()) {
|
||||
break;
|
||||
}
|
||||
|
||||
common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
|
||||
|
||||
// evaluate the drafted tokens on the draft model
|
||||
llama_decode(ctx, batch);
|
||||
|
||||
prompt.push_back(id);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
28
common/speculative.h
Normal file
28
common/speculative.h
Normal file
@@ -0,0 +1,28 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
|
||||
struct common_speculative;
|
||||
|
||||
struct common_speculative_params {
|
||||
int n_draft = 16; // max drafted tokens
|
||||
int n_reuse = 256;
|
||||
|
||||
float p_min = 0.9f; // min probabiliy required to accept a token in the draft
|
||||
};
|
||||
|
||||
struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
|
||||
|
||||
void common_speculative_free(struct common_speculative * spec);
|
||||
|
||||
bool common_speculative_are_compatible(
|
||||
const struct llama_context * ctx_tgt,
|
||||
const struct llama_context * ctx_dft);
|
||||
|
||||
// sample up to n_draft tokens and add them to the batch using the draft model
|
||||
llama_tokens common_speculative_gen_draft(
|
||||
struct common_speculative * spec,
|
||||
struct common_speculative_params params,
|
||||
const llama_tokens & prompt,
|
||||
llama_token id_last);
|
||||
@@ -2707,7 +2707,7 @@ class XLMRobertaModel(BertModel):
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
self.gguf_writer.add_add_space_prefix(add_prefix)
|
||||
self.gguf_writer.add_token_type_count(1)
|
||||
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
|
||||
self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
|
||||
if precompiled_charsmap:
|
||||
self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
|
||||
@@ -3040,9 +3040,9 @@ class OlmoModel(Model):
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@Model.register("Olmo1124ForCausalLM")
|
||||
class Olmo1124Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.OLMO_1124
|
||||
@Model.register("Olmo2ForCausalLM")
|
||||
class Olmo2Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.OLMO2
|
||||
|
||||
|
||||
@Model.register("OlmoeForCausalLM")
|
||||
|
||||
@@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(cvector-generator)
|
||||
add_subdirectory(batched-bench)
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(eval-callback)
|
||||
add_subdirectory(export-lora)
|
||||
add_subdirectory(gbnf-validator)
|
||||
add_subdirectory(gguf-hash)
|
||||
add_subdirectory(gguf-split)
|
||||
@@ -27,28 +24,36 @@ else()
|
||||
add_subdirectory(imatrix)
|
||||
add_subdirectory(infill)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(llava)
|
||||
add_subdirectory(lookahead)
|
||||
add_subdirectory(lookup)
|
||||
add_subdirectory(main)
|
||||
add_subdirectory(parallel)
|
||||
add_subdirectory(passkey)
|
||||
add_subdirectory(perplexity)
|
||||
add_subdirectory(quantize-stats)
|
||||
add_subdirectory(quantize)
|
||||
add_subdirectory(retrieval)
|
||||
if (GGML_RPC)
|
||||
add_subdirectory(rpc)
|
||||
endif()
|
||||
if (LLAMA_BUILD_SERVER)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
if (GGML_SYCL)
|
||||
add_subdirectory(sycl)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(run)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(simple-chat)
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(speculative-simple)
|
||||
add_subdirectory(tokenize)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
# these examples use the backends directly and cannot be built with dynamic loading
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(cvector-generator)
|
||||
add_subdirectory(export-lora)
|
||||
add_subdirectory(quantize-stats)
|
||||
add_subdirectory(llava)
|
||||
if (GGML_RPC)
|
||||
add_subdirectory(rpc)
|
||||
endif()
|
||||
if (GGML_SYCL)
|
||||
add_subdirectory(sycl)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -68,10 +68,10 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));
|
||||
|
||||
if (ctx == NULL) {
|
||||
LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
|
||||
|
||||
@@ -5,5 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
set(TEST_TARGET test-eval-callback)
|
||||
add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
||||
add_test(NAME ${TEST_TARGET}
|
||||
COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
|
||||
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
|
||||
|
||||
@@ -73,7 +73,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
auto & sparams = params.sparams;
|
||||
auto & sparams = params.sampling;
|
||||
|
||||
console::init(params.simple_io, params.use_color);
|
||||
atexit([]() { console::cleanup(); });
|
||||
|
||||
@@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) {
|
||||
|
||||
cmd_params params = parse_cmd_params(argc, argv);
|
||||
|
||||
// initialize backends
|
||||
ggml_backend_load_all();
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!cpu_dev) {
|
||||
fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
|
||||
auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
|
||||
auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
|
||||
|
||||
// initialize llama.cpp
|
||||
if (!params.verbose) {
|
||||
llama_log_set(llama_null_log_callback, NULL);
|
||||
@@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) {
|
||||
tpp.poll = t.poll;
|
||||
tpp.prio = params.prio;
|
||||
|
||||
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
|
||||
struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
|
||||
if (!threadpool) {
|
||||
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||
exit(1);
|
||||
@@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_free(ctx);
|
||||
|
||||
ggml_threadpool_free(threadpool);
|
||||
ggml_threadpool_free_fn(threadpool);
|
||||
}
|
||||
|
||||
llama_free_model(lmodel);
|
||||
|
||||
@@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
||||
|
||||
LOG("\n");
|
||||
|
||||
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
|
||||
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
|
||||
if (!smpl) {
|
||||
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
|
||||
exit(1);
|
||||
|
||||
@@ -237,7 +237,7 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm
|
||||
|
||||
LOG_INF("\n");
|
||||
|
||||
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
|
||||
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
|
||||
return smpl;
|
||||
}
|
||||
|
||||
|
||||
@@ -115,7 +115,7 @@ int main(int argc, char ** argv) {
|
||||
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
|
||||
|
||||
// target model sampling context
|
||||
struct common_sampler * smpl = common_sampler_init(model, params.sparams);
|
||||
struct common_sampler * smpl = common_sampler_init(model, params.sampling);
|
||||
|
||||
// verification n-grams
|
||||
std::vector<ngram_data> ngrams_cur(G);
|
||||
|
||||
@@ -21,7 +21,7 @@ int main(int argc, char ** argv){
|
||||
|
||||
common_init();
|
||||
|
||||
const int n_draft = params.n_draft;
|
||||
const int n_draft = params.speculative.n_max;
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
@@ -40,6 +40,7 @@ int main(int argc, char ** argv){
|
||||
common_ngram_cache ngram_cache_context;
|
||||
common_ngram_cache ngram_cache_dynamic;
|
||||
common_ngram_cache ngram_cache_static;
|
||||
|
||||
int64_t t_draft_flat_us = 0;
|
||||
int64_t t_draft_us = 0;
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ int main(int argc, char ** argv){
|
||||
common_init();
|
||||
|
||||
// max. number of additional tokens to draft if match is found
|
||||
const int n_draft = params.n_draft;
|
||||
const int n_draft = params.speculative.n_max;
|
||||
|
||||
const bool dump_kv_cache = params.dump_kv_cache;
|
||||
|
||||
@@ -102,7 +102,7 @@ int main(int argc, char ** argv){
|
||||
|
||||
bool has_eos = false;
|
||||
|
||||
struct common_sampler * smpl = common_sampler_init(model, params.sparams);
|
||||
struct common_sampler * smpl = common_sampler_init(model, params.sampling);
|
||||
|
||||
std::vector<llama_token> draft;
|
||||
|
||||
|
||||
@@ -100,7 +100,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
auto & sparams = params.sparams;
|
||||
auto & sparams = params.sampling;
|
||||
|
||||
// save choice to use color for later
|
||||
// (note for later: this is a slightly awkward choice)
|
||||
@@ -165,6 +165,10 @@ int main(int argc, char ** argv) {
|
||||
|
||||
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
|
||||
|
||||
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
|
||||
auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
|
||||
auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
|
||||
|
||||
struct ggml_threadpool_params tpp_batch =
|
||||
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
|
||||
struct ggml_threadpool_params tpp =
|
||||
@@ -174,7 +178,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
struct ggml_threadpool * threadpool_batch = NULL;
|
||||
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
|
||||
threadpool_batch = ggml_threadpool_new(&tpp_batch);
|
||||
threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
|
||||
if (!threadpool_batch) {
|
||||
LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
|
||||
return 1;
|
||||
@@ -184,7 +188,7 @@ int main(int argc, char ** argv) {
|
||||
tpp.paused = true;
|
||||
}
|
||||
|
||||
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
|
||||
struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
|
||||
if (!threadpool) {
|
||||
LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||
return 1;
|
||||
@@ -890,8 +894,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
ggml_threadpool_free(threadpool);
|
||||
ggml_threadpool_free(threadpool_batch);
|
||||
ggml_threadpool_free_fn(threadpool);
|
||||
ggml_threadpool_free_fn(threadpool_batch);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -160,7 +160,7 @@ int main(int argc, char ** argv) {
|
||||
for (size_t i = 0; i < clients.size(); ++i) {
|
||||
auto & client = clients[i];
|
||||
client.id = i;
|
||||
client.smpl = common_sampler_init(model, params.sparams);
|
||||
client.smpl = common_sampler_init(model, params.sampling);
|
||||
}
|
||||
|
||||
std::vector<llama_token> tokens_system;
|
||||
|
||||
@@ -282,8 +282,8 @@ int main(int argc, char ** argv) {
|
||||
return a.second > b.second;
|
||||
});
|
||||
|
||||
LOG("Top %d similar chunks:\n", params.sparams.top_k);
|
||||
for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
|
||||
LOG("Top %d similar chunks:\n", params.sampling.top_k);
|
||||
for (int i = 0; i < std::min(params.sampling.top_k, (int) chunks.size()); i++) {
|
||||
LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
|
||||
LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
|
||||
LOG("similarity: %f\n", similarities[i].second);
|
||||
|
||||
5
examples/run/CMakeLists.txt
Normal file
5
examples/run/CMakeLists.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
set(TARGET llama-run)
|
||||
add_executable(${TARGET} run.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
7
examples/run/README.md
Normal file
7
examples/run/README.md
Normal file
@@ -0,0 +1,7 @@
|
||||
# llama.cpp/example/run
|
||||
|
||||
The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.
|
||||
|
||||
```bash
|
||||
./llama-run Meta-Llama-3.1-8B-Instruct.gguf
|
||||
...
|
||||
409
examples/run/run.cpp
Normal file
409
examples/run/run.cpp
Normal file
@@ -0,0 +1,409 @@
|
||||
#if defined(_WIN32)
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include <climits>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "llama-cpp.h"
|
||||
|
||||
typedef std::unique_ptr<char[]> char_array_ptr;
|
||||
|
||||
struct Argument {
|
||||
std::string flag;
|
||||
std::string help_text;
|
||||
};
|
||||
|
||||
struct Options {
|
||||
std::string model_path, prompt_non_interactive;
|
||||
int ngl = 99;
|
||||
int n_ctx = 2048;
|
||||
};
|
||||
|
||||
class ArgumentParser {
|
||||
public:
|
||||
ArgumentParser(const char * program_name) : program_name(program_name) {}
|
||||
|
||||
void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") {
|
||||
string_args[flag] = &var;
|
||||
arguments.push_back({flag, help_text});
|
||||
}
|
||||
|
||||
void add_argument(const std::string & flag, int & var, const std::string & help_text = "") {
|
||||
int_args[flag] = &var;
|
||||
arguments.push_back({flag, help_text});
|
||||
}
|
||||
|
||||
int parse(int argc, const char ** argv) {
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
std::string arg = argv[i];
|
||||
if (string_args.count(arg)) {
|
||||
if (i + 1 < argc) {
|
||||
*string_args[arg] = argv[++i];
|
||||
} else {
|
||||
fprintf(stderr, "error: missing value for %s\n", arg.c_str());
|
||||
print_usage();
|
||||
return 1;
|
||||
}
|
||||
} else if (int_args.count(arg)) {
|
||||
if (i + 1 < argc) {
|
||||
if (parse_int_arg(argv[++i], *int_args[arg]) != 0) {
|
||||
fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]);
|
||||
print_usage();
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "error: missing value for %s\n", arg.c_str());
|
||||
print_usage();
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str());
|
||||
print_usage();
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (string_args["-m"]->empty()) {
|
||||
fprintf(stderr, "error: -m is required\n");
|
||||
print_usage();
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private:
|
||||
const char * program_name;
|
||||
std::unordered_map<std::string, std::string *> string_args;
|
||||
std::unordered_map<std::string, int *> int_args;
|
||||
std::vector<Argument> arguments;
|
||||
|
||||
int parse_int_arg(const char * arg, int & value) {
|
||||
char * end;
|
||||
const long val = std::strtol(arg, &end, 10);
|
||||
if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) {
|
||||
value = static_cast<int>(val);
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
void print_usage() const {
|
||||
printf("\nUsage:\n");
|
||||
printf(" %s [OPTIONS]\n\n", program_name);
|
||||
printf("Options:\n");
|
||||
for (const auto & arg : arguments) {
|
||||
printf(" %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str());
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
};
|
||||
|
||||
class LlamaData {
|
||||
public:
|
||||
llama_model_ptr model;
|
||||
llama_sampler_ptr sampler;
|
||||
llama_context_ptr context;
|
||||
std::vector<llama_chat_message> messages;
|
||||
|
||||
int init(const Options & opt) {
|
||||
model = initialize_model(opt.model_path, opt.ngl);
|
||||
if (!model) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
context = initialize_context(model, opt.n_ctx);
|
||||
if (!context) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
sampler = initialize_sampler();
|
||||
return 0;
|
||||
}
|
||||
|
||||
private:
|
||||
// Initializes the model and returns a unique pointer to it
|
||||
llama_model_ptr initialize_model(const std::string & model_path, const int ngl) {
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
model_params.n_gpu_layers = ngl;
|
||||
|
||||
llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params));
|
||||
if (!model) {
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
}
|
||||
|
||||
return model;
|
||||
}
|
||||
|
||||
// Initializes the context with the specified parameters
|
||||
llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
ctx_params.n_ctx = n_ctx;
|
||||
ctx_params.n_batch = n_ctx;
|
||||
|
||||
llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
|
||||
if (!context) {
|
||||
fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
|
||||
}
|
||||
|
||||
return context;
|
||||
}
|
||||
|
||||
// Initializes and configures the sampler
|
||||
llama_sampler_ptr initialize_sampler() {
|
||||
llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
|
||||
llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
|
||||
llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f));
|
||||
llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
|
||||
|
||||
return sampler;
|
||||
}
|
||||
};
|
||||
|
||||
// Add a message to `messages` and store its content in `owned_content`
|
||||
static void add_message(const char * role, const std::string & text, LlamaData & llama_data,
|
||||
std::vector<char_array_ptr> & owned_content) {
|
||||
char_array_ptr content(new char[text.size() + 1]);
|
||||
std::strcpy(content.get(), text.c_str());
|
||||
llama_data.messages.push_back({role, content.get()});
|
||||
owned_content.push_back(std::move(content));
|
||||
}
|
||||
|
||||
// Function to apply the chat template and resize `formatted` if needed
|
||||
static int apply_chat_template(const LlamaData & llama_data, std::vector<char> & formatted, const bool append) {
|
||||
int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
|
||||
llama_data.messages.size(), append, formatted.data(), formatted.size());
|
||||
if (result > static_cast<int>(formatted.size())) {
|
||||
formatted.resize(result);
|
||||
result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
|
||||
llama_data.messages.size(), append, formatted.data(), formatted.size());
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Function to tokenize the prompt
|
||||
static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt,
|
||||
std::vector<llama_token> & prompt_tokens) {
|
||||
const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true);
|
||||
prompt_tokens.resize(n_prompt_tokens);
|
||||
if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
|
||||
true) < 0) {
|
||||
GGML_ABORT("failed to tokenize the prompt\n");
|
||||
}
|
||||
|
||||
return n_prompt_tokens;
|
||||
}
|
||||
|
||||
// Check if we have enough space in the context to evaluate this batch
|
||||
static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
|
||||
const int n_ctx = llama_n_ctx(ctx.get());
|
||||
const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
|
||||
if (n_ctx_used + batch.n_tokens > n_ctx) {
|
||||
printf("\033[0m\n");
|
||||
fprintf(stderr, "context size exceeded\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// convert the token to a string
|
||||
static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) {
|
||||
char buf[256];
|
||||
int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
|
||||
if (n < 0) {
|
||||
GGML_ABORT("failed to convert token to piece\n");
|
||||
}
|
||||
|
||||
piece = std::string(buf, n);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void print_word_and_concatenate_to_response(const std::string & piece, std::string & response) {
|
||||
printf("%s", piece.c_str());
|
||||
fflush(stdout);
|
||||
response += piece;
|
||||
}
|
||||
|
||||
// helper function to evaluate a prompt and generate a response
|
||||
static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
|
||||
std::vector<llama_token> prompt_tokens;
|
||||
const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens);
|
||||
if (n_prompt_tokens < 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// prepare a batch for the prompt
|
||||
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
|
||||
llama_token new_token_id;
|
||||
while (true) {
|
||||
check_context_size(llama_data.context, batch);
|
||||
if (llama_decode(llama_data.context.get(), batch)) {
|
||||
GGML_ABORT("failed to decode\n");
|
||||
}
|
||||
|
||||
// sample the next token, check is it an end of generation?
|
||||
new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1);
|
||||
if (llama_token_is_eog(llama_data.model.get(), new_token_id)) {
|
||||
break;
|
||||
}
|
||||
|
||||
std::string piece;
|
||||
if (convert_token_to_string(llama_data.model, new_token_id, piece)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
print_word_and_concatenate_to_response(piece, response);
|
||||
|
||||
// prepare the next batch with the sampled token
|
||||
batch = llama_batch_get_one(&new_token_id, 1);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int parse_arguments(const int argc, const char ** argv, Options & opt) {
|
||||
ArgumentParser parser(argv[0]);
|
||||
parser.add_argument("-m", opt.model_path, "model");
|
||||
parser.add_argument("-p", opt.prompt_non_interactive, "prompt");
|
||||
parser.add_argument("-c", opt.n_ctx, "context_size");
|
||||
parser.add_argument("-ngl", opt.ngl, "n_gpu_layers");
|
||||
if (parser.parse(argc, argv)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int read_user_input(std::string & user) {
|
||||
std::getline(std::cin, user);
|
||||
return user.empty(); // Indicate an error or empty input
|
||||
}
|
||||
|
||||
// Function to generate a response based on the prompt
|
||||
static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) {
|
||||
// Set response color
|
||||
printf("\033[33m");
|
||||
if (generate(llama_data, prompt, response)) {
|
||||
fprintf(stderr, "failed to generate response\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// End response with color reset and newline
|
||||
printf("\n\033[0m");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Helper function to apply the chat template and handle errors
|
||||
static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector<char> & formatted,
|
||||
const bool is_user_input, int & output_length) {
|
||||
const int new_len = apply_chat_template(llama_data, formatted, is_user_input);
|
||||
if (new_len < 0) {
|
||||
fprintf(stderr, "failed to apply the chat template\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
output_length = new_len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Helper function to handle user input
|
||||
static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) {
|
||||
if (!prompt_non_interactive.empty()) {
|
||||
user_input = prompt_non_interactive;
|
||||
return true; // No need for interactive input
|
||||
}
|
||||
|
||||
printf("\033[32m> \033[0m");
|
||||
return !read_user_input(user_input); // Returns false if input ends the loop
|
||||
}
|
||||
|
||||
// Function to tokenize the prompt
|
||||
static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) {
|
||||
std::vector<char_array_ptr> owned_content;
|
||||
std::vector<char> fmtted(llama_n_ctx(llama_data.context.get()));
|
||||
int prev_len = 0;
|
||||
|
||||
while (true) {
|
||||
// Get user input
|
||||
std::string user_input;
|
||||
if (!handle_user_input(user_input, prompt_non_interactive)) {
|
||||
break;
|
||||
}
|
||||
|
||||
add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data,
|
||||
owned_content);
|
||||
|
||||
int new_len;
|
||||
if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len);
|
||||
std::string response;
|
||||
if (generate_response(llama_data, prompt, response)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void log_callback(const enum ggml_log_level level, const char * text, void *) {
|
||||
if (level == GGML_LOG_LEVEL_ERROR) {
|
||||
fprintf(stderr, "%s", text);
|
||||
}
|
||||
}
|
||||
|
||||
static bool is_stdin_a_terminal() {
|
||||
#if defined(_WIN32)
|
||||
HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
|
||||
DWORD mode;
|
||||
return GetConsoleMode(hStdin, &mode);
|
||||
#else
|
||||
return isatty(STDIN_FILENO);
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::string read_pipe_data() {
|
||||
std::ostringstream result;
|
||||
result << std::cin.rdbuf(); // Read all data from std::cin
|
||||
return result.str();
|
||||
}
|
||||
|
||||
int main(int argc, const char ** argv) {
|
||||
Options opt;
|
||||
if (parse_arguments(argc, argv, opt)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!is_stdin_a_terminal()) {
|
||||
if (!opt.prompt_non_interactive.empty()) {
|
||||
opt.prompt_non_interactive += "\n\n";
|
||||
}
|
||||
|
||||
opt.prompt_non_interactive += read_pipe_data();
|
||||
}
|
||||
|
||||
llama_log_set(log_callback, nullptr);
|
||||
LlamaData llama_data;
|
||||
if (llama_data.init(opt)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (chat_loop(llama_data, opt.prompt_non_interactive)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -9,7 +9,7 @@ int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
params.prompt = "The quick brown fox";
|
||||
params.sparams.seed = 1234;
|
||||
params.sampling.seed = 1234;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||
return 1;
|
||||
@@ -42,7 +42,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
// tokenize prompt
|
||||
auto tokens = common_tokenize(ctx, params.prompt, true);
|
||||
@@ -106,7 +106,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));
|
||||
llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
printf("\nsecond run: %s", params.prompt.c_str());
|
||||
|
||||
@@ -169,7 +169,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));
|
||||
llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
printf("\nsingle seq run: %s", params.prompt.c_str());
|
||||
|
||||
|
||||
@@ -412,7 +412,7 @@ node index.js
|
||||
|
||||
`id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1`
|
||||
|
||||
`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
|
||||
`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`
|
||||
|
||||
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
|
||||
|
||||
|
||||
@@ -81,7 +81,13 @@
|
||||
<path d="M14.5 3a1 1 0 0 1-1 1H13v9a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V4h-.5a1 1 0 0 1-1-1V2a1 1 0 0 1 1-1H6a1 1 0 0 1 1-1h2a1 1 0 0 1 1 1h3.5a1 1 0 0 1 1 1zM4.118 4 4 4.059V13a1 1 0 0 0 1 1h6a1 1 0 0 0 1-1V4.059L11.882 4zM2.5 3h11V2h-11z"/>
|
||||
</svg>
|
||||
</button>
|
||||
|
||||
<button v-if="messages.length > 0" class="btn mr-1" @click="downloadConv(viewingConvId)" :disabled="isGenerating">
|
||||
<!-- download conversation button -->
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-download" viewBox="0 0 16 16">
|
||||
<path d="M.5 9.9a.5.5 0 0 1 .5.5v2.5a1 1 0 0 0 1 1h12a1 1 0 0 0 1-1v-2.5a.5.5 0 0 1 1 0v2.5a2 2 0 0 1-2 2H2a2 2 0 0 1-2-2v-2.5a.5.5 0 0 1 .5-.5"/>
|
||||
<path d="M7.646 11.854a.5.5 0 0 0 .708 0l3-3a.5.5 0 0 0-.708-.708L8.5 10.293V1.5a.5.5 0 0 0-1 0v8.793L5.354 8.146a.5.5 0 1 0-.708.708z"/>
|
||||
</svg>
|
||||
</button>
|
||||
<button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
|
||||
<!-- edit config button -->
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
|
||||
@@ -526,6 +532,23 @@
|
||||
this.fetchMessages();
|
||||
}
|
||||
},
|
||||
downloadConv(convId) {
|
||||
const conversation = StorageUtils.getOneConversation(convId);
|
||||
if (!conversation) {
|
||||
alert('Conversation not found.');
|
||||
return;
|
||||
}
|
||||
const conversationJson = JSON.stringify(conversation, null, 2);
|
||||
const blob = new Blob([conversationJson], { type: 'application/json' });
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = `conversation_${convId}.json`;
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
document.body.removeChild(a);
|
||||
URL.revokeObjectURL(url);
|
||||
},
|
||||
async sendMessage() {
|
||||
if (!this.inputMsg) return;
|
||||
const currConvId = this.viewingConvId;
|
||||
|
||||
@@ -2,10 +2,11 @@
|
||||
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "sampling.h"
|
||||
#include "json-schema-to-grammar.h"
|
||||
#include "llama.h"
|
||||
#include "log.h"
|
||||
#include "sampling.h"
|
||||
#include "speculative.h"
|
||||
|
||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||
#define JSON_ASSERT GGML_ASSERT
|
||||
@@ -110,7 +111,7 @@ struct server_static_file {
|
||||
|
||||
struct slot_params {
|
||||
bool stream = true;
|
||||
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
|
||||
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
|
||||
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
|
||||
@@ -121,12 +122,21 @@ struct slot_params {
|
||||
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
|
||||
|
||||
std::vector<std::string> antiprompt;
|
||||
|
||||
struct common_params_sampling sampling;
|
||||
struct common_params_speculative speculative;
|
||||
};
|
||||
|
||||
struct server_slot {
|
||||
int id;
|
||||
int id_task = -1;
|
||||
|
||||
llama_batch batch_spec;
|
||||
|
||||
llama_context * ctx_dft = nullptr;
|
||||
|
||||
common_speculative * spec = nullptr;
|
||||
|
||||
// the index relative to completion multi-task request
|
||||
size_t index = 0;
|
||||
|
||||
@@ -175,7 +185,6 @@ struct server_slot {
|
||||
// sampling
|
||||
json json_schema;
|
||||
|
||||
struct common_sampler_params sparams;
|
||||
struct common_sampler * smpl = nullptr;
|
||||
|
||||
llama_token sampled;
|
||||
@@ -212,7 +221,7 @@ struct server_slot {
|
||||
generated_token_probs.clear();
|
||||
}
|
||||
|
||||
bool has_budget(common_params &global_params) {
|
||||
bool has_budget(const common_params & global_params) {
|
||||
if (params.n_predict == -1 && global_params.n_predict == -1) {
|
||||
return true; // limitless
|
||||
}
|
||||
@@ -232,6 +241,10 @@ struct server_slot {
|
||||
return state != SLOT_STATE_IDLE;
|
||||
}
|
||||
|
||||
bool can_speculate() const {
|
||||
return ctx_dft && params.speculative.n_max > 0 && params.cache_prompt;
|
||||
}
|
||||
|
||||
void add_token(const completion_token_output & token) {
|
||||
if (!is_processing()) {
|
||||
SLT_WRN(*this, "%s", "slot is not processing\n");
|
||||
@@ -591,11 +604,14 @@ struct server_response {
|
||||
};
|
||||
|
||||
struct server_context {
|
||||
common_params params_base;
|
||||
|
||||
llama_model * model = nullptr;
|
||||
llama_context * ctx = nullptr;
|
||||
std::vector<common_lora_adapter_container> loras;
|
||||
|
||||
common_params params;
|
||||
llama_model * model_dft = nullptr;
|
||||
llama_context_params cparams_dft;
|
||||
|
||||
llama_batch batch = {};
|
||||
|
||||
@@ -628,27 +644,41 @@ struct server_context {
|
||||
model = nullptr;
|
||||
}
|
||||
|
||||
if (model_dft) {
|
||||
llama_free_model(model_dft);
|
||||
model_dft = nullptr;
|
||||
}
|
||||
|
||||
// Clear any sampling context
|
||||
for (server_slot & slot : slots) {
|
||||
if (slot.smpl != nullptr) {
|
||||
common_sampler_free(slot.smpl);
|
||||
}
|
||||
common_sampler_free(slot.smpl);
|
||||
slot.smpl = nullptr;
|
||||
|
||||
llama_free(slot.ctx_dft);
|
||||
slot.ctx_dft = nullptr;
|
||||
|
||||
common_speculative_free(slot.spec);
|
||||
slot.spec = nullptr;
|
||||
|
||||
llama_batch_free(slot.batch_spec);
|
||||
}
|
||||
|
||||
llama_batch_free(batch);
|
||||
}
|
||||
|
||||
bool load_model(const common_params & params_) {
|
||||
params = params_;
|
||||
bool load_model(const common_params & params) {
|
||||
SRV_INF("loading model '%s'\n", params.model.c_str());
|
||||
|
||||
common_init_result llama_init = common_init_from_params(params);
|
||||
params_base = params;
|
||||
|
||||
common_init_result llama_init = common_init_from_params(params_base);
|
||||
|
||||
model = llama_init.model;
|
||||
ctx = llama_init.context;
|
||||
loras = llama_init.lora_adapters;
|
||||
|
||||
if (model == nullptr) {
|
||||
SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
|
||||
SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -657,6 +687,41 @@ struct server_context {
|
||||
add_bos_token = llama_add_bos_token(model);
|
||||
has_eos_token = !llama_add_eos_token(model);
|
||||
|
||||
if (!params_base.speculative.model.empty()) {
|
||||
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
|
||||
|
||||
auto params_dft = params_base;
|
||||
|
||||
params_dft.devices = params_base.speculative.devices;
|
||||
params_dft.model = params_base.speculative.model;
|
||||
params_dft.n_ctx = params_base.speculative.n_ctx;
|
||||
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
|
||||
|
||||
common_init_result llama_init_dft = common_init_from_params(params_dft);
|
||||
|
||||
model_dft = llama_init_dft.model;
|
||||
|
||||
if (model_dft == nullptr) {
|
||||
SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
|
||||
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
|
||||
|
||||
llama_free (llama_init_dft.context);
|
||||
llama_free_model(llama_init_dft.model);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
cparams_dft = common_context_params_to_llama(params_base);
|
||||
cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
|
||||
|
||||
// the context is not needed - we will create one for each slot
|
||||
llama_free(llama_init_dft.context);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -674,20 +739,36 @@ struct server_context {
|
||||
}
|
||||
|
||||
void init() {
|
||||
const int32_t n_ctx_slot = n_ctx / params.n_parallel;
|
||||
const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;
|
||||
|
||||
SRV_INF("initializing slots, n_slots = %d\n", params.n_parallel);
|
||||
SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);
|
||||
|
||||
for (int i = 0; i < params.n_parallel; i++) {
|
||||
for (int i = 0; i < params_base.n_parallel; i++) {
|
||||
server_slot slot;
|
||||
|
||||
slot.id = i;
|
||||
slot.n_ctx = n_ctx_slot;
|
||||
slot.n_predict = params.n_predict;
|
||||
slot.n_predict = params_base.n_predict;
|
||||
|
||||
if (model_dft) {
|
||||
slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
|
||||
|
||||
slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
|
||||
if (slot.ctx_dft == nullptr) {
|
||||
SRV_ERR("%s", "failed to create draft context\n");
|
||||
return;
|
||||
}
|
||||
|
||||
slot.spec = common_speculative_init(slot.ctx_dft);
|
||||
if (slot.spec == nullptr) {
|
||||
SRV_ERR("%s", "failed to create speculator\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
|
||||
|
||||
slot.sparams = params.sparams;
|
||||
slot.params.sampling = params_base.sampling;
|
||||
|
||||
slot.callback_on_release = [this](int) {
|
||||
queue_tasks.pop_deferred_task();
|
||||
@@ -707,7 +788,7 @@ struct server_context {
|
||||
const int32_t n_batch = llama_n_batch(ctx);
|
||||
|
||||
// only a single seq_id per token is needed
|
||||
batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1);
|
||||
batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
|
||||
}
|
||||
|
||||
metrics.init();
|
||||
@@ -743,7 +824,7 @@ struct server_context {
|
||||
}
|
||||
|
||||
// length of the Longest Common Subsequence between the current slot's prompt and the input prompt
|
||||
int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
|
||||
int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens);
|
||||
|
||||
// fraction of the common subsequence length compared to the current slot's prompt length
|
||||
float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
|
||||
@@ -786,9 +867,11 @@ struct server_context {
|
||||
}
|
||||
|
||||
bool launch_slot_with_task(server_slot & slot, const server_task & task) {
|
||||
slot_params default_params;
|
||||
// Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
|
||||
auto default_sparams = params.sparams;
|
||||
slot_params defaults;
|
||||
defaults.sampling = params_base.sampling;
|
||||
defaults.speculative = params_base.speculative;
|
||||
|
||||
const auto & data = task.data;
|
||||
|
||||
if (data.count("__oaicompat") != 0) {
|
||||
@@ -799,42 +882,48 @@ struct server_context {
|
||||
slot.oaicompat_model = "";
|
||||
}
|
||||
|
||||
slot.params.stream = json_value(data, "stream", false);
|
||||
slot.params.cache_prompt = json_value(data, "cache_prompt", false);
|
||||
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
|
||||
slot.params.n_indent = json_value(data, "n_indent", default_params.n_indent);
|
||||
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
||||
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
||||
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
|
||||
slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability);
|
||||
slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold);
|
||||
slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
|
||||
slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||
slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
|
||||
slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
|
||||
slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
|
||||
slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
|
||||
slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
|
||||
slot.sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
|
||||
slot.sparams.dry_multiplier = json_value(data, "dry_multiplier", default_sparams.dry_multiplier);
|
||||
slot.sparams.dry_base = json_value(data, "dry_base", default_sparams.dry_base);
|
||||
slot.sparams.dry_allowed_length = json_value(data, "dry_allowed_length", default_sparams.dry_allowed_length);
|
||||
slot.sparams.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", default_sparams.dry_penalty_last_n);
|
||||
slot.sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
|
||||
slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
|
||||
slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
|
||||
slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
|
||||
slot.params.n_keep = json_value(data, "n_keep", default_params.n_keep);
|
||||
slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard);
|
||||
slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
|
||||
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
||||
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
|
||||
//slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", default_params.t_max_prompt_ms); // TODO: implement
|
||||
slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", default_params.t_max_predict_ms);
|
||||
slot.params.stream = json_value(data, "stream", false);
|
||||
slot.params.cache_prompt = json_value(data, "cache_prompt", true);
|
||||
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
|
||||
slot.params.n_indent = json_value(data, "n_indent", defaults.n_indent);
|
||||
slot.params.n_keep = json_value(data, "n_keep", defaults.n_keep);
|
||||
slot.params.n_discard = json_value(data, "n_discard", defaults.n_discard);
|
||||
//slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
|
||||
slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
|
||||
|
||||
if (slot.sparams.dry_base < 1.0f)
|
||||
{
|
||||
slot.sparams.dry_base = default_sparams.dry_base;
|
||||
slot.params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
|
||||
slot.params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
|
||||
slot.params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p);
|
||||
slot.params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability);
|
||||
slot.params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold);
|
||||
slot.params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p);
|
||||
slot.params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp);
|
||||
slot.params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range);
|
||||
slot.params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent);
|
||||
slot.params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n);
|
||||
slot.params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat);
|
||||
slot.params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq);
|
||||
slot.params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present);
|
||||
slot.params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier);
|
||||
slot.params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base);
|
||||
slot.params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length);
|
||||
slot.params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n);
|
||||
slot.params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat);
|
||||
slot.params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau);
|
||||
slot.params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta);
|
||||
slot.params.sampling.penalize_nl = json_value(data, "penalize_nl", defaults.sampling.penalize_nl);
|
||||
slot.params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
|
||||
slot.params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
|
||||
slot.params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
|
||||
|
||||
slot.params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
|
||||
slot.params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
|
||||
slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
|
||||
|
||||
slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min);
|
||||
|
||||
if (slot.params.sampling.dry_base < 1.0f) {
|
||||
slot.params.sampling.dry_base = defaults.sampling.dry_base;
|
||||
}
|
||||
|
||||
// sequence breakers for DRY
|
||||
@@ -843,8 +932,8 @@ struct server_context {
|
||||
// Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39
|
||||
|
||||
if (data.contains("dry_sequence_breakers")) {
|
||||
slot.sparams.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
|
||||
if (slot.sparams.dry_sequence_breakers.empty()) {
|
||||
slot.params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
|
||||
if (slot.params.sampling.dry_sequence_breakers.empty()) {
|
||||
send_error(task, "Error: dry_sequence_breakers must be a non-empty array of strings", ERROR_TYPE_INVALID_REQUEST);
|
||||
return false;
|
||||
}
|
||||
@@ -858,14 +947,14 @@ struct server_context {
|
||||
}
|
||||
if (data.contains("json_schema") && !data.contains("grammar")) {
|
||||
try {
|
||||
auto schema = json_value(data, "json_schema", json::object());
|
||||
slot.sparams.grammar = json_schema_to_grammar(schema);
|
||||
auto schema = json_value(data, "json_schema", json::object());
|
||||
slot.params.sampling.grammar = json_schema_to_grammar(schema);
|
||||
} catch (const std::exception & e) {
|
||||
send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
||||
slot.params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
|
||||
}
|
||||
|
||||
if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
|
||||
@@ -875,10 +964,10 @@ struct server_context {
|
||||
}
|
||||
|
||||
{
|
||||
slot.sparams.logit_bias.clear();
|
||||
slot.params.sampling.logit_bias.clear();
|
||||
|
||||
if (json_value(data, "ignore_eos", false) && has_eos_token) {
|
||||
slot.sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
|
||||
slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY});
|
||||
}
|
||||
|
||||
const auto & logit_bias = data.find("logit_bias");
|
||||
@@ -899,12 +988,12 @@ struct server_context {
|
||||
if (el[0].is_number_integer()) {
|
||||
llama_token tok = el[0].get<llama_token>();
|
||||
if (tok >= 0 && tok < n_vocab) {
|
||||
slot.sparams.logit_bias.push_back({tok, bias});
|
||||
slot.params.sampling.logit_bias.push_back({tok, bias});
|
||||
}
|
||||
} else if (el[0].is_string()) {
|
||||
auto toks = common_tokenize(model, el[0].get<std::string>(), false);
|
||||
for (auto tok : toks) {
|
||||
slot.sparams.logit_bias.push_back({tok, bias});
|
||||
slot.params.sampling.logit_bias.push_back({tok, bias});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -935,16 +1024,16 @@ struct server_context {
|
||||
sampler_names.emplace_back(name);
|
||||
}
|
||||
}
|
||||
slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false);
|
||||
slot.params.sampling.samplers = common_sampler_types_from_names(sampler_names, false);
|
||||
} else if (samplers->is_string()){
|
||||
std::string sampler_string;
|
||||
for (const auto & name : *samplers) {
|
||||
sampler_string += name;
|
||||
}
|
||||
slot.sparams.samplers = common_sampler_types_from_chars(sampler_string);
|
||||
slot.params.sampling.samplers = common_sampler_types_from_chars(sampler_string);
|
||||
}
|
||||
} else {
|
||||
slot.sparams.samplers = default_sparams.samplers;
|
||||
slot.params.sampling.samplers = defaults.sampling.samplers;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -953,7 +1042,7 @@ struct server_context {
|
||||
common_sampler_free(slot.smpl);
|
||||
}
|
||||
|
||||
slot.smpl = common_sampler_init(model, slot.sparams);
|
||||
slot.smpl = common_sampler_init(model, slot.params.sampling);
|
||||
if (slot.smpl == nullptr) {
|
||||
// for now, the only error that may happen here is invalid grammar
|
||||
send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
|
||||
@@ -961,6 +1050,12 @@ struct server_context {
|
||||
}
|
||||
}
|
||||
|
||||
if (slot.ctx_dft) {
|
||||
llama_batch_free(slot.batch_spec);
|
||||
|
||||
slot.batch_spec = llama_batch_init(slot.params.speculative.n_max + 1, 0, 1);
|
||||
}
|
||||
|
||||
slot.state = SLOT_STATE_STARTED;
|
||||
|
||||
SLT_INF(slot, "%s", "processing task\n");
|
||||
@@ -978,7 +1073,7 @@ struct server_context {
|
||||
|
||||
bool process_token(completion_token_output & result, server_slot & slot) {
|
||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
||||
const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
|
||||
const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special);
|
||||
slot.sampled = result.tok;
|
||||
|
||||
// search stop word and delete it
|
||||
@@ -1043,7 +1138,7 @@ struct server_context {
|
||||
}
|
||||
|
||||
// check the limits
|
||||
if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) {
|
||||
if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
|
||||
slot.stopped_limit = true;
|
||||
slot.has_next_token = false;
|
||||
|
||||
@@ -1136,50 +1231,54 @@ struct server_context {
|
||||
|
||||
json get_formated_generation(const server_slot & slot) const {
|
||||
std::vector<std::string> samplers;
|
||||
samplers.reserve(slot.sparams.samplers.size());
|
||||
for (const auto & sampler : slot.sparams.samplers) {
|
||||
samplers.reserve(slot.params.sampling.samplers.size());
|
||||
for (const auto & sampler : slot.params.sampling.samplers) {
|
||||
samplers.emplace_back(common_sampler_type_to_str(sampler));
|
||||
}
|
||||
|
||||
return json {
|
||||
{"n_ctx", slot.n_ctx},
|
||||
{"n_predict", slot.n_predict}, // Server configured n_predict
|
||||
{"model", params.model_alias},
|
||||
{"seed", slot.sparams.seed},
|
||||
{"model", params_base.model_alias},
|
||||
{"seed", slot.params.sampling.seed},
|
||||
{"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0},
|
||||
{"temperature", slot.sparams.temp},
|
||||
{"dynatemp_range", slot.sparams.dynatemp_range},
|
||||
{"dynatemp_exponent", slot.sparams.dynatemp_exponent},
|
||||
{"top_k", slot.sparams.top_k},
|
||||
{"top_p", slot.sparams.top_p},
|
||||
{"min_p", slot.sparams.min_p},
|
||||
{"xtc_probability", slot.sparams.xtc_probability},
|
||||
{"xtc_threshold", slot.sparams.xtc_threshold},
|
||||
{"typical_p", slot.sparams.typ_p},
|
||||
{"repeat_last_n", slot.sparams.penalty_last_n},
|
||||
{"repeat_penalty", slot.sparams.penalty_repeat},
|
||||
{"presence_penalty", slot.sparams.penalty_present},
|
||||
{"frequency_penalty", slot.sparams.penalty_freq},
|
||||
{"dry_multiplier", slot.sparams.dry_multiplier},
|
||||
{"dry_base", slot.sparams.dry_base},
|
||||
{"dry_allowed_length", slot.sparams.dry_allowed_length},
|
||||
{"dry_penalty_last_n", slot.sparams.dry_penalty_last_n},
|
||||
{"dry_sequence_breakers", slot.sparams.dry_sequence_breakers},
|
||||
{"mirostat", slot.sparams.mirostat},
|
||||
{"mirostat_tau", slot.sparams.mirostat_tau},
|
||||
{"mirostat_eta", slot.sparams.mirostat_eta},
|
||||
{"penalize_nl", slot.sparams.penalize_nl},
|
||||
{"temperature", slot.params.sampling.temp},
|
||||
{"dynatemp_range", slot.params.sampling.dynatemp_range},
|
||||
{"dynatemp_exponent", slot.params.sampling.dynatemp_exponent},
|
||||
{"top_k", slot.params.sampling.top_k},
|
||||
{"top_p", slot.params.sampling.top_p},
|
||||
{"min_p", slot.params.sampling.min_p},
|
||||
{"xtc_probability", slot.params.sampling.xtc_probability},
|
||||
{"xtc_threshold", slot.params.sampling.xtc_threshold},
|
||||
{"typical_p", slot.params.sampling.typ_p},
|
||||
{"repeat_last_n", slot.params.sampling.penalty_last_n},
|
||||
{"repeat_penalty", slot.params.sampling.penalty_repeat},
|
||||
{"presence_penalty", slot.params.sampling.penalty_present},
|
||||
{"frequency_penalty", slot.params.sampling.penalty_freq},
|
||||
{"dry_multiplier", slot.params.sampling.dry_multiplier},
|
||||
{"dry_base", slot.params.sampling.dry_base},
|
||||
{"dry_allowed_length", slot.params.sampling.dry_allowed_length},
|
||||
{"dry_penalty_last_n", slot.params.sampling.dry_penalty_last_n},
|
||||
{"dry_sequence_breakers", slot.params.sampling.dry_sequence_breakers},
|
||||
{"mirostat", slot.params.sampling.mirostat},
|
||||
{"mirostat_tau", slot.params.sampling.mirostat_tau},
|
||||
{"mirostat_eta", slot.params.sampling.mirostat_eta},
|
||||
{"penalize_nl", slot.params.sampling.penalize_nl},
|
||||
{"stop", slot.params.antiprompt},
|
||||
{"max_tokens", slot.params.n_predict}, // User configured n_predict
|
||||
{"n_keep", slot.params.n_keep},
|
||||
{"n_discard", slot.params.n_discard},
|
||||
{"ignore_eos", slot.sparams.ignore_eos},
|
||||
{"ignore_eos", slot.params.sampling.ignore_eos},
|
||||
{"stream", slot.params.stream},
|
||||
//{"logit_bias", slot.sparams.logit_bias},
|
||||
{"n_probs", slot.sparams.n_probs},
|
||||
{"min_keep", slot.sparams.min_keep},
|
||||
{"grammar", slot.sparams.grammar},
|
||||
//{"logit_bias", slot.params.sampling.logit_bias},
|
||||
{"n_probs", slot.params.sampling.n_probs},
|
||||
{"min_keep", slot.params.sampling.min_keep},
|
||||
{"grammar", slot.params.sampling.grammar},
|
||||
{"samplers", samplers},
|
||||
{"speculative", slot.can_speculate()},
|
||||
{"speculative.n_max", slot.params.speculative.n_max},
|
||||
{"speculative.n_min", slot.params.speculative.n_min},
|
||||
{"speculative.p_min", slot.params.speculative.p_min},
|
||||
};
|
||||
}
|
||||
|
||||
@@ -1216,7 +1315,7 @@ struct server_context {
|
||||
{"index", slot.index},
|
||||
};
|
||||
|
||||
if (slot.sparams.n_probs > 0) {
|
||||
if (slot.params.sampling.n_probs > 0) {
|
||||
const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
|
||||
const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
|
||||
const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
|
||||
@@ -1249,7 +1348,7 @@ struct server_context {
|
||||
{"content", !slot.params.stream ? slot.generated_text : ""},
|
||||
{"id_slot", slot.id},
|
||||
{"stop", true},
|
||||
{"model", params.model_alias},
|
||||
{"model", params_base.model_alias},
|
||||
{"tokens_predicted", slot.n_decoded},
|
||||
{"tokens_evaluated", slot.n_prompt_tokens},
|
||||
{"generation_settings", get_formated_generation(slot)},
|
||||
@@ -1265,7 +1364,7 @@ struct server_context {
|
||||
{"index", slot.index},
|
||||
};
|
||||
|
||||
if (slot.sparams.n_probs > 0) {
|
||||
if (slot.params.sampling.n_probs > 0) {
|
||||
std::vector<completion_token_output> probs;
|
||||
if (!slot.params.stream && slot.stopped_word) {
|
||||
const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
|
||||
@@ -1422,10 +1521,10 @@ struct server_context {
|
||||
data.at("input_prefix"),
|
||||
data.at("input_suffix"),
|
||||
data.at("input_extra"),
|
||||
params.n_batch,
|
||||
params.n_predict,
|
||||
params_base.n_batch,
|
||||
params_base.n_predict,
|
||||
slots[0].n_ctx, // TODO: there should be a better way
|
||||
params.spm_infill,
|
||||
params_base.spm_infill,
|
||||
tokenized_prompts[i]
|
||||
);
|
||||
create_task(data, tokens);
|
||||
@@ -1798,7 +1897,7 @@ struct server_context {
|
||||
// TODO: simplify and improve
|
||||
for (server_slot & slot : slots) {
|
||||
if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
|
||||
if (!params.ctx_shift) {
|
||||
if (!params_base.ctx_shift) {
|
||||
// this check is redundant (for good)
|
||||
// we should never get here, because generation should already stopped in process_token()
|
||||
slot.release();
|
||||
@@ -1864,7 +1963,7 @@ struct server_context {
|
||||
int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
|
||||
|
||||
// next, batch any pending prompts without exceeding n_batch
|
||||
if (params.cont_batching || batch.n_tokens == 0) {
|
||||
if (params_base.cont_batching || batch.n_tokens == 0) {
|
||||
for (auto & slot : slots) {
|
||||
// this slot still has a prompt to be processed
|
||||
if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
|
||||
@@ -1917,7 +2016,7 @@ struct server_context {
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if (!params.ctx_shift) {
|
||||
if (!params_base.ctx_shift) {
|
||||
// if context shift is disabled, we make sure prompt size is smaller than KV size
|
||||
// TODO: there should be a separate parameter that control prompt truncation
|
||||
// context shift should be applied only during the generation phase
|
||||
@@ -1960,14 +2059,14 @@ struct server_context {
|
||||
|
||||
if (slot.params.cache_prompt) {
|
||||
// reuse any previously computed tokens that are common with the new prompt
|
||||
slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
|
||||
slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);
|
||||
|
||||
// reuse chunks from the cached prompt by shifting their KV cache in the new position
|
||||
if (params.n_cache_reuse > 0) {
|
||||
if (params_base.n_cache_reuse > 0) {
|
||||
size_t head_c = slot.n_past; // cache
|
||||
size_t head_p = slot.n_past; // current prompt
|
||||
|
||||
SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past);
|
||||
SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past);
|
||||
|
||||
while (head_c < slot.cache_tokens.size() &&
|
||||
head_p < prompt_tokens.size()) {
|
||||
@@ -1980,7 +2079,7 @@ struct server_context {
|
||||
n_match++;
|
||||
}
|
||||
|
||||
if (n_match >= (size_t) params.n_cache_reuse) {
|
||||
if (n_match >= (size_t) params_base.n_cache_reuse) {
|
||||
SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
|
||||
//for (size_t i = head_p; i < head_p + n_match; i++) {
|
||||
// SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
|
||||
@@ -2168,8 +2267,9 @@ struct server_context {
|
||||
continue; // continue loop of slots
|
||||
}
|
||||
|
||||
completion_token_output result;
|
||||
const llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
|
||||
llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
|
||||
|
||||
slot.i_batch = -1;
|
||||
|
||||
common_sampler_accept(slot.smpl, id, true);
|
||||
|
||||
@@ -2180,14 +2280,15 @@ struct server_context {
|
||||
metrics.on_prompt_eval(slot);
|
||||
}
|
||||
|
||||
completion_token_output result;
|
||||
result.tok = id;
|
||||
|
||||
const auto * cur_p = common_sampler_get_candidates(slot.smpl);
|
||||
|
||||
for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
|
||||
for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p->data[i].id,
|
||||
i >= cur_p->size ? 0.0f : cur_p->data[i].p,
|
||||
i >= cur_p->size ? 0.0f : cur_p->data[i].p,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -2197,9 +2298,67 @@ struct server_context {
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
metrics.on_prediction(slot);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// do speculative decoding
|
||||
for (auto & slot : slots) {
|
||||
if (!slot.is_processing() || !slot.can_speculate()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
slot.i_batch = -1;
|
||||
llama_token id = slot.sampled;
|
||||
|
||||
struct common_speculative_params params_spec;
|
||||
params_spec.n_draft = slot.params.speculative.n_max;
|
||||
params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
|
||||
params_spec.p_min = slot.params.speculative.p_min;
|
||||
|
||||
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
|
||||
|
||||
// ignore small drafts
|
||||
if (slot.params.speculative.n_min > (int) draft.size()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// construct the speculation batch
|
||||
common_batch_clear(slot.batch_spec);
|
||||
common_batch_add (slot.batch_spec, id, slot.n_past, { slot.id }, true);
|
||||
|
||||
for (size_t i = 0; i < draft.size(); ++i) {
|
||||
common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
|
||||
}
|
||||
|
||||
llama_decode(ctx, slot.batch_spec);
|
||||
|
||||
// the accepted tokens from the speculation
|
||||
const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
|
||||
|
||||
slot.n_past += ids.size();
|
||||
slot.n_decoded += ids.size();
|
||||
|
||||
slot.cache_tokens.push_back(id);
|
||||
slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
|
||||
|
||||
for (size_t i = 0; i < ids.size(); ++i) {
|
||||
completion_token_output result;
|
||||
|
||||
result.tok = ids[i];
|
||||
|
||||
if (!process_token(result, slot)) {
|
||||
// release slot because of stop condition
|
||||
slot.release();
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
metrics.on_prediction(slot);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
SRV_DBG("accepted %d/%d draft tokens\n", (int) ids.size() - 1, (int) draft.size());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2697,7 +2856,7 @@ int main(int argc, char ** argv) {
|
||||
const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
|
||||
json data = {
|
||||
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
|
||||
{ "total_slots", ctx_server.params.n_parallel },
|
||||
{ "total_slots", ctx_server.params_base.n_parallel },
|
||||
{ "chat_template", llama_get_chat_template(ctx_server.model) },
|
||||
};
|
||||
|
||||
@@ -2705,7 +2864,7 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
||||
if (!ctx_server.params.endpoint_props) {
|
||||
if (!ctx_server.params_base.endpoint_props) {
|
||||
res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
@@ -2718,7 +2877,7 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
|
||||
if (ctx_server.params.embedding) {
|
||||
if (ctx_server.params_base.embedding) {
|
||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
@@ -2824,7 +2983,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// TODO: maybe merge this function with "handle_completions_generic"
|
||||
const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
|
||||
if (ctx_server.params.embedding) {
|
||||
if (ctx_server.params_base.embedding) {
|
||||
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
@@ -3001,7 +3160,7 @@ int main(int argc, char ** argv) {
|
||||
};
|
||||
|
||||
const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
||||
if (!ctx_server.params.reranking || ctx_server.params.embedding) {
|
||||
if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) {
|
||||
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -24,7 +24,6 @@
|
||||
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
using llama_tokens = std::vector<llama_token>;
|
||||
|
||||
#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
|
||||
#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
|
||||
@@ -439,62 +438,6 @@ static std::string gen_chatcmplid() {
|
||||
// other common utils
|
||||
//
|
||||
|
||||
static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) {
|
||||
size_t i;
|
||||
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) {
|
||||
// check for empty sequences
|
||||
if (a.empty() || b.empty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// get the lengths of the input sequences
|
||||
size_t a_len = a.size();
|
||||
size_t b_len = b.size();
|
||||
|
||||
// initialize the maximum length of the longest common subsequence (LCS)
|
||||
size_t max_length = 0;
|
||||
|
||||
// use two rows instead of a 2D matrix to optimize space
|
||||
std::vector<size_t> prev_row(b_len + 1, 0);
|
||||
std::vector<size_t> curr_row(b_len + 1, 0);
|
||||
|
||||
// iterate through the elements of a
|
||||
for (size_t i = 1; i <= a_len; i++) {
|
||||
// iterate through the elements of b
|
||||
for (size_t j = 1; j <= b_len; j++) {
|
||||
// if elements at the current positions match
|
||||
if (a[i - 1] == b[j - 1]) {
|
||||
// if it's the first element of either sequences, set LCS length to 1
|
||||
if (i == 1 || j == 1) {
|
||||
curr_row[j] = 1;
|
||||
} else {
|
||||
// increment LCS length by 1 compared to the previous element
|
||||
curr_row[j] = prev_row[j - 1] + 1;
|
||||
}
|
||||
|
||||
// update max_length if necessary
|
||||
if (curr_row[j] > max_length) {
|
||||
max_length = curr_row[j];
|
||||
}
|
||||
} else {
|
||||
// reset LCS length if elements don't match
|
||||
curr_row[j] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// update the previous row for the next iteration
|
||||
prev_row = curr_row;
|
||||
}
|
||||
|
||||
// return the maximum length of the LCS
|
||||
return max_length;
|
||||
}
|
||||
|
||||
static bool ends_with(const std::string & str, const std::string & suffix) {
|
||||
return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
|
||||
}
|
||||
|
||||
@@ -62,6 +62,9 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}, nullptr);
|
||||
|
||||
// load dynamic backends
|
||||
ggml_backend_load_all();
|
||||
|
||||
// initialize the model
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
model_params.n_gpu_layers = ngl;
|
||||
|
||||
@@ -74,6 +74,10 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
// load dynamic backends
|
||||
|
||||
ggml_backend_load_all();
|
||||
|
||||
// initialize the model
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
5
examples/speculative-simple/CMakeLists.txt
Normal file
5
examples/speculative-simple/CMakeLists.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
set(TARGET llama-speculative-simple)
|
||||
add_executable(${TARGET} speculative-simple.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
12
examples/speculative-simple/README.md
Normal file
12
examples/speculative-simple/README.md
Normal file
@@ -0,0 +1,12 @@
|
||||
# llama.cpp/examples/speculative-simple
|
||||
|
||||
Demonstration of basic greedy speculative decoding
|
||||
|
||||
```bash
|
||||
./bin/llama-speculative-simple \
|
||||
-m ../models/qwen2.5-32b-coder-instruct/ggml-model-q8_0.gguf \
|
||||
-md ../models/qwen2.5-1.5b-coder-instruct/ggml-model-q4_0.gguf \
|
||||
-f test.txt -c 0 -ngl 99 --color \
|
||||
--sampling-seq k --top-k 1 -fa --temp 0.0 \
|
||||
-ngld 99 --draft-max 16 --draft-min 5 --draft-p-min 0.9
|
||||
```
|
||||
265
examples/speculative-simple/speculative-simple.cpp
Normal file
265
examples/speculative-simple/speculative-simple.cpp
Normal file
@@ -0,0 +1,265 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "speculative.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.n_predict < -1) {
|
||||
LOG_ERR("%s: --n-predict must be >= -1\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.speculative.model.empty()) {
|
||||
LOG_ERR("%s: --model-draft is required\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
llama_model * model_tgt = NULL;
|
||||
llama_model * model_dft = NULL;
|
||||
|
||||
llama_context * ctx_tgt = NULL;
|
||||
llama_context * ctx_dft = NULL;
|
||||
|
||||
// load the target model
|
||||
common_init_result llama_init_tgt = common_init_from_params(params);
|
||||
|
||||
model_tgt = llama_init_tgt.model;
|
||||
ctx_tgt = llama_init_tgt.context;
|
||||
|
||||
// load the draft model
|
||||
params.devices = params.speculative.devices;
|
||||
params.model = params.speculative.model;
|
||||
params.n_ctx = params.speculative.n_ctx;
|
||||
params.n_batch = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_batch;
|
||||
params.n_gpu_layers = params.speculative.n_gpu_layers;
|
||||
|
||||
if (params.speculative.cpuparams.n_threads > 0) {
|
||||
params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
|
||||
}
|
||||
|
||||
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
|
||||
common_init_result llama_init_dft = common_init_from_params(params);
|
||||
|
||||
model_dft = llama_init_dft.model;
|
||||
ctx_dft = llama_init_dft.context;
|
||||
|
||||
if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Tokenize the prompt
|
||||
std::vector<llama_token> inp;
|
||||
inp = common_tokenize(ctx_tgt, params.prompt, true, true);
|
||||
|
||||
if (llama_n_ctx(ctx_tgt) < (int) inp.size()) {
|
||||
LOG_ERR("%s: the prompt exceeds the context size (%d tokens, ctx %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt));
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (llama_n_batch(ctx_tgt) < (int) inp.size()) {
|
||||
LOG_ERR("%s: the prompt exceeds the batch size (%d tokens, batch %d)\n", __func__, (int) inp.size(), llama_n_batch(ctx_tgt));
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
LOG("\n\n");
|
||||
|
||||
for (auto id : inp) {
|
||||
LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
|
||||
}
|
||||
|
||||
// how many tokens to draft each time
|
||||
int n_draft = params.speculative.n_max;
|
||||
int n_draft_min = params.speculative.n_min;
|
||||
|
||||
float p_min = params.speculative.p_min;
|
||||
|
||||
int n_predict = 0;
|
||||
int n_drafted = 0;
|
||||
int n_accept = 0;
|
||||
|
||||
// used to determine end of generation
|
||||
bool has_eos = false;
|
||||
|
||||
// ================================================
|
||||
// everything until here is standard initialization
|
||||
// the relevant stuff for speculative decoding starts here
|
||||
|
||||
const auto t_enc_start = ggml_time_us();
|
||||
|
||||
// target model sampling context
|
||||
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
|
||||
|
||||
// eval the prompt
|
||||
llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
|
||||
|
||||
// note: keep the last token separate!
|
||||
llama_token id_last = inp.back();
|
||||
|
||||
// all tokens currently in the target context
|
||||
llama_tokens prompt_tgt(inp.begin(), inp.end() - 1);
|
||||
prompt_tgt.reserve(llama_n_ctx(ctx_tgt));
|
||||
|
||||
int n_past = inp.size() - 1;
|
||||
|
||||
// init the speculator
|
||||
struct common_speculative_params params_spec;
|
||||
params_spec.n_draft = n_draft;
|
||||
params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft;
|
||||
params_spec.p_min = p_min;
|
||||
|
||||
struct common_speculative * spec = common_speculative_init(ctx_dft);
|
||||
|
||||
llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
|
||||
|
||||
const auto t_enc_end = ggml_time_us();
|
||||
|
||||
const auto t_dec_start = ggml_time_us();
|
||||
|
||||
while (true) {
|
||||
// optionally, generate draft tokens that can be appended to the target batch
|
||||
//
|
||||
// this is the most important part of the speculation. the more probable tokens that are provided here
|
||||
// the better the performance will be. in theory, this computation can be performed asynchronously and even
|
||||
// offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens
|
||||
// from a cache or lookup tables.
|
||||
//
|
||||
llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last);
|
||||
|
||||
//LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());
|
||||
|
||||
// always have a token to evaluate from before - id_last
|
||||
common_batch_clear(batch_tgt);
|
||||
common_batch_add (batch_tgt, id_last, n_past++, { 0 }, true);
|
||||
|
||||
// evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
|
||||
{
|
||||
// do not waste time on small drafts
|
||||
if (draft.size() < n_draft_min) {
|
||||
draft.clear();
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < draft.size(); ++i) {
|
||||
common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
|
||||
}
|
||||
|
||||
//LOG_DBG("target batch: %s\n", string_from(ctx_tgt, batch_tgt).c_str());
|
||||
|
||||
llama_decode(ctx_tgt, batch_tgt);
|
||||
}
|
||||
|
||||
// sample from the full target batch and return the accepted tokens based on the target sampler
|
||||
//
|
||||
// for each token to be accepted, the sampler would have to sample that same token
|
||||
// in such cases, instead of decoding the sampled token as we normally do, we simply continue with the
|
||||
// available logits from the batch and sample the next token until we run out of logits or the sampler
|
||||
// disagrees with the draft
|
||||
//
|
||||
const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);
|
||||
|
||||
//LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str());
|
||||
|
||||
GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
|
||||
|
||||
n_past += ids.size() - 1;
|
||||
n_drafted += draft.size(); // note: we ignore the discarded small drafts
|
||||
n_accept += ids.size() - 1;
|
||||
n_predict += ids.size();
|
||||
|
||||
// process the accepted tokens and update contexts
|
||||
//
|
||||
// this is the standard token post-processing that we normally do
|
||||
// in this case, we do it for a group of accepted tokens at once
|
||||
//
|
||||
for (size_t i = 0; i < ids.size(); ++i) {
|
||||
prompt_tgt.push_back(id_last);
|
||||
|
||||
id_last = ids[i];
|
||||
|
||||
if (llama_token_is_eog(model_tgt, id_last)) {
|
||||
has_eos = true;
|
||||
break;
|
||||
}
|
||||
|
||||
const std::string token_str = common_token_to_piece(ctx_tgt, id_last);
|
||||
|
||||
if (params.use_color && i + 1 < ids.size()) {
|
||||
LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
|
||||
} else {
|
||||
LOG("%s", token_str.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);
|
||||
|
||||
{
|
||||
LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
|
||||
}
|
||||
|
||||
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
auto t_dec_end = ggml_time_us();
|
||||
|
||||
const int n_input = inp.size();
|
||||
|
||||
LOG("\n\n");
|
||||
|
||||
LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||
|
||||
LOG_INF("\n");
|
||||
LOG_INF("n_draft = %d\n", n_draft);
|
||||
LOG_INF("n_predict = %d\n", n_predict);
|
||||
LOG_INF("n_drafted = %d\n", n_drafted);
|
||||
LOG_INF("n_accept = %d\n", n_accept);
|
||||
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||
|
||||
LOG_INF("\n");
|
||||
LOG_INF("draft:\n\n");
|
||||
|
||||
llama_perf_context_print(ctx_dft);
|
||||
|
||||
LOG_INF("\n");
|
||||
LOG_INF("target:\n\n");
|
||||
common_perf_print(ctx_tgt, smpl);
|
||||
|
||||
common_sampler_free(smpl);
|
||||
common_speculative_free(spec);
|
||||
|
||||
llama_free(ctx_tgt);
|
||||
llama_free_model(model_tgt);
|
||||
|
||||
llama_free(ctx_dft);
|
||||
llama_free_model(model_dft);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
LOG("\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -12,7 +12,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
|
||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||
|
||||
struct seq_draft {
|
||||
@@ -33,7 +33,7 @@ int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
// needed to get candidate probs even for temp <= 0.0
|
||||
params.sparams.n_probs = 128;
|
||||
params.sampling.n_probs = 128;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
|
||||
return 1;
|
||||
@@ -46,7 +46,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.model_draft.empty()) {
|
||||
if (params.speculative.model.empty()) {
|
||||
LOG_ERR("%s: --model-draft is required\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -55,9 +55,9 @@ int main(int argc, char ** argv) {
|
||||
const int n_seq_dft = params.n_parallel;
|
||||
|
||||
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
|
||||
const float p_split = params.p_split;
|
||||
const float p_draft_split = params.speculative.p_split;
|
||||
|
||||
std::default_random_engine rng(params.sparams.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sparams.seed);
|
||||
std::default_random_engine rng(params.sampling.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sampling.seed);
|
||||
std::uniform_real_distribution<> u_dist;
|
||||
|
||||
// init llama.cpp
|
||||
@@ -76,13 +76,14 @@ int main(int argc, char ** argv) {
|
||||
ctx_tgt = llama_init_tgt.context;
|
||||
|
||||
// load the draft model
|
||||
params.model = params.model_draft;
|
||||
params.n_gpu_layers = params.n_gpu_layers_draft;
|
||||
if (params.draft_cpuparams.n_threads > 0) {
|
||||
params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
|
||||
params.devices = params.speculative.devices;
|
||||
params.model = params.speculative.model;
|
||||
params.n_gpu_layers = params.speculative.n_gpu_layers;
|
||||
if (params.speculative.cpuparams.n_threads > 0) {
|
||||
params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
|
||||
}
|
||||
|
||||
params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
|
||||
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
|
||||
common_init_result llama_init_dft = common_init_from_params(params);
|
||||
model_dft = llama_init_dft.model;
|
||||
ctx_dft = llama_init_dft.context;
|
||||
@@ -170,7 +171,7 @@ int main(int argc, char ** argv) {
|
||||
//GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));
|
||||
|
||||
// how many tokens to draft each time
|
||||
int n_draft = params.n_draft;
|
||||
int n_draft = params.speculative.n_max;
|
||||
|
||||
int n_predict = 0;
|
||||
int n_drafted = 0;
|
||||
@@ -183,14 +184,14 @@ int main(int argc, char ** argv) {
|
||||
bool has_eos = false;
|
||||
|
||||
// target model sampling context (reuse the llama_context's sampling instance)
|
||||
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams);
|
||||
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
|
||||
|
||||
// draft sequence data
|
||||
std::vector<seq_draft> drafts(n_seq_dft);
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
// allocate llama_sampler for each draft sequence
|
||||
drafts[s].smpl = common_sampler_init(model_dft, params.sparams);
|
||||
drafts[s].smpl = common_sampler_init(model_dft, params.sampling);
|
||||
}
|
||||
|
||||
llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
|
||||
@@ -230,7 +231,7 @@ int main(int argc, char ** argv) {
|
||||
// for stochastic sampling, attempt to match the token with the drafted tokens
|
||||
{
|
||||
bool accept = false;
|
||||
if (params.sparams.temp > 0) {
|
||||
if (params.sampling.temp > 0) {
|
||||
// stochastic verification
|
||||
common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
|
||||
|
||||
@@ -494,7 +495,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// attempt to split the branch if the probability is high enough
|
||||
for (int f = 1; f < 8; ++f) {
|
||||
if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
|
||||
if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
|
||||
LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
|
||||
|
||||
6
flake.lock
generated
6
flake.lock
generated
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1731676054,
|
||||
"narHash": "sha256-OZiZ3m8SCMfh3B6bfGC/Bm4x3qc1m2SVEAlkV6iY7Yg=",
|
||||
"lastModified": 1732014248,
|
||||
"narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "5e4fbfb6b3de1aa2872b76d49fafc942626e2add",
|
||||
"rev": "23e89b7da85c3640bbc2173fe04f4bd114342367",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
@@ -33,6 +33,7 @@ else()
|
||||
endif()
|
||||
|
||||
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
|
||||
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
|
||||
|
||||
#
|
||||
# option list
|
||||
|
||||
@@ -190,6 +190,14 @@ extern "C" {
|
||||
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
|
||||
// Get additional buffer types provided by the device (returns a NULL-terminated array)
|
||||
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
|
||||
// Set the abort callback for the backend
|
||||
typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||
// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
|
||||
struct ggml_backend_feature {
|
||||
const char * name;
|
||||
const char * value;
|
||||
};
|
||||
typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
|
||||
|
||||
//
|
||||
// Backend registry
|
||||
@@ -214,6 +222,13 @@ extern "C" {
|
||||
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
|
||||
GGML_API ggml_backend_t ggml_backend_init_best(void);
|
||||
|
||||
// Load a backend from a dynamic library and register it
|
||||
GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
|
||||
// Unload a backend if loaded dynamically and unregister it
|
||||
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
|
||||
// Load all known backends from dynamic libraries
|
||||
GGML_API void ggml_backend_load_all(void);
|
||||
|
||||
//
|
||||
// Backend scheduler
|
||||
//
|
||||
|
||||
@@ -7,29 +7,6 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Scheduling priorities
|
||||
enum ggml_sched_priority {
|
||||
GGML_SCHED_PRIO_NORMAL,
|
||||
GGML_SCHED_PRIO_MEDIUM,
|
||||
GGML_SCHED_PRIO_HIGH,
|
||||
GGML_SCHED_PRIO_REALTIME
|
||||
};
|
||||
|
||||
// Threadpool params
|
||||
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
|
||||
struct ggml_threadpool_params {
|
||||
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
|
||||
int n_threads; // number of threads
|
||||
enum ggml_sched_priority prio; // thread priority
|
||||
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
||||
bool strict_cpu; // strict cpu placement
|
||||
bool paused; // start in paused state
|
||||
};
|
||||
|
||||
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
|
||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||
// since https://github.com/ggerganov/ggml/issues/287
|
||||
struct ggml_cplan {
|
||||
@@ -75,14 +52,11 @@ extern "C" {
|
||||
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
||||
|
||||
GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||
GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
@@ -104,10 +78,10 @@ extern "C" {
|
||||
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_fma (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
|
||||
|
||||
@@ -2215,6 +2215,37 @@ extern "C" {
|
||||
|
||||
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
|
||||
|
||||
// ggml threadpool
|
||||
// TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
|
||||
// the goal should be to create an API that other backends can use move everything to the ggml base
|
||||
|
||||
// scheduling priorities
|
||||
enum ggml_sched_priority {
|
||||
GGML_SCHED_PRIO_NORMAL,
|
||||
GGML_SCHED_PRIO_MEDIUM,
|
||||
GGML_SCHED_PRIO_HIGH,
|
||||
GGML_SCHED_PRIO_REALTIME
|
||||
};
|
||||
|
||||
// threadpool params
|
||||
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
|
||||
struct ggml_threadpool_params {
|
||||
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
|
||||
int n_threads; // number of threads
|
||||
enum ggml_sched_priority prio; // thread priority
|
||||
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
||||
bool strict_cpu; // strict cpu placement
|
||||
bool paused; // start in paused state
|
||||
};
|
||||
|
||||
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -202,6 +202,10 @@ endif()
|
||||
|
||||
# ggml
|
||||
|
||||
if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS)
|
||||
message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS")
|
||||
endif()
|
||||
|
||||
add_library(ggml-base
|
||||
../include/ggml.h
|
||||
../include/ggml-alloc.h
|
||||
@@ -226,6 +230,31 @@ add_library(ggml
|
||||
|
||||
target_link_libraries(ggml PUBLIC ggml-base)
|
||||
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
target_link_libraries(ggml PRIVATE dl)
|
||||
endif()
|
||||
|
||||
function(ggml_add_backend_library backend)
|
||||
if (GGML_BACKEND_DL)
|
||||
add_library(${backend} MODULE ${ARGN})
|
||||
# write the shared library to the output directory
|
||||
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
|
||||
else()
|
||||
add_library(${backend} ${ARGN})
|
||||
target_link_libraries(ggml PUBLIC ${backend})
|
||||
install(TARGETS ${backend} LIBRARY)
|
||||
endif()
|
||||
|
||||
target_link_libraries(${backend} PRIVATE ggml-base)
|
||||
target_include_directories(${backend} PRIVATE ..)
|
||||
|
||||
if (${BUILD_SHARED_LIBS})
|
||||
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD)
|
||||
target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
function(ggml_add_backend backend)
|
||||
string(TOUPPER "GGML_${backend}" backend_id)
|
||||
if (${backend_id})
|
||||
@@ -236,14 +265,10 @@ function(ggml_add_backend backend)
|
||||
# however, currently it is necessary for AMX, since it is enabled by default on llama.cpp
|
||||
if (${backend_id})
|
||||
message(STATUS "Including ${backend} backend")
|
||||
if (${BUILD_SHARED_LIBS})
|
||||
target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD)
|
||||
target_compile_definitions(${backend_target} PUBLIC GGML_BACKEND_SHARED)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
string(TOUPPER "GGML_USE_${backend}" backend_use)
|
||||
target_compile_definitions(ggml PUBLIC ${backend_use})
|
||||
endif()
|
||||
install(TARGETS ${backend_target} LIBRARY)
|
||||
target_link_libraries(ggml PUBLIC ${backend_target})
|
||||
string(TOUPPER "GGML_USE_${backend}" backend_use)
|
||||
target_compile_definitions(ggml PUBLIC ${backend_use})
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
||||
@@ -256,10 +281,10 @@ ggml_add_backend(CUDA)
|
||||
ggml_add_backend(HIP)
|
||||
ggml_add_backend(Kompute)
|
||||
ggml_add_backend(METAL)
|
||||
ggml_add_backend(MUSA)
|
||||
ggml_add_backend(RPC)
|
||||
ggml_add_backend(SYCL)
|
||||
ggml_add_backend(Vulkan)
|
||||
ggml_add_backend(MUSA)
|
||||
|
||||
foreach (target ggml-base ggml)
|
||||
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
|
||||
|
||||
@@ -9,12 +9,10 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MA
|
||||
|
||||
file(GLOB GGML_SOURCES_AMX "*.cpp")
|
||||
|
||||
add_library(ggml-amx
|
||||
${GGML_HEADERS_AMX}
|
||||
${GGML_SOURCES_AMX})
|
||||
|
||||
target_link_libraries(ggml-amx PRIVATE ggml-base)
|
||||
target_include_directories(ggml-amx PRIVATE . ..)
|
||||
ggml_add_backend_library(ggml-amx
|
||||
${GGML_HEADERS_AMX}
|
||||
${GGML_SOURCES_AMX}
|
||||
)
|
||||
|
||||
# this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
|
||||
# TODO: integrate AMX backend into the CPU backend
|
||||
|
||||
@@ -409,8 +409,9 @@ static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {
|
||||
|
||||
ggml_backend_reg_t ggml_backend_amx_reg(void) {
|
||||
static struct ggml_backend_reg ggml_backend_amx_reg = {
|
||||
/* .iface = */ ggml_backend_amx_reg_i,
|
||||
/* .context = */ NULL,
|
||||
/* .api_version = */ GGML_BACKEND_API_VERSION,
|
||||
/* .iface = */ ggml_backend_amx_reg_i,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_amx_reg;
|
||||
@@ -444,3 +445,5 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) {
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg)
|
||||
|
||||
@@ -8,6 +8,8 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_BACKEND_API_VERSION 1
|
||||
|
||||
//
|
||||
// Backend buffer type
|
||||
//
|
||||
@@ -63,20 +65,20 @@ extern "C" {
|
||||
enum ggml_backend_buffer_usage usage;
|
||||
};
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
ggml_backend_buffer_type_t buft,
|
||||
struct ggml_backend_buffer_i iface,
|
||||
void * context,
|
||||
size_t size);
|
||||
|
||||
// do not use directly, use ggml_backend_tensor_copy instead
|
||||
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
// multi-buffer
|
||||
// buffer that contains a collection of buffers
|
||||
ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
|
||||
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
|
||||
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
|
||||
GGML_API bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
|
||||
//
|
||||
// Backend (stream)
|
||||
@@ -199,17 +201,37 @@ extern "C" {
|
||||
};
|
||||
|
||||
struct ggml_backend_reg {
|
||||
// int api_version; // TODO: for dynamic loading
|
||||
int api_version; // initialize to GGML_BACKEND_API_VERSION
|
||||
struct ggml_backend_reg_i iface;
|
||||
void * context;
|
||||
};
|
||||
|
||||
|
||||
// Internal backend registry API
|
||||
void ggml_backend_register(ggml_backend_reg_t reg);
|
||||
void ggml_backend_device_register(ggml_backend_dev_t device);
|
||||
// TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
|
||||
// typedef ggml_backend_register_t * (*ggml_backend_init)(void);
|
||||
GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
|
||||
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
|
||||
|
||||
// Add backend dynamic loading support to the backend
|
||||
typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
|
||||
|
||||
#ifdef GGML_BACKEND_DL
|
||||
#ifdef __cplusplus
|
||||
# define GGML_BACKEND_DL_IMPL(reg_fn) \
|
||||
extern "C" { \
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
|
||||
} \
|
||||
ggml_backend_reg_t ggml_backend_init(void) { \
|
||||
return reg_fn(); \
|
||||
}
|
||||
#else
|
||||
# define GGML_BACKEND_DL_IMPL(reg_fn) \
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
|
||||
ggml_backend_reg_t ggml_backend_init(void) { \
|
||||
return reg_fn(); \
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
# define GGML_BACKEND_DL_IMPL(reg_fn)
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
@@ -1,11 +1,29 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-impl.h"
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#ifdef _WIN32
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
#elif defined(__APPLE__)
|
||||
# include <mach-o/dyld.h>
|
||||
# include <dlfcn.h>
|
||||
#else
|
||||
# include <dlfcn.h>
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
|
||||
// Backend registry
|
||||
#ifdef GGML_USE_CPU
|
||||
#include "ggml-cpu.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
#include "ggml-cuda.h"
|
||||
@@ -43,8 +61,13 @@
|
||||
#include "ggml-kompute.h"
|
||||
#endif
|
||||
|
||||
struct ggml_backend_reg_entry {
|
||||
ggml_backend_reg_t reg;
|
||||
void * handle;
|
||||
};
|
||||
|
||||
struct ggml_backend_registry {
|
||||
std::vector<ggml_backend_reg_t> backends;
|
||||
std::vector<ggml_backend_reg_entry> backends;
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
|
||||
ggml_backend_registry() {
|
||||
@@ -75,11 +98,19 @@ struct ggml_backend_registry {
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
register_backend(ggml_backend_kompute_reg());
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU
|
||||
register_backend(ggml_backend_cpu_reg());
|
||||
#endif
|
||||
}
|
||||
|
||||
void register_backend(ggml_backend_reg_t reg) {
|
||||
~ggml_backend_registry() {
|
||||
while (!backends.empty()) {
|
||||
// use silent since the log system may have been destroyed at this point
|
||||
unload_backend(backends.back().reg, true);
|
||||
}
|
||||
}
|
||||
|
||||
void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) {
|
||||
if (!reg) {
|
||||
return;
|
||||
}
|
||||
@@ -88,7 +119,7 @@ struct ggml_backend_registry {
|
||||
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
|
||||
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
|
||||
#endif
|
||||
backends.push_back(reg);
|
||||
backends.push_back({ reg, handle });
|
||||
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
||||
register_device(ggml_backend_reg_dev_get(reg, i));
|
||||
}
|
||||
@@ -100,6 +131,111 @@ struct ggml_backend_registry {
|
||||
#endif
|
||||
devices.push_back(device);
|
||||
}
|
||||
|
||||
ggml_backend_reg_t load_backend(const char * path, bool silent) {
|
||||
#ifdef _WIN32
|
||||
// suppress error dialogs for missing DLLs
|
||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
|
||||
HMODULE handle = LoadLibraryA(path);
|
||||
|
||||
if (!handle) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError());
|
||||
}
|
||||
SetErrorMode(old_mode);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init");
|
||||
|
||||
SetErrorMode(old_mode);
|
||||
|
||||
if (!backend_init) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError());
|
||||
}
|
||||
FreeLibrary(handle);
|
||||
return nullptr;
|
||||
}
|
||||
#else
|
||||
void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
|
||||
|
||||
if (!handle) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init");
|
||||
|
||||
if (!backend_init) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
|
||||
}
|
||||
dlclose(handle);
|
||||
return nullptr;
|
||||
}
|
||||
#endif
|
||||
ggml_backend_reg_t reg = backend_init();
|
||||
|
||||
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
|
||||
if (!silent) {
|
||||
if (!reg) {
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
|
||||
} else {
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
|
||||
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
|
||||
}
|
||||
}
|
||||
#ifdef _WIN32
|
||||
FreeLibrary(handle);
|
||||
#else
|
||||
dlclose(handle);
|
||||
#endif
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
|
||||
register_backend(reg, handle);
|
||||
return reg;
|
||||
}
|
||||
|
||||
void unload_backend(ggml_backend_reg_t reg, bool silent) {
|
||||
auto it = std::find_if(backends.begin(), backends.end(),
|
||||
[reg](ggml_backend_reg_entry entry) { return entry.reg == reg; });
|
||||
|
||||
if (it == backends.end()) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: backend not found\n", __func__);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (!silent) {
|
||||
GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg));
|
||||
}
|
||||
|
||||
// remove devices
|
||||
devices.erase(
|
||||
std::remove_if(devices.begin(), devices.end(),
|
||||
[reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
|
||||
devices.end());
|
||||
|
||||
// unload library
|
||||
if (it->handle) {
|
||||
#ifdef _WIN32
|
||||
FreeLibrary((HMODULE) it->handle);
|
||||
#else
|
||||
dlclose(it->handle);
|
||||
#endif
|
||||
}
|
||||
|
||||
// remove backend
|
||||
backends.erase(it);
|
||||
}
|
||||
};
|
||||
|
||||
static ggml_backend_registry & get_reg() {
|
||||
@@ -117,23 +253,32 @@ void ggml_backend_device_register(ggml_backend_dev_t device) {
|
||||
}
|
||||
|
||||
// Backend (reg) enumeration
|
||||
static bool striequals(const char * a, const char * b) {
|
||||
for (; *a && *b; a++, b++) {
|
||||
if (std::tolower(*a) != std::tolower(*b)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return *a == *b;
|
||||
}
|
||||
|
||||
size_t ggml_backend_reg_count() {
|
||||
return get_reg().backends.size();
|
||||
}
|
||||
|
||||
ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
|
||||
GGML_ASSERT(index < ggml_backend_reg_count());
|
||||
return get_reg().backends[index];
|
||||
return get_reg().backends[index].reg;
|
||||
}
|
||||
|
||||
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
|
||||
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
||||
ggml_backend_reg_t reg = ggml_backend_reg_get(i);
|
||||
if (std::strcmp(ggml_backend_reg_name(reg), name) == 0) {
|
||||
if (striequals(ggml_backend_reg_name(reg), name)) {
|
||||
return reg;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Device enumeration
|
||||
@@ -149,11 +294,11 @@ ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
|
||||
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||
if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
|
||||
if (striequals(ggml_backend_dev_name(dev), name)) {
|
||||
return dev;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
|
||||
@@ -163,14 +308,14 @@ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
|
||||
return dev;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Convenience functions
|
||||
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
|
||||
if (!dev) {
|
||||
return NULL;
|
||||
return nullptr;
|
||||
}
|
||||
return ggml_backend_dev_init(dev, params);
|
||||
}
|
||||
@@ -178,7 +323,7 @@ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params)
|
||||
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
|
||||
if (!dev) {
|
||||
return NULL;
|
||||
return nullptr;
|
||||
}
|
||||
return ggml_backend_dev_init(dev, params);
|
||||
}
|
||||
@@ -189,7 +334,97 @@ ggml_backend_t ggml_backend_init_best(void) {
|
||||
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
}
|
||||
if (!dev) {
|
||||
return NULL;
|
||||
return nullptr;
|
||||
}
|
||||
return ggml_backend_dev_init(dev, NULL);
|
||||
return ggml_backend_dev_init(dev, nullptr);
|
||||
}
|
||||
|
||||
// Dynamic loading
|
||||
ggml_backend_reg_t ggml_backend_load(const char * path) {
|
||||
return get_reg().load_backend(path, false);
|
||||
}
|
||||
|
||||
void ggml_backend_unload(ggml_backend_reg_t reg) {
|
||||
get_reg().unload_backend(reg, true);
|
||||
}
|
||||
|
||||
void ggml_backend_load_all() {
|
||||
std::vector<std::string> search_prefix;
|
||||
|
||||
// add the executable directory to the search path
|
||||
// FIXME: this is convenient for development, but it should probably be disabled in production
|
||||
|
||||
#if defined(__APPLE__)
|
||||
// get executable path
|
||||
std::vector<char> path;
|
||||
uint32_t size;
|
||||
while (true) {
|
||||
size = path.size();
|
||||
if (_NSGetExecutablePath(path.data(), &size) == 0) {
|
||||
break;
|
||||
}
|
||||
path.resize(size);
|
||||
}
|
||||
std::string base_path(path.data(), size);
|
||||
// remove executable name
|
||||
auto last_slash = base_path.find_last_of('/');
|
||||
if (last_slash != std::string::npos) {
|
||||
base_path = base_path.substr(0, last_slash);
|
||||
}
|
||||
search_prefix.push_back(base_path + "/");
|
||||
#elif defined(__linux__)
|
||||
std::string base_path = ".";
|
||||
std::vector<char> path(1024);
|
||||
while (true) {
|
||||
// get executable path
|
||||
ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
|
||||
if (len == -1) {
|
||||
break;
|
||||
}
|
||||
if (len < (ssize_t) path.size()) {
|
||||
base_path = std::string(path.data(), len);
|
||||
// remove executable name
|
||||
auto last_slash = base_path.find_last_of('/');
|
||||
if (last_slash != std::string::npos) {
|
||||
base_path = base_path.substr(0, last_slash);
|
||||
}
|
||||
break;
|
||||
}
|
||||
path.resize(path.size() * 2);
|
||||
}
|
||||
|
||||
search_prefix.push_back(base_path + "/");
|
||||
#endif
|
||||
|
||||
auto & reg = get_reg();
|
||||
|
||||
auto try_load = [&](const std::string & name) {
|
||||
std::string os_name;
|
||||
#ifdef _WIN32
|
||||
os_name = "ggml-" + name + ".dll";
|
||||
#else
|
||||
os_name = "libggml-" + name + ".so";
|
||||
#endif
|
||||
if (reg.load_backend(os_name.c_str(), true)) {
|
||||
return;
|
||||
}
|
||||
for (const auto & prefix : search_prefix) {
|
||||
if (reg.load_backend((prefix + os_name).c_str(), true)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
try_load("amx");
|
||||
try_load("blas");
|
||||
try_load("cann");
|
||||
try_load("cuda");
|
||||
try_load("hip");
|
||||
try_load("kompute");
|
||||
try_load("metal");
|
||||
try_load("rpc");
|
||||
try_load("sycl");
|
||||
try_load("vulkan");
|
||||
try_load("musa");
|
||||
try_load("cpu");
|
||||
}
|
||||
|
||||
@@ -11,12 +11,9 @@ find_package(BLAS)
|
||||
if (BLAS_FOUND)
|
||||
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
|
||||
|
||||
add_library(ggml-blas
|
||||
ggml-blas.cpp
|
||||
)
|
||||
|
||||
target_link_libraries(ggml-blas PRIVATE ggml-base)
|
||||
target_include_directories(ggml-blas PRIVATE . ..)
|
||||
ggml_add_backend_library(ggml-blas
|
||||
ggml-blas.cpp
|
||||
)
|
||||
|
||||
if (${GGML_BLAS_VENDOR} MATCHES "Apple")
|
||||
add_compile_definitions(ACCELERATE_NEW_LAPACK)
|
||||
|
||||
@@ -506,9 +506,12 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
|
||||
|
||||
ggml_backend_reg_t ggml_backend_blas_reg(void) {
|
||||
static struct ggml_backend_reg ggml_backend_blas_reg = {
|
||||
/* .iface = */ ggml_backend_blas_reg_i,
|
||||
/* .context = */ NULL,
|
||||
/* .api_version = */ GGML_BACKEND_API_VERSION,
|
||||
/* .iface = */ ggml_backend_blas_reg_i,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_blas_reg;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)
|
||||
|
||||
@@ -61,9 +61,9 @@ if (CANN_INSTALL_DIR)
|
||||
|
||||
file(GLOB GGML_SOURCES_CANN "*.cpp")
|
||||
|
||||
add_library(ggml-cann ${GGML_SOURCES_CANN})
|
||||
target_link_libraries(ggml-cann PRIVATE ggml-base ${CANN_LIBRARIES})
|
||||
target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS})
|
||||
ggml_add_backend_library(ggml-cann ${GGML_SOURCES_CANN})
|
||||
target_link_libraries(ggml-cann PRIVATE ${CANN_LIBRARIES})
|
||||
target_include_directories(ggml-cann PRIVATE ${CANN_INCLUDE_DIRS})
|
||||
target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
|
||||
|
||||
target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
*/
|
||||
|
||||
#include "aclnn_ops.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#include <aclnnop/aclnn_avgpool2d.h>
|
||||
#include <aclnnop/aclnn_cast.h>
|
||||
@@ -32,6 +33,8 @@
|
||||
#include <aclnnop/aclnn_group_norm.h>
|
||||
#include <aclnnop/aclnn_index_fill_tensor.h>
|
||||
#include <aclnnop/aclnn_layer_norm.h>
|
||||
#include <aclnnop/aclnn_mm.h>
|
||||
#include <aclnnop/aclnn_batch_matmul.h>
|
||||
#include <aclnnop/aclnn_matmul.h>
|
||||
#include <aclnnop/aclnn_max_pool.h>
|
||||
#include <aclnnop/aclnn_permute.h>
|
||||
@@ -241,10 +244,14 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
int64_t concat_dim = 1;
|
||||
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
||||
|
||||
GGML_ASSERT(dim >= 0 && dim < 4);
|
||||
int32_t acl_dim = 3 - dim;
|
||||
|
||||
aclTensor* tensors[] = {acl_src0, acl_src1};
|
||||
aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
|
||||
aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
|
||||
aclnn_concat(ctx, tensorList, acl_dst, acl_dim);
|
||||
|
||||
ACL_CHECK(aclDestroyTensorList(tensorList));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
@@ -1437,10 +1444,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src0 = dst->src[0]; // kernel
|
||||
ggml_tensor* src1 = dst->src[1]; // input
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
|
||||
// aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
|
||||
@@ -1462,9 +1465,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
const int64_t OH = is_2D ? ne2 : 1;
|
||||
const int64_t OW = ne1;
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
// memory allocated increased to 3x when is_2D == false
|
||||
const int64_t n_bytes_factor = is_2D ? 1 : 3;
|
||||
|
||||
@@ -2425,7 +2425,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
|
||||
aclTensor* acl_weight, aclTensor* acl_dst) {
|
||||
int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is
|
||||
// fp32, atlas a2 will transpose it to HFLOAT32.
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
@@ -2443,6 +2442,80 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
|
||||
aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs matrix multiplication of two 2D tensors.
|
||||
*
|
||||
* This function computes the matrix multiplication of the input tensor
|
||||
* `acl_input` and the weight tensor `acl_weight`, and stores the result in the
|
||||
* destination tensor `acl_dst`.
|
||||
* The operation is defined as:
|
||||
* \f[
|
||||
* \text {acl_dst}=\text {acl_input@acl_weight}
|
||||
* \f]
|
||||
*
|
||||
* @param ctx The context for the CANN backend operations.
|
||||
* @param acl_input The input tensor for the matrix multiplication.
|
||||
* @param acl_weight The weight tensor for the matrix multiplication.
|
||||
* @param acl_dst The destination tensor where the result of the matrix
|
||||
* multiplication will be stored.
|
||||
*/
|
||||
static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
|
||||
aclTensor* acl_weight, aclTensor* acl_dst) {
|
||||
int8_t cube_math_type = 2;
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
|
||||
ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
|
||||
cube_math_type, &workspaceSize,
|
||||
&executor));
|
||||
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
ACL_CHECK(
|
||||
aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs matrix multiplication of two 3D tensors.
|
||||
*
|
||||
* This function computes the matrix multiplication of the input tensor
|
||||
* `acl_input` and the weight tensor `acl_weight`, and stores the result in the
|
||||
* destination tensor `acl_dst`.
|
||||
* The operation is defined as:
|
||||
* \f[
|
||||
* \text {acl_dst}=\text {acl_input@acl_weight}
|
||||
* \f]
|
||||
*
|
||||
* @param ctx The context for the CANN backend operations.
|
||||
* @param acl_input The input tensor for the matrix multiplication.
|
||||
* @param acl_weight The weight tensor for the matrix multiplication.
|
||||
* @param acl_dst The destination tensor where the result of the matrix
|
||||
* multiplication will be stored.
|
||||
*/
|
||||
static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
|
||||
aclTensor* acl_weight, aclTensor* acl_dst) {
|
||||
int8_t cube_math_type = 2;
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
|
||||
ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
|
||||
cube_math_type, &workspaceSize,
|
||||
&executor));
|
||||
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
ACL_CHECK(
|
||||
aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs matrix multiplication with floating-point precision on
|
||||
* tensors using the CANN backend.
|
||||
@@ -2464,20 +2537,43 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
|
||||
// broadcast, when weight ne2 or ne3 is not 1, weight need repeat.
|
||||
BCAST_MUL_MAT_SHAPE(input, weight, dst);
|
||||
|
||||
// transpose weight: [1,2,3,4] -> [1,2,4,3]
|
||||
int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
|
||||
bcast_weight_ne[2], bcast_weight_ne[3],
|
||||
bcast_weight_ne[4], bcast_weight_ne[5]};
|
||||
size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
|
||||
bcast_weight_nb[2], bcast_weight_nb[3],
|
||||
bcast_weight_nb[4], bcast_weight_nb[5]};
|
||||
int64_t n_dims = bcast_dims;
|
||||
if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
|
||||
if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
|
||||
n_dims = 2;
|
||||
} else if (bcast_input_ne[2] == 1) {
|
||||
n_dims = 3;
|
||||
}
|
||||
}
|
||||
|
||||
aclTensor* acl_weight_tensor =
|
||||
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, bcast_dims);
|
||||
aclTensor* acl_input_tensor =
|
||||
ggml_cann_create_tensor(input, BCAST_MUL_MAT_PARAM(input));
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst, BCAST_MUL_MAT_PARAM(dst));
|
||||
aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||
ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
|
||||
int64_t transpose_ne[] = {
|
||||
bcast_weight_ne[1], bcast_weight_ne[0],
|
||||
bcast_weight_ne[2], bcast_weight_ne[3],
|
||||
bcast_weight_ne[4], bcast_weight_ne[5]
|
||||
};
|
||||
size_t transpose_nb[] = {
|
||||
bcast_weight_nb[1], bcast_weight_nb[0],
|
||||
bcast_weight_nb[2], bcast_weight_nb[3],
|
||||
bcast_weight_nb[4], bcast_weight_nb[5]
|
||||
};
|
||||
aclTensor* acl_weight_tensor =
|
||||
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
|
||||
aclTensor* acl_dst =
|
||||
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
|
||||
|
||||
switch (n_dims) {
|
||||
case 2:
|
||||
aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||
break;
|
||||
case 3:
|
||||
aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||
break;
|
||||
default:
|
||||
aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
|
||||
break;
|
||||
}
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||
@@ -2503,46 +2599,40 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
ggml_tensor* src0 = dst->src[0]; // weight
|
||||
ggml_tensor* src1 = dst->src[1]; // input
|
||||
|
||||
// The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
|
||||
// is regarded as batch. weight need transpose.
|
||||
int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
|
||||
// The shape of the weight is NCHW.
|
||||
// Matrix multiplication uses HW dims.
|
||||
// HC is regarded as batch.
|
||||
// weight need transpose.
|
||||
float weight_elem_size;
|
||||
if (type == GGML_TYPE_Q4_0) {
|
||||
weight_elem_size = float(sizeof(uint8_t)) / 2;
|
||||
}
|
||||
else if (type == GGML_TYPE_Q8_0) {
|
||||
} else if (type == GGML_TYPE_Q8_0) {
|
||||
weight_elem_size = float(sizeof(uint8_t));
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
|
||||
}
|
||||
float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
|
||||
|
||||
// size of one matrix is element_size * height * width.
|
||||
size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
|
||||
float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
|
||||
size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
|
||||
size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
|
||||
|
||||
// scale stored at the end of weight. Also need transpose.
|
||||
GGML_ASSERT(QK4_0 == QK8_0);
|
||||
int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
|
||||
size_t scale_elem_size = sizeof(uint16_t);
|
||||
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
|
||||
scale_elem_size};
|
||||
size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0;
|
||||
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size};
|
||||
size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
|
||||
char* scale_offset = (char*)src0->data + weight_size;
|
||||
|
||||
// input
|
||||
void* input_buffer;
|
||||
size_t input_elem_size = sizeof(uint16_t);
|
||||
int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
|
||||
size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
|
||||
size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];
|
||||
|
||||
size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
|
||||
size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
|
||||
ggml_cann_pool_alloc input_alloctor(ctx.pool());
|
||||
void* input_buffer = src1->data;
|
||||
|
||||
// case in
|
||||
if (src1->type != GGML_TYPE_F16) {
|
||||
aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
|
||||
input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
|
||||
input_buffer = input_alloctor.get();
|
||||
input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
|
||||
|
||||
int64_t* input_cast_ne = src1->ne;
|
||||
size_t input_cast_nb[GGML_MAX_DIMS];
|
||||
@@ -2552,88 +2642,139 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
}
|
||||
|
||||
aclTensor* acl_input_tensor = ggml_cann_create_tensor(
|
||||
input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
|
||||
input_cast_nb, GGML_MAX_DIMS);
|
||||
input_buffer,
|
||||
ACL_FLOAT16,
|
||||
input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
|
||||
aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
|
||||
} else {
|
||||
input_buffer = src1->data;
|
||||
}
|
||||
|
||||
// output
|
||||
size_t output_elem_size = sizeof(uint16_t);
|
||||
int64_t output_ne[] = {dst->ne[0], dst->ne[1]};
|
||||
size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]};
|
||||
ggml_cann_pool_alloc output_alloctor(
|
||||
ctx.pool(), ggml_nelements(dst) * output_elem_size);
|
||||
void* output_buffer = output_alloctor.get();
|
||||
size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1];
|
||||
size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
|
||||
ggml_cann_pool_alloc output_allocator(ctx.pool());
|
||||
void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
|
||||
size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
|
||||
|
||||
// aclnn
|
||||
int64_t max_elem_size = 65535;
|
||||
int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool());
|
||||
aclOpExecutor* executor = nullptr;
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
|
||||
for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
|
||||
for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
|
||||
int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
|
||||
int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
|
||||
|
||||
int64_t batch1 = n1 * src1->ne[2] + c1;
|
||||
int64_t batch0 = n0 * src0->ne[2] + c0;
|
||||
int64_t batch1 = (n1 * src1->ne[2]) + c1;
|
||||
int64_t batch0 = (n0 * src0->ne[2]) + c0;
|
||||
|
||||
aclTensor* acl_input_tensor = ggml_cann_create_tensor(
|
||||
(char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
|
||||
input_elem_size, input_ne, input_nb, 2);
|
||||
|
||||
// first split
|
||||
int64_t weight_ne_offset = 0;
|
||||
int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]};
|
||||
int64_t scale_ne_offset = 0;
|
||||
int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
|
||||
int64_t output_ne_offset = 0;
|
||||
int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
|
||||
|
||||
aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
|
||||
(char*)src0->data + batch0 * weight_stride,
|
||||
ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
|
||||
weight_nb, 2);
|
||||
ggml_cann_type_mapping(type),
|
||||
weight_elem_size, weight_ne, weight_nb, 2,
|
||||
ACL_FORMAT_ND, weight_ne_offset);
|
||||
aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
|
||||
scale_offset + batch0 * scale_stride, ACL_FLOAT16,
|
||||
scale_elem_size, scale_ne, scale_nb, 2);
|
||||
scale_offset + batch0 * scale_stride,
|
||||
ACL_FLOAT16,
|
||||
scale_elem_size, scale_ne, scale_nb, 2,
|
||||
ACL_FORMAT_ND, scale_ne_offset);
|
||||
aclTensor* acl_output_tensor = ggml_cann_create_tensor(
|
||||
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
|
||||
output_elem_size, output_ne, output_nb, 2);
|
||||
(char*)output_buffer + batch1 * output_stride,
|
||||
ACL_FLOAT16,
|
||||
output_elem_size, output_ne, output_nb, 2,
|
||||
ACL_FORMAT_ND, output_ne_offset);
|
||||
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
|
||||
nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
|
||||
&workspaceSize, &executor));
|
||||
|
||||
if (workspaceSize > 0 && workspaceAddr == nullptr) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
|
||||
workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
|
||||
nullptr, nullptr, nullptr, nullptr, QK8_0,
|
||||
acl_output_tensor, &workspaceSize, &executor));
|
||||
if (workspaceAddr == nullptr) {
|
||||
workspaceAddr = workspace_allocator.alloc(workspaceSize);
|
||||
}
|
||||
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
|
||||
workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
|
||||
|
||||
// other splits
|
||||
for (int64_t split = 1; split < split_size; split++) {
|
||||
weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
|
||||
weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
|
||||
scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
|
||||
scale_ne[0] = weight_ne[0];
|
||||
output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
|
||||
output_ne[0] = weight_ne[0];
|
||||
|
||||
acl_weight_tensor = ggml_cann_create_tensor(
|
||||
(char*)src0->data + batch0 * weight_stride,
|
||||
ggml_cann_type_mapping(type),
|
||||
weight_elem_size, weight_ne, weight_nb, 2,
|
||||
ACL_FORMAT_ND, weight_ne_offset);
|
||||
acl_scale_tensor = ggml_cann_create_tensor(
|
||||
scale_offset + batch0 * scale_stride,
|
||||
ACL_FLOAT16,
|
||||
scale_elem_size, scale_ne, scale_nb, 2,
|
||||
ACL_FORMAT_ND, scale_ne_offset);
|
||||
acl_output_tensor = ggml_cann_create_tensor(
|
||||
(char*)output_buffer + batch1 * output_stride,
|
||||
ACL_FLOAT16,
|
||||
output_elem_size, output_ne, output_nb, 2,
|
||||
ACL_FORMAT_ND, output_ne_offset);
|
||||
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
|
||||
nullptr, nullptr, nullptr, nullptr, QK8_0,
|
||||
acl_output_tensor, &workspaceSize, &executor));
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
|
||||
workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
|
||||
}
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||
}
|
||||
}
|
||||
|
||||
// cast out
|
||||
int64_t* output_cast_ne = dst->ne;
|
||||
size_t output_cast_nb[GGML_MAX_DIMS];
|
||||
output_cast_nb[0] = sizeof(uint16_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
|
||||
if (dst->type != GGML_TYPE_F16) {
|
||||
int64_t* output_cast_ne = dst->ne;
|
||||
size_t output_cast_nb[GGML_MAX_DIMS];
|
||||
output_cast_nb[0] = sizeof(uint16_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
|
||||
}
|
||||
|
||||
aclTensor* acl_output_tensor = ggml_cann_create_tensor(
|
||||
output_buffer,
|
||||
ACL_FLOAT16,
|
||||
output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
|
||||
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
|
||||
aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
|
||||
}
|
||||
|
||||
aclTensor* acl_output_tensor =
|
||||
ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size,
|
||||
output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
|
||||
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
|
||||
aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
|
||||
}
|
||||
|
||||
void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
@@ -2859,15 +3000,27 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
|
||||
ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
|
||||
const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
|
||||
int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
|
||||
aclOpExecutor** executor);
|
||||
aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
|
||||
uint64_t workspaceSize,
|
||||
aclOpExecutor* executor,
|
||||
aclrtStream stream);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
// TODO: use ascendc
|
||||
// Only test with LLAMA model.
|
||||
ggml_tensor* src0 = dst->src[0]; // input
|
||||
ggml_tensor* src2 = dst->src[2]; // freq_factors
|
||||
|
||||
// TODO: with freq_factors
|
||||
GGML_ASSERT(src2 == NULL);
|
||||
|
||||
// param
|
||||
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
||||
// const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
@@ -2885,13 +3038,19 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
|
||||
memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
|
||||
|
||||
GGML_ASSERT(n_dims <= ne0);
|
||||
// TODO: with freq_factors
|
||||
GGML_ASSERT(src2 == NULL);
|
||||
// TODO: attn_factor != 1
|
||||
GGML_ASSERT(attn_factor == 1);
|
||||
// TODO: n_dims <= ne0
|
||||
GGML_ASSERT(n_dims == ne0);
|
||||
GGML_ASSERT(n_dims % 2 == 0);
|
||||
|
||||
// TODO: ext_factor != 0
|
||||
GGML_ASSERT(ext_factor == 0);
|
||||
// TODO: freq_scale != 1
|
||||
GGML_ASSERT(freq_scale == 1);
|
||||
// TODO: type == GGML_TYPE_F16
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
||||
|
||||
@@ -2924,177 +3083,30 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
|
||||
theta_scale, is_neox);
|
||||
|
||||
// roll input
|
||||
void* input_roll_buffer;
|
||||
aclTensor* acl_minus_one_tensor;
|
||||
void* minus_one_scale_buffer = nullptr;
|
||||
ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
|
||||
ggml_cann_pool_alloc minus_one_scale_allocator(
|
||||
ctx.pool(), sizeof(float_t) * src0->ne[0]);
|
||||
if (!is_neox) {
|
||||
// roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
|
||||
input_roll_buffer = roll_allocator.get();
|
||||
int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
|
||||
src0->ne[2], src0->ne[3]};
|
||||
size_t input_roll_nb[GGML_MAX_DIMS];
|
||||
input_roll_nb[0] = ggml_type_size(src0->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
|
||||
}
|
||||
aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
|
||||
input_roll_buffer, ggml_cann_type_mapping(src0->type),
|
||||
ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
|
||||
GGML_MAX_DIMS);
|
||||
aclTensor* acl_input_tensor = ggml_cann_create_tensor(
|
||||
src0->data, ggml_cann_type_mapping(src0->type),
|
||||
ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
|
||||
GGML_MAX_DIMS);
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
|
||||
int64_t shifts[] = {1};
|
||||
int64_t dims[] = {3};
|
||||
aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||
void* workspaceAddr = nullptr;
|
||||
|
||||
// init [-1, 1, -1, 1, ...]
|
||||
minus_one_scale_buffer = minus_one_scale_allocator.get();
|
||||
|
||||
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
|
||||
size_t minus_one_nb[GGML_MAX_DIMS];
|
||||
minus_one_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
|
||||
}
|
||||
acl_minus_one_tensor = aclnn_ones(
|
||||
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
|
||||
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
|
||||
int64_t dim = 3;
|
||||
int64_t* index = new int64_t[src0->ne[0]];
|
||||
for (int i = 0; i < src0->ne[0]; i++) {
|
||||
index[i] = i / 2 * 2;
|
||||
}
|
||||
int64_t index_num = src0->ne[0];
|
||||
float value = -1;
|
||||
aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
|
||||
index_num, value);
|
||||
} else {
|
||||
// roll input: [q0,q1,q2,...] ->
|
||||
// [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
|
||||
input_roll_buffer = roll_allocator.get();
|
||||
aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
|
||||
input_roll_buffer, ggml_cann_type_mapping(src0->type),
|
||||
ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
|
||||
aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
|
||||
|
||||
int64_t shifts[] = {src0->ne[0] / 2};
|
||||
int64_t dims[] = {3};
|
||||
aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
|
||||
|
||||
// init [-1, -1, -1, 1, 1,1,...]
|
||||
minus_one_scale_buffer = minus_one_scale_allocator.get();
|
||||
|
||||
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
|
||||
size_t minus_one_nb[GGML_MAX_DIMS];
|
||||
minus_one_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
|
||||
}
|
||||
acl_minus_one_tensor = aclnn_ones(
|
||||
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
|
||||
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
|
||||
// -1 * first half
|
||||
int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
|
||||
size_t first_half_nb[GGML_MAX_DIMS];
|
||||
first_half_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
|
||||
}
|
||||
aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
|
||||
minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
|
||||
first_half_nb, GGML_MAX_DIMS);
|
||||
bool inplace = true;
|
||||
float scale = -1;
|
||||
aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
|
||||
ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
|
||||
int acl_mode = mode;
|
||||
if (mode == 0) {
|
||||
acl_mode = 1;
|
||||
}
|
||||
|
||||
// TODO: n_dims < ne0
|
||||
GGML_ASSERT(n_dims == src0->ne[0]);
|
||||
|
||||
// input * scale
|
||||
ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
|
||||
ggml_nbytes(src0));
|
||||
void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
|
||||
size_t input_nb[GGML_MAX_DIMS];
|
||||
input_nb[0] = ggml_type_size(src0->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
|
||||
input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
|
||||
ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
|
||||
aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
|
||||
input_roll_buffer, ggml_cann_type_mapping(src0->type),
|
||||
ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
|
||||
|
||||
aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
|
||||
acl_input_roll_mul_scale_tensor);
|
||||
|
||||
// output
|
||||
aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_x = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
void* output_fp32_buffer;
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor);
|
||||
aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
|
||||
acl_sin_reshape_tensor);
|
||||
aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst);
|
||||
// TODO: ne0 != n_dims in mode2
|
||||
} else if (src0->type == GGML_TYPE_F16) {
|
||||
size_t input_fp32_nb[GGML_MAX_DIMS];
|
||||
input_fp32_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
|
||||
}
|
||||
ggml_cann_pool_alloc fp32_allocator1(
|
||||
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
|
||||
void* input_fp32_buffer1 = fp32_allocator1.get();
|
||||
aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
|
||||
input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
|
||||
input_fp32_nb, GGML_MAX_DIMS);
|
||||
ggml_cann_pool_alloc fp32_allocator2(
|
||||
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
|
||||
void* input_fp32_buffer2 = fp32_allocator2.get();
|
||||
aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
|
||||
input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
|
||||
input_fp32_nb, GGML_MAX_DIMS);
|
||||
|
||||
ggml_cann_pool_alloc fp32_allocator(
|
||||
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
|
||||
output_fp32_buffer = fp32_allocator.get();
|
||||
aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
|
||||
output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
|
||||
input_fp32_nb, GGML_MAX_DIMS);
|
||||
aclnn_mul(ctx, acl_src0, acl_cos_reshape_tensor, input_fp32_tensor1);
|
||||
aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
|
||||
input_fp32_tensor2);
|
||||
aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
|
||||
output_fp32_tensor);
|
||||
aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
|
||||
ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
|
||||
ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
|
||||
ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
|
||||
acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor));
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
|
||||
ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
|
||||
executor, ctx.stream()));
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_x));
|
||||
ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_src0));
|
||||
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
}
|
||||
|
||||
@@ -211,17 +211,20 @@ struct ggml_cann_pool_alloc {
|
||||
struct ggml_backend_cann_context {
|
||||
int32_t device; /**< Device ID. */
|
||||
std::string name; /**< Name of the device. */
|
||||
std::string description; /**< Description of the device. */
|
||||
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
|
||||
|
||||
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {
|
||||
{nullptr}}; /**< Array of streams for the device. */
|
||||
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
|
||||
|
||||
/**
|
||||
* @brief Constructor for initializing the context with a given device.
|
||||
* @param device Device ID.
|
||||
*/
|
||||
explicit ggml_backend_cann_context(int device)
|
||||
: device(device), name("CANN" + std::to_string(device)) {}
|
||||
: device(device), name("CANN" + std::to_string(device)) {
|
||||
ggml_cann_set_device(device);
|
||||
description = aclrtGetSocName();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Destructor for cleaning up resources.
|
||||
|
||||
@@ -122,6 +122,10 @@ static ggml_cann_device_info ggml_cann_init() {
|
||||
ACL_CHECK(aclrtMemGetAllocationGranularity(
|
||||
&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
|
||||
&info.devices[id].vmm_granularity));
|
||||
|
||||
size_t free, total;
|
||||
ggml_backend_cann_get_device_memory(id, &free, &total);
|
||||
info.devices[id].total_vram = free;
|
||||
}
|
||||
|
||||
// TODO: add more device info later.
|
||||
@@ -208,6 +212,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
|
||||
* @return A pointer to the allocated buffer.
|
||||
*/
|
||||
void* alloc(size_t size, size_t* actual_size) override {
|
||||
const size_t alignment = 128;
|
||||
size = GGML_PAD(size, alignment);
|
||||
if (size == 0) {
|
||||
size = alignment;
|
||||
}
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
int nnz = 0;
|
||||
size_t max_size = 0;
|
||||
@@ -246,13 +255,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
|
||||
return ptr;
|
||||
}
|
||||
void* ptr;
|
||||
size_t look_ahead_size = (size_t)(1.05 * size);
|
||||
look_ahead_size = 256 * ((look_ahead_size + 255) / 256);
|
||||
ggml_cann_set_device(device);
|
||||
ACL_CHECK(
|
||||
aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
*actual_size = look_ahead_size;
|
||||
pool_size += look_ahead_size;
|
||||
aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
|
||||
*actual_size = size;
|
||||
pool_size += size;
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
GGML_LOG_INFO(
|
||||
"%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
|
||||
@@ -296,7 +303,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
||||
/**
|
||||
* @brief The maximum size of the virtual memory pool (32 GB).
|
||||
*/
|
||||
static const size_t CANN_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
||||
size_t max_size;
|
||||
|
||||
/**
|
||||
* @brief The device ID associated with this buffer pool.
|
||||
@@ -341,7 +348,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
||||
*/
|
||||
explicit ggml_cann_pool_vmm(int device)
|
||||
: device(device),
|
||||
granularity(ggml_cann_info().devices[device].vmm_granularity) {}
|
||||
granularity(ggml_cann_info().devices[device].vmm_granularity) {
|
||||
auto dev = ggml_cann_info().devices[device];
|
||||
granularity = dev.vmm_granularity;
|
||||
max_size = dev.total_vram;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Destructor to free all buffers in the virtual memory pool.
|
||||
@@ -370,17 +381,19 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
||||
// round up the allocation size to the alignment to ensure that all
|
||||
// allocations are aligned for all data types
|
||||
const size_t alignment = 128;
|
||||
size = alignment * ((size + alignment - 1) / alignment);
|
||||
size = GGML_PAD(size, alignment);
|
||||
if (size == 0) {
|
||||
size = alignment;
|
||||
}
|
||||
|
||||
size_t avail = pool_size - pool_used;
|
||||
|
||||
if (size > avail) {
|
||||
// round up to the next multiple of the granularity
|
||||
size_t reserve_size = size - avail;
|
||||
reserve_size =
|
||||
granularity * ((reserve_size + granularity - 1) / granularity);
|
||||
reserve_size = GGML_PAD(reserve_size, granularity);
|
||||
|
||||
GGML_ASSERT(pool_size + reserve_size <= CANN_POOL_VMM_MAX_SIZE);
|
||||
GGML_ASSERT(pool_size + reserve_size <= max_size);
|
||||
|
||||
// allocate more physical memory
|
||||
aclrtPhysicalMemProp prop = {};
|
||||
@@ -396,7 +409,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
||||
// reserve virtual address space (if not already reserved)
|
||||
if (pool_addr == 0) {
|
||||
ACL_CHECK(aclrtReserveMemAddress(
|
||||
&pool_addr, CANN_POOL_VMM_MAX_SIZE, 0, NULL, 1));
|
||||
&pool_addr, max_size, 0, NULL, 1));
|
||||
}
|
||||
|
||||
// map at the end of the pool
|
||||
@@ -409,10 +422,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
||||
// add to the pool
|
||||
pool_size += reserve_size;
|
||||
|
||||
// GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
|
||||
// reserved %llu MB)\n",
|
||||
// device, (unsigned long long) (pool_size/1024/1024),
|
||||
// (unsigned long long) (reserve_size/1024/1024));
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
|
||||
device, (unsigned long long) (pool_size/1024/1024),
|
||||
(unsigned long long) (reserve_size/1024/1024));
|
||||
#endif
|
||||
}
|
||||
|
||||
GGML_ASSERT(pool_addr != 0);
|
||||
@@ -457,7 +471,6 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
||||
*/
|
||||
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
|
||||
int device) {
|
||||
// return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_leg(device));
|
||||
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
|
||||
}
|
||||
|
||||
@@ -1130,10 +1143,10 @@ ggml_backend_cann_buffer_type(int32_t device) {
|
||||
static bool ggml_backend_cann_buffer_type_initialized = false;
|
||||
|
||||
if (!ggml_backend_cann_buffer_type_initialized) {
|
||||
for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
|
||||
for (int32_t i = 0; i < ggml_cann_info().device_count; i++) {
|
||||
ggml_backend_cann_buffer_types[i] = {
|
||||
/* .iface = */ ggml_backend_cann_buffer_type_interface,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
|
||||
/* .context = */
|
||||
new ggml_backend_cann_buffer_type_context{
|
||||
i, "CANN" + std::to_string(i)},
|
||||
@@ -1199,10 +1212,15 @@ static void * ggml_cann_host_malloc(size_t size) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
const size_t alignment = 128;
|
||||
size = GGML_PAD(size, alignment);
|
||||
if (size == 0) {
|
||||
size = alignment;
|
||||
}
|
||||
|
||||
void * hostPtr = nullptr;
|
||||
aclError err = aclrtMallocHost((void **) &hostPtr, size);
|
||||
if (err != ACL_SUCCESS) {
|
||||
|
||||
GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
||||
size / 1024.0 / 1024.0, aclGetRecentErrMsg());
|
||||
return nullptr;
|
||||
@@ -1669,12 +1687,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
case GGML_OP_MUL_MAT: {
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_Q8_0:
|
||||
// Current groupsize should not be greater than k-1 in
|
||||
// aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
|
||||
if (op->src[0]->ne[0] <= QK8_0) {
|
||||
return false;
|
||||
}
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_Q8_0:
|
||||
// TODO: fix me
|
||||
// Current groupsize should not be greater than k-1 in
|
||||
// aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
|
||||
case GGML_TYPE_Q4_0:
|
||||
return true;
|
||||
default:
|
||||
@@ -1706,9 +1726,61 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
return false;
|
||||
}
|
||||
}
|
||||
case GGML_OP_CONT: {
|
||||
// TODO: support GGML_TYPE_BF16
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
case GGML_OP_ROPE: {
|
||||
// TODO: with ops-test v == 1
|
||||
float * freq_scale = (float*)((int32_t*)op->op_params + 6);
|
||||
float * ext_factor = (float*)((int32_t*)op->op_params + 7);
|
||||
float * attn_factor = (float*)((int32_t*)op->op_params + 8);
|
||||
// TODO: with freq_factors
|
||||
if (op->src[2] != NULL) {
|
||||
return false;
|
||||
}
|
||||
// TODO: n_dims <= ne0
|
||||
if (op->src[0]->ne[0] != op->op_params[1]) {
|
||||
return false;
|
||||
}
|
||||
// TODO: ext_factor != 0
|
||||
if (*ext_factor != 0) {
|
||||
return false;
|
||||
}
|
||||
// TODO: freq_scale != 1
|
||||
if (*freq_scale != 1) {
|
||||
return false;
|
||||
}
|
||||
// TODO: attn_factor != 1
|
||||
if (*attn_factor != 1) {
|
||||
return false;
|
||||
}
|
||||
//TODO: type == GGML_TYPE_F16
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
case GGML_OP_UPSCALE: {
|
||||
// aclnnUpsampleNearest2dGetWorkspaceSize not support
|
||||
// selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
|
||||
if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
@@ -1722,17 +1794,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_CONT:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
@@ -2064,16 +2132,17 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
|
||||
dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
|
||||
ggml_cann_set_device(i);
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .interface = */ ggml_backend_cann_device_interface,
|
||||
/* .reg = */ ®,
|
||||
/* .context = */ dev_ctx
|
||||
/* .iface = */ ggml_backend_cann_device_interface,
|
||||
/* .reg = */ ®,
|
||||
/* .context = */ dev_ctx
|
||||
};
|
||||
ctx->devices.push_back(dev);
|
||||
}
|
||||
|
||||
reg = ggml_backend_reg {
|
||||
/* .interface = */ ggml_backend_cann_reg_interface,
|
||||
/* .context = */ ctx
|
||||
/* .api_version = */ GGML_BACKEND_API_VERSION,
|
||||
/* .iface = */ ggml_backend_cann_reg_interface,
|
||||
/* .context = */ ctx
|
||||
};
|
||||
}
|
||||
|
||||
@@ -2126,3 +2195,5 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
|
||||
ggml_cann_set_device(device);
|
||||
ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg)
|
||||
|
||||
@@ -1,14 +1,13 @@
|
||||
add_library(ggml-cpu
|
||||
ggml-cpu.c
|
||||
ggml-cpu.cpp
|
||||
ggml-cpu-aarch64.c
|
||||
ggml-cpu-aarch64.h
|
||||
ggml-cpu-quants.c
|
||||
ggml-cpu-quants.h
|
||||
)
|
||||
ggml_add_backend_library(ggml-cpu
|
||||
ggml-cpu.c
|
||||
ggml-cpu.cpp
|
||||
ggml-cpu-aarch64.c
|
||||
ggml-cpu-aarch64.h
|
||||
ggml-cpu-quants.c
|
||||
ggml-cpu-quants.h
|
||||
)
|
||||
|
||||
target_link_libraries(ggml-cpu PRIVATE ggml-base)
|
||||
target_include_directories(ggml-cpu PRIVATE . ..)
|
||||
target_include_directories(ggml-cpu PRIVATE .)
|
||||
|
||||
if (APPLE AND GGML_ACCELERATE)
|
||||
find_library(ACCELERATE_FRAMEWORK Accelerate)
|
||||
|
||||
@@ -13578,29 +13578,6 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
|
||||
|
||||
#endif // GGML_USE_OPENMP
|
||||
|
||||
void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
|
||||
p->n_threads = n_threads;
|
||||
p->prio = 0; // default priority (usually means normal or inherited)
|
||||
p->poll = 50; // hybrid-polling enabled
|
||||
p->strict_cpu = false; // no strict placement (all threads share same cpumask)
|
||||
p->paused = false; // threads are ready to go
|
||||
memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
|
||||
}
|
||||
|
||||
struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
|
||||
struct ggml_threadpool_params p;
|
||||
ggml_threadpool_params_init(&p, n_threads);
|
||||
return p;
|
||||
}
|
||||
|
||||
bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
|
||||
if (p0->n_threads != p1->n_threads ) return false;
|
||||
if (p0->prio != p1->prio ) return false;
|
||||
if (p0->poll != p1->poll ) return false;
|
||||
if (p0->strict_cpu != p1->strict_cpu ) return false;
|
||||
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
|
||||
}
|
||||
|
||||
static struct ggml_threadpool * ggml_threadpool_new_impl(
|
||||
struct ggml_threadpool_params * tpp,
|
||||
struct ggml_cgraph * cgraph,
|
||||
@@ -13896,7 +13873,7 @@ int ggml_cpu_has_vsx(void) {
|
||||
}
|
||||
|
||||
int ggml_cpu_has_neon(void) {
|
||||
#if defined(__ARM_ARCH)
|
||||
#if defined(__ARM_ARCH) && defined(__ARM_NEON)
|
||||
return ggml_arm_arch_features.has_neon;
|
||||
#else
|
||||
return 0;
|
||||
@@ -13904,7 +13881,7 @@ int ggml_cpu_has_neon(void) {
|
||||
}
|
||||
|
||||
int ggml_cpu_has_sve(void) {
|
||||
#if defined(__ARM_ARCH)
|
||||
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
|
||||
return ggml_arm_arch_features.has_sve;
|
||||
#else
|
||||
return 0;
|
||||
@@ -13912,7 +13889,7 @@ int ggml_cpu_has_sve(void) {
|
||||
}
|
||||
|
||||
int ggml_cpu_has_matmul_int8(void) {
|
||||
#if defined(__ARM_ARCH)
|
||||
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
return ggml_arm_arch_features.has_i8mm;
|
||||
#else
|
||||
return 0;
|
||||
@@ -13920,7 +13897,7 @@ int ggml_cpu_has_matmul_int8(void) {
|
||||
}
|
||||
|
||||
int ggml_cpu_get_sve_cnt(void) {
|
||||
#if defined(__ARM_ARCH)
|
||||
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
|
||||
return ggml_arm_arch_features.sve_cnt;
|
||||
#else
|
||||
return 0;
|
||||
|
||||
@@ -541,16 +541,12 @@ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg
|
||||
return &ggml_backend_cpu_device;
|
||||
}
|
||||
|
||||
struct ggml_backend_feature {
|
||||
const char * name;
|
||||
const char * value;
|
||||
};
|
||||
|
||||
// Not used yet
|
||||
// This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically,
|
||||
// and additionally to allow other backends to expose their own list of features that applications can query using the same API.
|
||||
// and additionally to allow other backends to expose their own list of features that applications can query using the same API
|
||||
static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
|
||||
static std::vector<ggml_backend_feature> features = []() {
|
||||
ggml_cpu_init();
|
||||
|
||||
std::vector<ggml_backend_feature> features;
|
||||
if (ggml_cpu_has_sse3()) {
|
||||
features.push_back({ "SSE3", "1" });
|
||||
@@ -561,6 +557,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
|
||||
if (ggml_cpu_has_avx()) {
|
||||
features.push_back({ "AVX", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx_vnni()) {
|
||||
features.push_back({ "AVX_VNNI", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx2()) {
|
||||
features.push_back({ "AVX2", "1" });
|
||||
}
|
||||
@@ -570,9 +569,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
|
||||
if (ggml_cpu_has_fma()) {
|
||||
features.push_back({ "FMA", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx_vnni()) {
|
||||
features.push_back({ "AVX_VNNI", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx512()) {
|
||||
features.push_back({ "AVX512", "1" });
|
||||
}
|
||||
@@ -619,6 +615,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
|
||||
if (ggml_cpu_has_llamafile()) {
|
||||
features.push_back({ "LLAMAFILE", "1" });
|
||||
}
|
||||
// TODO: rename this
|
||||
#ifdef GGML_USE_CPU_AARCH64
|
||||
features.push_back({ "AARCH64_REPACK", "1" });
|
||||
#endif
|
||||
|
||||
features.push_back({ nullptr, nullptr });
|
||||
|
||||
@@ -637,6 +637,29 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
|
||||
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
|
||||
return (void *)ggml_backend_cpu_get_extra_bufts;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_get_features") == 0) {
|
||||
return (void *)ggml_backend_cpu_get_features;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_set_abort_callback") == 0) {
|
||||
return (void *)ggml_backend_cpu_set_abort_callback;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) {
|
||||
return (void *)ggml_numa_init;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
|
||||
return (void *)ggml_is_numa;
|
||||
}
|
||||
|
||||
// threadpool - TODO: move to ggml-base
|
||||
if (strcmp(name, "ggml_threadpool_new") == 0) {
|
||||
return (void *)ggml_threadpool_new;
|
||||
}
|
||||
if (strcmp(name, "ggml_threadpool_free") == 0) {
|
||||
return (void *)ggml_threadpool_free;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) {
|
||||
return (void *)ggml_backend_cpu_set_threadpool;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
|
||||
@@ -655,9 +678,12 @@ ggml_backend_reg_t ggml_backend_cpu_reg(void) {
|
||||
ggml_cpu_init();
|
||||
|
||||
static struct ggml_backend_reg ggml_backend_cpu_reg = {
|
||||
/* .iface = */ ggml_backend_cpu_reg_i,
|
||||
/* .context = */ NULL,
|
||||
/* .api_version = */ GGML_BACKEND_API_VERSION,
|
||||
/* .iface = */ ggml_backend_cpu_reg_i,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_cpu_reg;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg)
|
||||
|
||||
@@ -46,13 +46,10 @@ if (CUDAToolkit_FOUND)
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
endif()
|
||||
|
||||
add_library(ggml-cuda
|
||||
${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_CUDA}
|
||||
)
|
||||
|
||||
target_link_libraries(ggml-cuda PRIVATE ggml-base)
|
||||
target_include_directories(ggml-cuda PRIVATE . ..)
|
||||
ggml_add_backend_library(ggml-cuda
|
||||
${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_CUDA}
|
||||
)
|
||||
|
||||
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
|
||||
|
||||
|
||||
@@ -3126,6 +3126,61 @@ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t re
|
||||
return ctx->devices[index];
|
||||
}
|
||||
|
||||
static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
|
||||
static std::vector<ggml_backend_feature> features = []() {
|
||||
std::vector<ggml_backend_feature> features;
|
||||
#define _STRINGIFY(...) #__VA_ARGS__
|
||||
#define STRINGIFY(...) _STRINGIFY(__VA_ARGS__)
|
||||
|
||||
#ifdef __CUDA_ARCH_LIST__
|
||||
features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) });
|
||||
#endif
|
||||
|
||||
#ifdef GGML_CUDA_FORCE_MMQ
|
||||
features.push_back({ "FORCE_MMQ", "1" });
|
||||
#endif
|
||||
|
||||
#ifdef GGML_CUDA_FORCE_CUBLAS
|
||||
features.push_back({ "FORCE_CUBLAS", "1" });
|
||||
#endif
|
||||
|
||||
#ifdef GGML_CUDA_NO_VMM
|
||||
features.push_back({ "NO_VMM", "1" });
|
||||
#endif
|
||||
|
||||
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||
features.push_back({ "NO_PEER_COPY", "1" });
|
||||
#endif
|
||||
|
||||
#ifdef GGML_CUDA_F16
|
||||
features.push_back({ "F16", "1" });
|
||||
#endif
|
||||
|
||||
#ifdef GGML_CUDA_USE_GRAPHS
|
||||
features.push_back({ "USE_GRAPHS", "1" });
|
||||
#endif
|
||||
|
||||
#ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
|
||||
features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
|
||||
#endif
|
||||
|
||||
#ifdef GGML_CUDA_FA_ALL_QUANTS
|
||||
features.push_back({ "FA_ALL_QUANTS", "1" });
|
||||
#endif
|
||||
|
||||
#undef _STRINGIFY
|
||||
#undef STRINGIFY
|
||||
|
||||
features.push_back({ nullptr, nullptr });
|
||||
|
||||
return features;
|
||||
}();
|
||||
|
||||
return features.data();
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||
GGML_UNUSED(reg);
|
||||
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
|
||||
@@ -3137,6 +3192,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
|
||||
if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
|
||||
return (void *)ggml_backend_cuda_unregister_host_buffer;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_get_features") == 0) {
|
||||
return (void *)ggml_backend_cuda_get_features;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -3169,16 +3227,17 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
dev_ctx->description = prop.name;
|
||||
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .interface = */ ggml_backend_cuda_device_interface,
|
||||
/* .reg = */ ®,
|
||||
/* .context = */ dev_ctx
|
||||
/* .iface = */ ggml_backend_cuda_device_interface,
|
||||
/* .reg = */ ®,
|
||||
/* .context = */ dev_ctx
|
||||
};
|
||||
ctx->devices.push_back(dev);
|
||||
}
|
||||
|
||||
reg = ggml_backend_reg {
|
||||
/* .interface = */ ggml_backend_cuda_reg_interface,
|
||||
/* .context = */ ctx
|
||||
/* .api_version = */ GGML_BACKEND_API_VERSION,
|
||||
/* .iface = */ ggml_backend_cuda_reg_interface,
|
||||
/* .context = */ ctx
|
||||
};
|
||||
}
|
||||
|
||||
@@ -3209,3 +3268,5 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
|
||||
|
||||
return cuda_backend;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)
|
||||
|
||||
@@ -64,12 +64,10 @@ else()
|
||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||
endif()
|
||||
|
||||
add_library(ggml-hip
|
||||
${GGML_HEADERS_ROCM}
|
||||
${GGML_SOURCES_ROCM})
|
||||
|
||||
target_link_libraries(ggml-hip PRIVATE ggml-base)
|
||||
target_include_directories(ggml-hip PRIVATE . ..)
|
||||
ggml_add_backend_library(ggml-hip
|
||||
${GGML_HEADERS_ROCM}
|
||||
${GGML_SOURCES_ROCM}
|
||||
)
|
||||
|
||||
# TODO: do not use CUDA definitions for HIP
|
||||
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
|
||||
|
||||
@@ -6,13 +6,13 @@ if (NOT glslc_executable)
|
||||
message(FATAL_ERROR "glslc not found")
|
||||
endif()
|
||||
|
||||
add_library(ggml-kompute
|
||||
ggml-kompute.cpp
|
||||
../../include/ggml-kompute.h
|
||||
)
|
||||
ggml_add_backend_library(ggml-kompute
|
||||
ggml-kompute.cpp
|
||||
../../include/ggml-kompute.h
|
||||
)
|
||||
|
||||
target_link_libraries(ggml-kompute PRIVATE ggml-base kompute)
|
||||
target_include_directories(ggml-kompute PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR})
|
||||
target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
|
||||
|
||||
|
||||
@@ -2176,9 +2176,12 @@ static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = {
|
||||
|
||||
ggml_backend_reg_t ggml_backend_kompute_reg() {
|
||||
static ggml_backend_reg reg = {
|
||||
/* .iface = */ ggml_backend_kompute_reg_i,
|
||||
/* .context = */ nullptr,
|
||||
/* .api_version = */ GGML_BACKEND_API_VERSION,
|
||||
/* .iface = */ ggml_backend_kompute_reg_i,
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
return ®
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg)
|
||||
|
||||
@@ -4,19 +4,16 @@ find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
|
||||
message(STATUS "Metal framework found")
|
||||
|
||||
add_library(ggml-metal
|
||||
ggml-metal.m
|
||||
)
|
||||
ggml_add_backend_library(ggml-metal
|
||||
ggml-metal.m
|
||||
)
|
||||
|
||||
target_link_libraries(ggml-metal PRIVATE
|
||||
ggml-base
|
||||
${FOUNDATION_LIBRARY}
|
||||
${METAL_FRAMEWORK}
|
||||
${METALKIT_FRAMEWORK}
|
||||
)
|
||||
|
||||
target_include_directories(ggml-metal PRIVATE . ..)
|
||||
|
||||
if (GGML_METAL_NDEBUG)
|
||||
add_compile_definitions(GGML_METAL_NDEBUG)
|
||||
endif()
|
||||
|
||||
@@ -1927,7 +1927,7 @@ static void ggml_metal_encode_node(
|
||||
|
||||
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
|
||||
// to the matrix-vector kernel
|
||||
int ne11_mm_min = 1;
|
||||
int ne11_mm_min = 4;
|
||||
|
||||
#if 0
|
||||
// the numbers below are measured on M2 Ultra for 7B and 13B models
|
||||
@@ -1951,316 +1951,316 @@ static void ggml_metal_encode_node(
|
||||
}
|
||||
#endif
|
||||
|
||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
||||
if ([device supportsFamily:MTLGPUFamilyApple7] &&
|
||||
!ggml_is_transposed(src0) &&
|
||||
!ggml_is_transposed(src1) &&
|
||||
src1t == GGML_TYPE_F32 &&
|
||||
ne00 % 32 == 0 && ne00 >= 64 &&
|
||||
(ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
|
||||
//printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
|
||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
||||
if ([device supportsFamily:MTLGPUFamilyApple7] &&
|
||||
!ggml_is_transposed(src0) &&
|
||||
!ggml_is_transposed(src1) &&
|
||||
src1t == GGML_TYPE_F32 &&
|
||||
ne00 % 32 == 0 && ne00 >= 64 &&
|
||||
(ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
|
||||
//printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
|
||||
|
||||
// some Metal matrix data types require aligned pointers
|
||||
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break;
|
||||
case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break;
|
||||
case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8 == 0); break;
|
||||
default: break;
|
||||
}
|
||||
// some Metal matrix data types require aligned pointers
|
||||
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break;
|
||||
case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break;
|
||||
case GGML_TYPE_BF16: GGML_ASSERT(nb01 % 8 == 0); break;
|
||||
default: break;
|
||||
}
|
||||
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32 ].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32 ].pipeline; break;
|
||||
case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
|
||||
default: GGML_ABORT("MUL MAT-MAT not implemented");
|
||||
}
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32 ].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32 ].pipeline; break;
|
||||
case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_BF16_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q2_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q3_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q4_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q5_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
|
||||
default: GGML_ABORT("MUL MAT-MAT not implemented");
|
||||
}
|
||||
|
||||
ggml_metal_kargs_mul_mm args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne12 =*/ ne12,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.r2 =*/ r2,
|
||||
/*.r3 =*/ r3,
|
||||
};
|
||||
ggml_metal_kargs_mul_mm args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne12 =*/ ne12,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.r2 =*/ r2,
|
||||
/*.r3 =*/ r3,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:0];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:0];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
|
||||
|
||||
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
} else {
|
||||
int nth0 = 32;
|
||||
int nth1 = 1;
|
||||
int nrows = 1;
|
||||
//printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
|
||||
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
} else {
|
||||
int nth0 = 32;
|
||||
int nth1 = 1;
|
||||
int nrows = 1;
|
||||
//printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
|
||||
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
// use custom matrix x vector kernel
|
||||
switch (src0t) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
GGML_ASSERT(src1t == GGML_TYPE_F32);
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline;
|
||||
// use custom matrix x vector kernel
|
||||
switch (src0t) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
GGML_ASSERT(src1t == GGML_TYPE_F32);
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline;
|
||||
nrows = 4;
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
nth0 = 32;
|
||||
nth1 = 1;
|
||||
if (src1t == GGML_TYPE_F32) {
|
||||
if (ne11 * ne12 < 4) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline;
|
||||
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline;
|
||||
nrows = ne11;
|
||||
} else {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline;
|
||||
nrows = 4;
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
nth0 = 32;
|
||||
nth1 = 1;
|
||||
if (src1t == GGML_TYPE_F32) {
|
||||
if (ne11 * ne12 < 4) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline;
|
||||
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline;
|
||||
nrows = ne11;
|
||||
} else {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline;
|
||||
nrows = 4;
|
||||
}
|
||||
} else {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline;
|
||||
nrows = 4;
|
||||
}
|
||||
} break;
|
||||
case GGML_TYPE_BF16:
|
||||
{
|
||||
nth0 = 32;
|
||||
nth1 = 1;
|
||||
if (src1t == GGML_TYPE_F32) {
|
||||
if (ne11 * ne12 < 4) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline;
|
||||
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline;
|
||||
nrows = ne11;
|
||||
} else {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32].pipeline;
|
||||
nrows = 4;
|
||||
}
|
||||
} else {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16].pipeline;
|
||||
nrows = 4;
|
||||
}
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
{
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
{
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
{
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
{
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
{
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
{
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
{
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
{
|
||||
nth0 = 4; //1;
|
||||
nth1 = 8; //32;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
{
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
{
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ1_M:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32].pipeline;
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t);
|
||||
GGML_ABORT("not implemented");
|
||||
}
|
||||
};
|
||||
} else {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline;
|
||||
nrows = 4;
|
||||
}
|
||||
} break;
|
||||
case GGML_TYPE_BF16:
|
||||
{
|
||||
nth0 = 32;
|
||||
nth1 = 1;
|
||||
if (src1t == GGML_TYPE_F32) {
|
||||
if (ne11 * ne12 < 4) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline;
|
||||
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline;
|
||||
nrows = ne11;
|
||||
} else {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32].pipeline;
|
||||
nrows = 4;
|
||||
}
|
||||
} else {
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16].pipeline;
|
||||
nrows = 4;
|
||||
}
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
{
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
{
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
{
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
{
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
{
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
{
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
{
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
{
|
||||
nth0 = 4; //1;
|
||||
nth1 = 8; //32;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
{
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
{
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ3_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ2_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ1_M:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32].pipeline;
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_LOG_ERROR("Asserting on type %d\n", (int)src0t);
|
||||
GGML_ABORT("not implemented");
|
||||
}
|
||||
};
|
||||
|
||||
ggml_metal_kargs_mul_mv args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne10 =*/ ne10,
|
||||
/*.ne11 =*/ ne11,
|
||||
/*.ne12 =*/ ne12,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.r2 =*/ r2,
|
||||
/*.r3 =*/ r3,
|
||||
};
|
||||
ggml_metal_kargs_mul_mv args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne01 =*/ ne01,
|
||||
/*.ne02 =*/ ne02,
|
||||
/*.nb00 =*/ nb00,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.nb02 =*/ nb02,
|
||||
/*.nb03 =*/ nb03,
|
||||
/*.ne10 =*/ ne10,
|
||||
/*.ne11 =*/ ne11,
|
||||
/*.ne12 =*/ ne12,
|
||||
/*.nb10 =*/ nb10,
|
||||
/*.nb11 =*/ nb11,
|
||||
/*.nb12 =*/ nb12,
|
||||
/*.nb13 =*/ nb13,
|
||||
/*.ne0 =*/ ne0,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.r2 =*/ r2,
|
||||
/*.r3 =*/ r3,
|
||||
};
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:0];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBytes:&args length:sizeof(args) atIndex:0];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:2];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 ||
|
||||
src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K ||
|
||||
src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
|
||||
const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
|
||||
const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) {
|
||||
const int mem_size = 32*sizeof(float);
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q3_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q5_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q6_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
} else {
|
||||
const int64_t ny = (ne11 + nrows - 1)/nrows;
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
}
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 ||
|
||||
src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K ||
|
||||
src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
|
||||
const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
|
||||
const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) {
|
||||
const int mem_size = 32*sizeof(float);
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q3_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q5_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q6_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
} else {
|
||||
const int64_t ny = (ne11 + nrows - 1)/nrows;
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
@@ -4372,19 +4372,45 @@ static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t r
|
||||
GGML_UNUSED(index);
|
||||
}
|
||||
|
||||
static struct ggml_backend_feature g_ggml_backend_metal_features[] = {
|
||||
#if defined(GGML_METAL_EMBED_LIBRARY)
|
||||
{ "EMBED_LIBRARY", "1" },
|
||||
#endif
|
||||
#if defined(GGML_METAL_USE_BF16)
|
||||
{ "BF16", "1" },
|
||||
#endif
|
||||
{ nil, nil },
|
||||
};
|
||||
|
||||
static struct ggml_backend_feature * ggml_backend_metal_get_features(ggml_backend_reg_t reg) {
|
||||
return g_ggml_backend_metal_features;
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||
if (strcmp(name, "ggml_backend_get_features") == 0) {
|
||||
return (void *)ggml_backend_metal_get_features;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
static struct ggml_backend_reg_i ggml_backend_metal_reg_i = {
|
||||
/* .get_name = */ ggml_backend_metal_reg_get_name,
|
||||
/* .device_count = */ ggml_backend_metal_reg_device_count,
|
||||
/* .device_get = */ ggml_backend_metal_reg_device_get,
|
||||
/* .get_proc_address = */ NULL,
|
||||
/* .get_proc_address = */ ggml_backend_metal_get_proc_address,
|
||||
};
|
||||
|
||||
ggml_backend_reg_t ggml_backend_metal_reg(void) {
|
||||
// TODO: make this thread-safe somehow?
|
||||
{
|
||||
g_ggml_backend_metal_reg = (struct ggml_backend_reg) {
|
||||
/* .iface = */ ggml_backend_metal_reg_i,
|
||||
/* .context = */ NULL,
|
||||
/* .api_version = */ GGML_BACKEND_API_VERSION,
|
||||
/* .iface = */ ggml_backend_metal_reg_i,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
g_ggml_backend_metal_device = (struct ggml_backend_device) {
|
||||
@@ -4396,3 +4422,5 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) {
|
||||
|
||||
return &g_ggml_backend_metal_reg;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)
|
||||
|
||||
@@ -5447,12 +5447,12 @@ kernel void kernel_mul_mm(
|
||||
const int im = tgpig.z;
|
||||
|
||||
// if this block is of 64x32 shape or smaller
|
||||
short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
|
||||
short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
|
||||
const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
|
||||
const short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
|
||||
|
||||
// a thread shouldn't load data outside of the matrix
|
||||
short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
|
||||
short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
|
||||
const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
|
||||
const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
|
||||
|
||||
simdgroup_T8x8 ma[4];
|
||||
simdgroup_float8x8 mb[2];
|
||||
@@ -5467,20 +5467,23 @@ kernel void kernel_mul_mm(
|
||||
const int i12 = im%args.ne12;
|
||||
const int i13 = im/args.ne12;
|
||||
|
||||
uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
short offset1 = il/nl;
|
||||
const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const short offset1 = il/nl;
|
||||
|
||||
device const block_q * x = (device const block_q *)(src0
|
||||
+ args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;
|
||||
|
||||
device const block_q * x = (device const block_q *)(src0 + (r0*BLOCK_SIZE_M + thread_row)*args.nb01 + offset0) + offset1;
|
||||
device const float * y = (device const float *)(src1
|
||||
+ args.nb13*i13
|
||||
+ args.nb12*i12
|
||||
+ args.nb11*(r1 * BLOCK_SIZE_N + thread_col)
|
||||
+ args.nb11*(r1*BLOCK_SIZE_N + thread_col)
|
||||
+ args.nb10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
|
||||
|
||||
for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
|
||||
// load data and store to threadgroup memory
|
||||
T4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
#pragma unroll(16)
|
||||
@@ -5490,44 +5493,46 @@ kernel void kernel_mul_mm(
|
||||
+ (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4];
|
||||
}
|
||||
|
||||
*(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL)*8*32 + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y);
|
||||
*(threadgroup float2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y);
|
||||
|
||||
il = (il + 2 < nl) ? il + 2 : il % 2;
|
||||
x = (il < 2) ? x + (2+nl-1)/nl : x;
|
||||
x = (il < 2) ? x + (2 + nl - 1)/nl : x;
|
||||
y += BLOCK_SIZE_K;
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// load matrices from threadgroup memory and conduct outer products
|
||||
threadgroup T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
|
||||
threadgroup float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
|
||||
threadgroup const T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
|
||||
threadgroup const float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
|
||||
|
||||
#pragma unroll(4)
|
||||
for (short ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
|
||||
for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) {
|
||||
#pragma unroll(4)
|
||||
for (short i = 0; i < 4; i++) {
|
||||
simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
|
||||
}
|
||||
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
#pragma unroll(2)
|
||||
for (short i = 0; i < 2; i++) {
|
||||
simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
|
||||
}
|
||||
|
||||
lsma += BLOCK_SIZE_M/SG_MAT_ROW * SG_MAT_SIZE;
|
||||
lsmb += BLOCK_SIZE_N/SG_MAT_ROW * SG_MAT_SIZE;
|
||||
|
||||
#pragma unroll(8)
|
||||
for (short i = 0; i < 8; i++){
|
||||
simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
|
||||
}
|
||||
|
||||
lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE;
|
||||
lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
if ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1) {
|
||||
device float * C = (device float *) dst +
|
||||
(BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) + \
|
||||
(BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
|
||||
(BLOCK_SIZE_M * r0 + 32*(sgitg & 1)) + \
|
||||
(BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
|
||||
|
||||
for (short i = 0; i < 8; i++) {
|
||||
simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0);
|
||||
@@ -5536,7 +5541,7 @@ kernel void kernel_mul_mm(
|
||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
threadgroup float * temp_str = ((threadgroup float *) shmem) \
|
||||
+ 32 * (sgitg&1) + (16 * (sgitg>>1))*BLOCK_SIZE_M;
|
||||
+ 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M;
|
||||
for (short i = 0; i < 8; i++) {
|
||||
simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
|
||||
}
|
||||
|
||||
@@ -47,12 +47,10 @@ if (MUSAToolkit_FOUND)
|
||||
set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
|
||||
endforeach()
|
||||
|
||||
add_library(ggml-musa
|
||||
${GGML_HEADERS_MUSA}
|
||||
${GGML_SOURCES_MUSA})
|
||||
|
||||
target_link_libraries(ggml-musa PRIVATE ggml-base)
|
||||
target_include_directories(ggml-musa PRIVATE . ..)
|
||||
ggml_add_backend_library(ggml-musa
|
||||
${GGML_HEADERS_MUSA}
|
||||
${GGML_SOURCES_MUSA}
|
||||
)
|
||||
|
||||
# TODO: do not use CUDA definitions for MUSA
|
||||
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
message(STATUS "Using RPC backend")
|
||||
|
||||
add_library(ggml-rpc
|
||||
ggml-rpc.cpp)
|
||||
|
||||
target_link_libraries(ggml-rpc PRIVATE ggml-base)
|
||||
target_include_directories(ggml-rpc PRIVATE . ..)
|
||||
ggml_add_backend_library(ggml-rpc
|
||||
ggml-rpc.cpp
|
||||
)
|
||||
|
||||
if (WIN32)
|
||||
target_link_libraries(ggml-rpc PRIVATE ws2_32)
|
||||
|
||||
@@ -1369,8 +1369,9 @@ static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
|
||||
|
||||
ggml_backend_reg_t ggml_backend_rpc_reg(void) {
|
||||
static struct ggml_backend_reg ggml_backend_rpc_reg = {
|
||||
/* .iface = */ ggml_backend_rpc_reg_i,
|
||||
/* .context = */ NULL,
|
||||
/* .api_version = */ GGML_BACKEND_API_VERSION,
|
||||
/* .iface = */ ggml_backend_rpc_reg_i,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_rpc_reg;
|
||||
@@ -1401,3 +1402,5 @@ ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
|
||||
|
||||
return dev;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg)
|
||||
|
||||
@@ -16,12 +16,10 @@ endif()
|
||||
message(STATUS "SYCL found")
|
||||
#todo: AOT
|
||||
|
||||
add_library(ggml-sycl
|
||||
ggml-sycl.cpp
|
||||
../../include/ggml-sycl.h)
|
||||
|
||||
target_link_libraries(ggml-sycl PRIVATE ggml-base)
|
||||
target_include_directories(ggml-sycl PRIVATE . ..)
|
||||
ggml_add_backend_library(ggml-sycl
|
||||
ggml-sycl.cpp
|
||||
../../include/ggml-sycl.h
|
||||
)
|
||||
|
||||
if (GGML_SYCL_F16)
|
||||
if (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
|
||||
@@ -4637,16 +4637,17 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
|
||||
dev_ctx->description = prop.get_name();
|
||||
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .interface = */ ggml_backend_sycl_device_interface,
|
||||
/* .reg = */ ®,
|
||||
/* .context = */ dev_ctx
|
||||
/* .iface = */ ggml_backend_sycl_device_interface,
|
||||
/* .reg = */ ®,
|
||||
/* .context = */ dev_ctx
|
||||
};
|
||||
ctx->devices.push_back(dev);
|
||||
}
|
||||
|
||||
reg = ggml_backend_reg {
|
||||
/* .interface = */ ggml_backend_sycl_reg_interface,
|
||||
/* .context = */ ctx
|
||||
/* .api_version = */ GGML_BACKEND_API_VERSION,
|
||||
/* .iface = */ ggml_backend_sycl_reg_interface,
|
||||
/* .context = */ ctx
|
||||
};
|
||||
}
|
||||
|
||||
@@ -4678,3 +4679,4 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
|
||||
return sycl_backend;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg)
|
||||
|
||||
@@ -3,13 +3,13 @@ find_package(Vulkan COMPONENTS glslc REQUIRED)
|
||||
if (Vulkan_FOUND)
|
||||
message(STATUS "Vulkan found")
|
||||
|
||||
add_library(ggml-vulkan
|
||||
ggml-vulkan.cpp
|
||||
../../include/ggml-vulkan.h
|
||||
)
|
||||
ggml_add_backend_library(ggml-vulkan
|
||||
ggml-vulkan.cpp
|
||||
../../include/ggml-vulkan.h
|
||||
)
|
||||
|
||||
target_link_libraries(ggml-vulkan PRIVATE ggml-base Vulkan::Vulkan)
|
||||
target_include_directories(ggml-vulkan PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR})
|
||||
target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
|
||||
target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
# Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
|
||||
# Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
|
||||
|
||||
@@ -6738,8 +6738,9 @@ static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = {
|
||||
|
||||
ggml_backend_reg_t ggml_backend_vk_reg() {
|
||||
static ggml_backend_reg reg = {
|
||||
/* .iface = */ ggml_backend_vk_reg_i,
|
||||
/* .context = */ nullptr,
|
||||
/* .api_version = */ GGML_BACKEND_API_VERSION,
|
||||
/* .iface = */ ggml_backend_vk_reg_i,
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
return ®
|
||||
@@ -7365,3 +7366,5 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
|
||||
VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")");
|
||||
}
|
||||
#endif
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg)
|
||||
|
||||
@@ -474,9 +474,15 @@ void write_output_files() {
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
std::map<std::string, std::string> args;
|
||||
for (int i = 1; i < argc; i += 2) {
|
||||
if (i + 1 < argc) {
|
||||
args[argv[i]] = argv[i + 1];
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
std::string arg = argv[i];
|
||||
if (arg.rfind("--", 0) == 0) {
|
||||
if (i + 1 < argc && argv[i + 1][0] != '-') {
|
||||
args[arg] = argv[i + 1];
|
||||
++i;
|
||||
} else {
|
||||
args[arg] = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7571,3 +7571,26 @@ void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
|
||||
g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
|
||||
g_logger_state.log_callback_user_data = user_data;
|
||||
}
|
||||
|
||||
void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
|
||||
p->n_threads = n_threads;
|
||||
p->prio = 0; // default priority (usually means normal or inherited)
|
||||
p->poll = 50; // hybrid-polling enabled
|
||||
p->strict_cpu = false; // no strict placement (all threads share same cpumask)
|
||||
p->paused = false; // threads are ready to go
|
||||
memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
|
||||
}
|
||||
|
||||
struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
|
||||
struct ggml_threadpool_params p;
|
||||
ggml_threadpool_params_init(&p, n_threads);
|
||||
return p;
|
||||
}
|
||||
|
||||
bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
|
||||
if (p0->n_threads != p1->n_threads ) return false;
|
||||
if (p0->prio != p1->prio ) return false;
|
||||
if (p0->poll != p1->poll ) return false;
|
||||
if (p0->strict_cpu != p1->strict_cpu ) return false;
|
||||
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
|
||||
}
|
||||
|
||||
@@ -243,7 +243,7 @@ class MODEL_ARCH(IntEnum):
|
||||
COMMAND_R = auto()
|
||||
DBRX = auto()
|
||||
OLMO = auto()
|
||||
OLMO_1124 = auto()
|
||||
OLMO2 = auto()
|
||||
OLMOE = auto()
|
||||
OPENELM = auto()
|
||||
ARCTIC = auto()
|
||||
@@ -405,7 +405,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.COMMAND_R: "command-r",
|
||||
MODEL_ARCH.DBRX: "dbrx",
|
||||
MODEL_ARCH.OLMO: "olmo",
|
||||
MODEL_ARCH.OLMO_1124: "olmo_1124",
|
||||
MODEL_ARCH.OLMO2: "olmo2",
|
||||
MODEL_ARCH.OLMOE: "olmoe",
|
||||
MODEL_ARCH.OPENELM: "openelm",
|
||||
MODEL_ARCH.ARCTIC: "arctic",
|
||||
@@ -1071,7 +1071,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.OLMO_1124: [
|
||||
MODEL_ARCH.OLMO2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
|
||||
@@ -545,7 +545,10 @@ class Metadata:
|
||||
gguf_writer.add_size_label(self.size_label)
|
||||
|
||||
if self.license is not None:
|
||||
gguf_writer.add_license(self.license)
|
||||
if isinstance(self.license, list):
|
||||
gguf_writer.add_license(",".join(self.license))
|
||||
else:
|
||||
gguf_writer.add_license(self.license)
|
||||
if self.license_name is not None:
|
||||
gguf_writer.add_license_name(self.license_name)
|
||||
if self.license_link is not None:
|
||||
|
||||
@@ -13,7 +13,7 @@ class TensorNameMap:
|
||||
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
|
||||
"transformer.word_embeddings", # falcon
|
||||
"word_embeddings", # bloom
|
||||
"model.embed_tokens", # llama-hf nemotron olmoe olmo_1124
|
||||
"model.embed_tokens", # llama-hf nemotron olmoe olmo2
|
||||
"tok_embeddings", # llama-pth
|
||||
"embeddings.word_embeddings", # bert nomic-bert
|
||||
"language_model.embedding.word_embeddings", # persimmon
|
||||
@@ -54,7 +54,7 @@ class TensorNameMap:
|
||||
# Output
|
||||
MODEL_TENSOR.OUTPUT: (
|
||||
"embed_out", # gptneox
|
||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo_1124
|
||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2
|
||||
"output", # llama-pth bloom internlm2
|
||||
"word_embeddings_for_head", # persimmon
|
||||
"lm_head.linear", # phi2
|
||||
@@ -66,7 +66,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.OUTPUT_NORM: (
|
||||
"gpt_neox.final_layer_norm", # gptneox
|
||||
"transformer.ln_f", # gpt2 gpt-j falcon jais exaone
|
||||
"model.norm", # llama-hf baichuan internlm2 olmoe olmo_1124
|
||||
"model.norm", # llama-hf baichuan internlm2 olmoe olmo2
|
||||
"norm", # llama-pth
|
||||
"transformer.norm_f", # mpt dbrx
|
||||
"ln_f", # refact bloom qwen gpt2
|
||||
@@ -145,7 +145,7 @@ class TensorNameMap:
|
||||
|
||||
# Attention query
|
||||
MODEL_TENSOR.ATTN_Q: (
|
||||
"model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo_1124
|
||||
"model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2
|
||||
"layers.{bid}.attention.wq", # llama-pth
|
||||
"encoder.layer.{bid}.attention.self.query", # bert
|
||||
"transformer.h.{bid}.attn.q_proj", # gpt-j
|
||||
@@ -157,7 +157,7 @@ class TensorNameMap:
|
||||
|
||||
# Attention key
|
||||
MODEL_TENSOR.ATTN_K: (
|
||||
"model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo_1124
|
||||
"model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2
|
||||
"layers.{bid}.attention.wk", # llama-pth
|
||||
"encoder.layer.{bid}.attention.self.key", # bert
|
||||
"transformer.h.{bid}.attn.k_proj", # gpt-j
|
||||
@@ -170,7 +170,7 @@ class TensorNameMap:
|
||||
|
||||
# Attention value
|
||||
MODEL_TENSOR.ATTN_V: (
|
||||
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo_1124
|
||||
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2
|
||||
"layers.{bid}.attention.wv", # llama-pth
|
||||
"encoder.layer.{bid}.attention.self.value", # bert
|
||||
"transformer.h.{bid}.attn.v_proj", # gpt-j
|
||||
@@ -188,7 +188,7 @@ class TensorNameMap:
|
||||
"transformer.blocks.{bid}.attn.out_proj", # mpt
|
||||
"transformer.h.{bid}.self_attention.dense", # falcon
|
||||
"h.{bid}.self_attention.dense", # bloom
|
||||
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo_1124
|
||||
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2
|
||||
"layers.{bid}.attention.wo", # llama-pth
|
||||
"encoder.layer.{bid}.attention.output.dense", # bert
|
||||
"transformer.h.{bid}.attn.out_proj", # gpt-j
|
||||
@@ -215,7 +215,7 @@ class TensorNameMap:
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_POST_NORM: (
|
||||
"model.layers.{bid}.post_attention_layernorm", # gemma2 olmo_1124
|
||||
"model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2
|
||||
),
|
||||
|
||||
# Rotary embeddings
|
||||
@@ -250,7 +250,7 @@ class TensorNameMap:
|
||||
|
||||
# Post feed-forward norm
|
||||
MODEL_TENSOR.FFN_POST_NORM: (
|
||||
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo_1124
|
||||
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP: (
|
||||
@@ -273,7 +273,7 @@ class TensorNameMap:
|
||||
"transformer.blocks.{bid}.ffn.up_proj", # mpt
|
||||
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
|
||||
"h.{bid}.mlp.dense_h_to_4h", # bloom
|
||||
"model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo_1124
|
||||
"model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2
|
||||
"layers.{bid}.feed_forward.w3", # llama-pth
|
||||
"encoder.layer.{bid}.intermediate.dense", # bert
|
||||
"transformer.h.{bid}.mlp.fc_in", # gpt-j
|
||||
@@ -314,7 +314,7 @@ class TensorNameMap:
|
||||
|
||||
# Feed-forward gate
|
||||
MODEL_TENSOR.FFN_GATE: (
|
||||
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo_1124
|
||||
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2
|
||||
"layers.{bid}.feed_forward.w1", # llama-pth
|
||||
"transformer.h.{bid}.mlp.w2", # qwen
|
||||
"transformer.h.{bid}.mlp.c_fc2", # jais
|
||||
@@ -346,7 +346,7 @@ class TensorNameMap:
|
||||
"transformer.blocks.{bid}.ffn.down_proj", # mpt
|
||||
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
|
||||
"h.{bid}.mlp.dense_4h_to_h", # bloom
|
||||
"model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo_1124
|
||||
"model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2
|
||||
"layers.{bid}.feed_forward.w2", # llama-pth
|
||||
"encoder.layer.{bid}.output.dense", # bert
|
||||
"transformer.h.{bid}.mlp.fc_out", # gpt-j
|
||||
@@ -383,7 +383,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.ATTN_Q_NORM: (
|
||||
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
|
||||
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
|
||||
"model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo_1124
|
||||
"model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2
|
||||
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
|
||||
"transformer.layers.{bid}.attn.q_norm", # openelm
|
||||
@@ -392,7 +392,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.ATTN_K_NORM: (
|
||||
"language_model.encoder.layers.{bid}.self_attention.k_layernorm",
|
||||
"model.layers.{bid}.self_attn.k_layernorm", # persimmon
|
||||
"model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo_1124
|
||||
"model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2
|
||||
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
|
||||
"transformer.layers.{bid}.attn.k_norm", # openelm
|
||||
|
||||
25
include/llama-cpp.h
Normal file
25
include/llama-cpp.h
Normal file
@@ -0,0 +1,25 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef __cplusplus
|
||||
#error "This header is for C++ only"
|
||||
#endif
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
struct llama_model_deleter {
|
||||
void operator()(llama_model * model) { llama_free_model(model); }
|
||||
};
|
||||
|
||||
struct llama_context_deleter {
|
||||
void operator()(llama_context * context) { llama_free(context); }
|
||||
};
|
||||
|
||||
struct llama_sampler_deleter {
|
||||
void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
|
||||
};
|
||||
|
||||
typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
|
||||
typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
|
||||
typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
|
||||
@@ -272,6 +272,9 @@ extern "C" {
|
||||
};
|
||||
|
||||
struct llama_model_params {
|
||||
// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
|
||||
ggml_backend_dev_t * devices;
|
||||
|
||||
int32_t n_gpu_layers; // number of layers to store in VRAM
|
||||
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
|
||||
|
||||
|
||||
@@ -8,5 +8,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(vdot)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
add_subdirectory(vdot)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
129
src/llama.cpp
129
src/llama.cpp
@@ -179,7 +179,7 @@ enum llm_arch {
|
||||
LLM_ARCH_COMMAND_R,
|
||||
LLM_ARCH_DBRX,
|
||||
LLM_ARCH_OLMO,
|
||||
LLM_ARCH_OLMO_1124,
|
||||
LLM_ARCH_OLMO2,
|
||||
LLM_ARCH_OLMOE,
|
||||
LLM_ARCH_OPENELM,
|
||||
LLM_ARCH_ARCTIC,
|
||||
@@ -233,7 +233,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_COMMAND_R, "command-r" },
|
||||
{ LLM_ARCH_DBRX, "dbrx" },
|
||||
{ LLM_ARCH_OLMO, "olmo" },
|
||||
{ LLM_ARCH_OLMO_1124, "olmo_1124" },
|
||||
{ LLM_ARCH_OLMO2, "olmo2" },
|
||||
{ LLM_ARCH_OLMOE, "olmoe" },
|
||||
{ LLM_ARCH_OPENELM, "openelm" },
|
||||
{ LLM_ARCH_ARCTIC, "arctic" },
|
||||
@@ -1210,7 +1210,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_OLMO_1124,
|
||||
LLM_ARCH_OLMO2,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
@@ -4866,7 +4866,9 @@ struct llama_model_loader {
|
||||
mappings.reserve(files.size());
|
||||
mmaps_used.reserve(files.size());
|
||||
for (const auto & file : files) {
|
||||
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
|
||||
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
|
||||
auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
|
||||
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
|
||||
mmaps_used.emplace_back(mapping->size, 0);
|
||||
if (mlock_mmaps) {
|
||||
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
|
||||
@@ -5898,7 +5900,7 @@ static void llm_load_hparams(
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_OLMO_1124:
|
||||
case LLM_ARCH_OLMO2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
@@ -7181,12 +7183,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
|
||||
} break;
|
||||
case GGML_OP_ADD:
|
||||
{
|
||||
ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512);
|
||||
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
|
||||
op_tensor = ggml_add(ctx, a, w);
|
||||
} break;
|
||||
case GGML_OP_MUL:
|
||||
{
|
||||
ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, w->ne[0], 512);
|
||||
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
|
||||
op_tensor = ggml_mul(ctx, a, w);
|
||||
} break;
|
||||
case GGML_OP_DIV:
|
||||
@@ -8591,7 +8593,7 @@ static bool llm_load_tensors(
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_OLMO_1124:
|
||||
case LLM_ARCH_OLMO2:
|
||||
{
|
||||
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
@@ -9190,7 +9192,7 @@ static bool llm_load_tensors(
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||
if (!dev) {
|
||||
// FIXME: workaround for CPU backend buft having a NULL device
|
||||
dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0);
|
||||
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
}
|
||||
ggml_backend_dev_props props;
|
||||
ggml_backend_dev_get_props(dev, &props);
|
||||
@@ -14481,7 +14483,7 @@ struct llm_build_context {
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_olmo_1124() {
|
||||
struct ggml_cgraph * build_olmo2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||
@@ -16797,9 +16799,9 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_olmo();
|
||||
} break;
|
||||
case LLM_ARCH_OLMO_1124:
|
||||
case LLM_ARCH_OLMO2:
|
||||
{
|
||||
result = llm.build_olmo_1124();
|
||||
result = llm.build_olmo2();
|
||||
} break;
|
||||
case LLM_ARCH_OLMOE:
|
||||
{
|
||||
@@ -17443,8 +17445,9 @@ static enum ggml_status llama_graph_compute(
|
||||
int n_threads,
|
||||
ggml_threadpool * threadpool) {
|
||||
if (lctx.backend_cpu != nullptr) {
|
||||
ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
|
||||
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
|
||||
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
|
||||
auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
|
||||
set_threadpool_fn(lctx.backend_cpu, threadpool);
|
||||
}
|
||||
|
||||
// set the number of threads for all the backends
|
||||
@@ -19361,6 +19364,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
|
||||
//
|
||||
struct llama_model_params llama_model_default_params() {
|
||||
struct llama_model_params result = {
|
||||
/*.devices =*/ nullptr,
|
||||
/*.n_gpu_layers =*/ 0,
|
||||
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
||||
/*.main_gpu =*/ 0,
|
||||
@@ -19478,7 +19482,11 @@ void llama_backend_init(void) {
|
||||
|
||||
void llama_numa_init(enum ggml_numa_strategy numa) {
|
||||
if (numa != GGML_NUMA_STRATEGY_DISABLED) {
|
||||
ggml_numa_init(numa);
|
||||
auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
GGML_ASSERT(dev && "CPU backend is not loaded");
|
||||
auto * reg = ggml_backend_dev_backend_reg(dev);
|
||||
auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
|
||||
numa_init_fn(numa);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19569,19 +19577,24 @@ struct llama_model * llama_load_model_from_file(
|
||||
}
|
||||
|
||||
// create list of devices to use with this model
|
||||
// currently, we use all available devices
|
||||
// TODO: rework API to give user more control over device selection
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||
switch (ggml_backend_dev_type(dev)) {
|
||||
case GGML_BACKEND_DEVICE_TYPE_CPU:
|
||||
case GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
||||
// skip CPU backends since they are handled separately
|
||||
break;
|
||||
if (params.devices) {
|
||||
for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
|
||||
model->devices.push_back(*dev);
|
||||
}
|
||||
} else {
|
||||
// use all available devices
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||
switch (ggml_backend_dev_type(dev)) {
|
||||
case GGML_BACKEND_DEVICE_TYPE_CPU:
|
||||
case GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
||||
// skip CPU backends since they are handled separately
|
||||
break;
|
||||
|
||||
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||
model->devices.push_back(dev);
|
||||
break;
|
||||
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||
model->devices.push_back(dev);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19752,9 +19765,6 @@ struct llama_context * llama_new_context_with_model(
|
||||
__func__, n_ctx_per_seq, hparams.n_ctx_train);
|
||||
}
|
||||
|
||||
ctx->abort_callback = params.abort_callback;
|
||||
ctx->abort_callback_data = params.abort_callback_data;
|
||||
|
||||
ctx->logits_all = params.logits_all;
|
||||
|
||||
// build worst-case graph for encoder if a model contains encoder
|
||||
@@ -19803,7 +19813,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
}
|
||||
|
||||
// add CPU backend
|
||||
ctx->backend_cpu = ggml_backend_cpu_init();
|
||||
ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
|
||||
if (ctx->backend_cpu == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
||||
llama_free(ctx);
|
||||
@@ -19823,6 +19833,8 @@ struct llama_context * llama_new_context_with_model(
|
||||
}
|
||||
}
|
||||
|
||||
llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
|
||||
|
||||
if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
|
||||
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
||||
llama_free(ctx);
|
||||
@@ -19868,7 +19880,8 @@ struct llama_context * llama_new_context_with_model(
|
||||
std::vector<ggml_backend_t> backend_ptrs;
|
||||
for (auto & backend : ctx->backends) {
|
||||
auto * buft = ggml_backend_get_default_buffer_type(backend.get());
|
||||
if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) {
|
||||
auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
|
||||
if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
|
||||
// use the host buffer of the first device CPU for faster transfer of the intermediate state
|
||||
auto * dev = model->devices[0];
|
||||
auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
|
||||
@@ -19896,7 +19909,8 @@ struct llama_context * llama_new_context_with_model(
|
||||
// pipeline parallelism requires support for async compute and events in all devices
|
||||
if (pipeline_parallel) {
|
||||
for (auto & backend : ctx->backends) {
|
||||
if (ggml_backend_is_cpu(backend.get())) {
|
||||
auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
|
||||
if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
// ignore CPU backend
|
||||
continue;
|
||||
}
|
||||
@@ -20070,7 +20084,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_QWEN:
|
||||
case LLM_ARCH_QWEN2:
|
||||
case LLM_ARCH_QWEN2MOE:
|
||||
case LLM_ARCH_OLMO_1124:
|
||||
case LLM_ARCH_OLMO2:
|
||||
case LLM_ARCH_OLMOE:
|
||||
case LLM_ARCH_PHI2:
|
||||
case LLM_ARCH_PHI3:
|
||||
@@ -21450,6 +21464,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) {
|
||||
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
|
||||
ctx->abort_callback = abort_callback;
|
||||
ctx->abort_callback_data = abort_callback_data;
|
||||
|
||||
for (auto & backend : ctx->backends) {
|
||||
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
|
||||
auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
|
||||
if (set_abort_callback_fn) {
|
||||
set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
|
||||
@@ -22191,32 +22213,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
|
||||
}
|
||||
|
||||
const char * llama_print_system_info(void) {
|
||||
ggml_cpu_init(); // some ARM features are detected at runtime
|
||||
|
||||
static std::string s;
|
||||
|
||||
s = "";
|
||||
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
|
||||
s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
|
||||
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
||||
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
||||
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
|
||||
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
|
||||
s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
|
||||
s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | ";
|
||||
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
||||
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
||||
s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
|
||||
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
||||
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
||||
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
||||
s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
|
||||
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
||||
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
||||
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
||||
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
||||
s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
|
||||
s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | ";
|
||||
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
||||
auto * reg = ggml_backend_reg_get(i);
|
||||
auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
|
||||
if (get_features_fn) {
|
||||
ggml_backend_feature * features = get_features_fn(reg);
|
||||
s += ggml_backend_reg_name(reg);
|
||||
s += " : ";
|
||||
for (; features->name; features++) {
|
||||
s += features->name;
|
||||
s += " = ";
|
||||
s += features->value;
|
||||
s += " | ";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return s.c_str();
|
||||
}
|
||||
|
||||
@@ -110,23 +110,26 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU
|
||||
# llama_target_and_test(test-double-float.cpp) # SLOW
|
||||
llama_target_and_test(test-log.cpp)
|
||||
llama_target_and_test(test-arg-parser.cpp)
|
||||
llama_target_and_test(test-quantize-fns.cpp)
|
||||
llama_target_and_test(test-quantize-perf.cpp)
|
||||
llama_target_and_test(test-sampling.cpp)
|
||||
llama_target_and_test(test-chat-template.cpp)
|
||||
|
||||
llama_target_and_test(test-grammar-parser.cpp)
|
||||
llama_target_and_test(test-grammar-integration.cpp)
|
||||
llama_target_and_test(test-llama-grammar.cpp)
|
||||
llama_target_and_test(test-barrier.cpp)
|
||||
# llama_target_and_test(test-opt.cpp) # SLOW
|
||||
llama_target_and_test(test-backend-ops.cpp)
|
||||
|
||||
llama_target_and_test(test-rope.cpp)
|
||||
|
||||
llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
|
||||
llama_target_and_test(test-autorelease.cpp LABEL "model")
|
||||
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
# these tests use the backends directly and cannot be built with dynamic loading
|
||||
llama_target_and_test(test-barrier.cpp)
|
||||
llama_target_and_test(test-quantize-fns.cpp)
|
||||
llama_target_and_test(test-quantize-perf.cpp)
|
||||
llama_target_and_test(test-rope.cpp)
|
||||
endif()
|
||||
|
||||
# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
|
||||
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
||||
llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
|
||||
|
||||
@@ -70,7 +70,7 @@ int main(void) {
|
||||
|
||||
// non-existence arg in specific example (--draft cannot be used outside llama-speculative)
|
||||
argv = {"binary_name", "--draft", "123"};
|
||||
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
|
||||
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_EMBEDDING));
|
||||
|
||||
|
||||
printf("test-arg-parser: test valid usage\n\n");
|
||||
@@ -96,7 +96,7 @@ int main(void) {
|
||||
// --draft cannot be used outside llama-speculative
|
||||
argv = {"binary_name", "--draft", "123"};
|
||||
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
|
||||
assert(params.n_draft == 123);
|
||||
assert(params.speculative.n_max == 123);
|
||||
|
||||
// skip this part on windows, because setenv is not supported
|
||||
#ifdef _WIN32
|
||||
|
||||
@@ -16,7 +16,6 @@
|
||||
|
||||
|
||||
#include <ggml.h>
|
||||
#include <ggml-cpu.h>
|
||||
#include <ggml-alloc.h>
|
||||
#include <ggml-backend.h>
|
||||
|
||||
@@ -26,7 +25,6 @@
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <cinttypes>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <stdio.h>
|
||||
@@ -639,19 +637,20 @@ struct test_case {
|
||||
|
||||
// determine number of runs
|
||||
int n_runs;
|
||||
bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU;
|
||||
if (op_flops(out) > 0) {
|
||||
// based on flops
|
||||
const uint64_t GFLOP = 1000 * 1000 * 1000;
|
||||
const uint64_t target_flops_cpu = 8ULL * GFLOP;
|
||||
const uint64_t target_flops_gpu = 100ULL * GFLOP;
|
||||
uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu;
|
||||
uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu;
|
||||
n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
|
||||
} else {
|
||||
// based on memory size
|
||||
const size_t GB = 1ULL << 30;
|
||||
const size_t target_size_cpu = 8 * GB;
|
||||
const size_t target_size_gpu = 32 * GB;
|
||||
size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu;
|
||||
size_t target_size = is_cpu ? target_size_cpu : target_size_gpu;
|
||||
n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
|
||||
}
|
||||
|
||||
@@ -3873,7 +3872,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
|
||||
if (mode == MODE_TEST) {
|
||||
auto test_cases = make_test_cases_eval();
|
||||
ggml_backend_t backend_cpu = ggml_backend_cpu_init();
|
||||
ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
|
||||
if (backend_cpu == NULL) {
|
||||
printf(" Failed to initialize CPU backend\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t n_ok = 0;
|
||||
for (auto & test : test_cases) {
|
||||
@@ -3953,7 +3956,9 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
// enumerate backends
|
||||
// load and enumerate backends
|
||||
ggml_backend_load_all();
|
||||
|
||||
printf("Testing %zu devices\n\n", ggml_backend_dev_count());
|
||||
|
||||
size_t n_ok = 0;
|
||||
@@ -3969,16 +3974,15 @@ int main(int argc, char ** argv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
|
||||
GGML_ASSERT(backend != NULL);
|
||||
|
||||
if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
|
||||
if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) {
|
||||
printf(" Skipping CPU backend\n");
|
||||
ggml_backend_free(backend);
|
||||
n_ok++;
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
|
||||
GGML_ASSERT(backend != NULL);
|
||||
|
||||
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
|
||||
auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
|
||||
if (ggml_backend_set_n_threads_fn) {
|
||||
|
||||
@@ -79,9 +79,9 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
|
||||
}
|
||||
|
||||
// Total dot product error
|
||||
static float dot_product_error(
|
||||
const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float *test_data2
|
||||
) {
|
||||
static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) {
|
||||
GGML_UNUSED(qfns);
|
||||
|
||||
std::vector<uint8_t> tmp_q1(2*test_size);
|
||||
std::vector<uint8_t> tmp_q2(2*test_size);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user