mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-05 00:24:07 +00:00
Compare commits
27 Commits
master-953
...
gguf-pip
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0248ca811e | ||
|
|
2897926d90 | ||
|
|
8798aea247 | ||
|
|
3e98cbe76e | ||
|
|
87338093d6 | ||
|
|
b8a777e77b | ||
|
|
2e5f70a25f | ||
|
|
d0f77b1353 | ||
|
|
0d3094f0c7 | ||
|
|
01f2224682 | ||
|
|
38b16dfca6 | ||
|
|
8f8c28e89c | ||
|
|
7694adda8d | ||
|
|
fea95c682d | ||
|
|
ef955fbd23 | ||
|
|
d67777c202 | ||
|
|
c3e53b421a | ||
|
|
0288361b65 | ||
|
|
344f6e373b | ||
|
|
5dd870574e | ||
|
|
050046fa45 | ||
|
|
6e91a1b070 | ||
|
|
44d5462b5c | ||
|
|
c7868b0753 | ||
|
|
79da24b58c | ||
|
|
cf658adc83 | ||
|
|
a192860cfe |
63
.github/workflows/build.yml
vendored
63
.github/workflows/build.yml
vendored
@@ -291,24 +291,32 @@ jobs:
|
||||
cd build
|
||||
ctest -C Release --verbose --timeout 900
|
||||
|
||||
- name: Get commit hash
|
||||
id: commit
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: pr-mpt/actions-commit-hash@v2
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
|
||||
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
path: |
|
||||
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
|
||||
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
|
||||
|
||||
windows-latest-cmake-cublas:
|
||||
runs-on: windows-latest
|
||||
@@ -338,23 +346,31 @@ jobs:
|
||||
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
|
||||
cmake --build . --config Release
|
||||
|
||||
- name: Get commit hash
|
||||
id: commit
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: pr-mpt/actions-commit-hash@v2
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
path: |
|
||||
llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
|
||||
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
if: ${{ matrix.cuda == '12.1.0' }}
|
||||
@@ -400,21 +416,34 @@ jobs:
|
||||
- windows-latest-cmake-cublas
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Download artifacts
|
||||
id: download-artifact
|
||||
uses: actions/download-artifact@v3
|
||||
|
||||
- name: Get commit hash
|
||||
id: commit
|
||||
uses: pr-mpt/actions-commit-hash@v2
|
||||
|
||||
- name: Create release
|
||||
id: create_release
|
||||
uses: anzz1/action-create-release@v1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
|
||||
- name: Upload release
|
||||
id: upload_release
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -60,6 +60,7 @@ compile_commands.json
|
||||
CMakeSettings.json
|
||||
|
||||
__pycache__
|
||||
dist
|
||||
|
||||
zig-out/
|
||||
zig-cache/
|
||||
@@ -70,7 +71,6 @@ perf-*.txt
|
||||
|
||||
examples/jeopardy/results.txt
|
||||
|
||||
pyproject.toml
|
||||
poetry.lock
|
||||
poetry.toml
|
||||
|
||||
|
||||
160
README.md
160
README.md
@@ -11,15 +11,17 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
### Hot topics
|
||||
|
||||
A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
|
||||
- Added support for Falcon models: https://github.com/ggerganov/llama.cpp/pull/2717
|
||||
|
||||
Last revision compatible with the old format: [dadbed9](https://github.com/ggerganov/llama.cpp/commit/dadbed99e65252d79f81101a392d0d6497b86caa)
|
||||
- A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
|
||||
|
||||
### Current `master` should be considered in Beta - expect some issues for a few days!
|
||||
Last revision compatible with the old format: [dadbed9](https://github.com/ggerganov/llama.cpp/commit/dadbed99e65252d79f81101a392d0d6497b86caa)
|
||||
|
||||
### Be prepared to re-convert and / or re-quantize your GGUF models while this notice is up!
|
||||
### Current `master` should be considered in Beta - expect some issues for a few days!
|
||||
|
||||
### Issues with non-GGUF models will be considered with low priority!
|
||||
### Be prepared to re-convert and / or re-quantize your GGUF models while this notice is up!
|
||||
|
||||
### Issues with non-GGUF models will be considered with low priority!
|
||||
|
||||
----
|
||||
|
||||
@@ -66,12 +68,11 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
|
||||
- Apple silicon first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
|
||||
- AVX, AVX2 and AVX512 support for x86 architectures
|
||||
- Mixed F16 / F32 precision
|
||||
- 4-bit, 5-bit and 8-bit integer quantization support
|
||||
- Supports OpenBLAS/Apple BLAS/ARM Performance Lib/ATLAS/BLIS/Intel MKL/NVHPC/ACML/SCSL/SGIMATH and [more](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) in BLAS
|
||||
- cuBLAS and CLBlast support
|
||||
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
|
||||
- CUDA, Metal and OpenCL GPU backend support
|
||||
|
||||
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
|
||||
Since then, the project has improved significantly thanks to many contributions. This project is for educational purposes and serves
|
||||
Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
|
||||
as the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
|
||||
|
||||
**Supported platforms:**
|
||||
@@ -85,6 +86,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
|
||||
- [X] LLaMA 🦙
|
||||
- [x] LLaMA 2 🦙🦙
|
||||
- [X] Falcon
|
||||
- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
|
||||
- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
|
||||
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
|
||||
@@ -115,90 +117,84 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
|
||||
---
|
||||
|
||||
Here is a typical run using LLaMA-7B:
|
||||
Here is a typical run using LLaMA v2 13B on M2 Ultra:
|
||||
|
||||
```java
|
||||
make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
|
||||
I llama.cpp build info:
|
||||
I UNAME_S: Darwin
|
||||
I UNAME_P: arm
|
||||
I UNAME_M: arm64
|
||||
I CFLAGS: -I. -O3 -DNDEBUG -std=c11 -fPIC -pthread -DGGML_USE_ACCELERATE
|
||||
I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
|
||||
I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE
|
||||
I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS
|
||||
I LDFLAGS: -framework Accelerate
|
||||
I CC: Apple clang version 14.0.0 (clang-1400.0.29.202)
|
||||
I CXX: Apple clang version 14.0.0 (clang-1400.0.29.202)
|
||||
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
|
||||
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
|
||||
|
||||
make: Nothing to be done for `default'.
|
||||
main: seed = 1678486056
|
||||
llama_model_load: loading model from './models/7B/ggml-model-q4_0.bin' - please wait ...
|
||||
llama_model_load: n_vocab = 32000
|
||||
llama_model_load: n_ctx = 512
|
||||
llama_model_load: n_embd = 4096
|
||||
llama_model_load: n_mult = 256
|
||||
llama_model_load: n_head = 32
|
||||
llama_model_load: n_layer = 32
|
||||
llama_model_load: n_rot = 128
|
||||
llama_model_load: f16 = 2
|
||||
llama_model_load: n_ff = 11008
|
||||
llama_model_load: ggml ctx size = 4529.34 MB
|
||||
llama_model_load: memory_size = 512.00 MB, n_mem = 16384
|
||||
llama_model_load: .................................... done
|
||||
llama_model_load: model size = 4017.27 MB / num tensors = 291
|
||||
main: build = 1041 (cf658ad)
|
||||
main: seed = 1692823051
|
||||
llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
|
||||
llama_model_loader: - type f32: 81 tensors
|
||||
llama_model_loader: - type q4_0: 281 tensors
|
||||
llama_model_loader: - type q6_K: 1 tensors
|
||||
llm_load_print_meta: format = GGUF V1 (latest)
|
||||
llm_load_print_meta: arch = llama
|
||||
llm_load_print_meta: vocab type = SPM
|
||||
llm_load_print_meta: n_vocab = 32000
|
||||
llm_load_print_meta: n_merges = 0
|
||||
llm_load_print_meta: n_ctx_train = 4096
|
||||
llm_load_print_meta: n_ctx = 512
|
||||
llm_load_print_meta: n_embd = 5120
|
||||
llm_load_print_meta: n_head = 40
|
||||
llm_load_print_meta: n_head_kv = 40
|
||||
llm_load_print_meta: n_layer = 40
|
||||
llm_load_print_meta: n_rot = 128
|
||||
llm_load_print_meta: n_gqa = 1
|
||||
llm_load_print_meta: f_norm_eps = 1.0e-05
|
||||
llm_load_print_meta: f_norm_rms_eps = 1.0e-05
|
||||
llm_load_print_meta: n_ff = 13824
|
||||
llm_load_print_meta: freq_base = 10000.0
|
||||
llm_load_print_meta: freq_scale = 1
|
||||
llm_load_print_meta: model type = 13B
|
||||
llm_load_print_meta: model ftype = mostly Q4_0
|
||||
llm_load_print_meta: model size = 13.02 B
|
||||
llm_load_print_meta: general.name = LLaMA v2
|
||||
llm_load_print_meta: BOS token = 1 '<s>'
|
||||
llm_load_print_meta: EOS token = 2 '</s>'
|
||||
llm_load_print_meta: UNK token = 0 '<unk>'
|
||||
llm_load_print_meta: LF token = 13 '<0x0A>'
|
||||
llm_load_tensors: ggml ctx size = 0.11 MB
|
||||
llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state)
|
||||
...................................................................................................
|
||||
llama_new_context_with_model: kv self size = 400.00 MB
|
||||
llama_new_context_with_model: compute buffer total size = 75.41 MB
|
||||
|
||||
main: prompt: 'Building a website can be done in 10 simple steps:'
|
||||
main: number of tokens in prompt = 15
|
||||
1 -> ''
|
||||
8893 -> 'Build'
|
||||
292 -> 'ing'
|
||||
263 -> ' a'
|
||||
4700 -> ' website'
|
||||
508 -> ' can'
|
||||
367 -> ' be'
|
||||
2309 -> ' done'
|
||||
297 -> ' in'
|
||||
29871 -> ' '
|
||||
29896 -> '1'
|
||||
29900 -> '0'
|
||||
2560 -> ' simple'
|
||||
6576 -> ' steps'
|
||||
29901 -> ':'
|
||||
|
||||
sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000
|
||||
system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
|
||||
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
|
||||
generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
|
||||
|
||||
|
||||
Building a website can be done in 10 simple steps:
|
||||
1) Select a domain name and web hosting plan
|
||||
2) Complete a sitemap
|
||||
3) List your products
|
||||
4) Write product descriptions
|
||||
5) Create a user account
|
||||
6) Build the template
|
||||
7) Start building the website
|
||||
8) Advertise the website
|
||||
9) Provide email support
|
||||
10) Submit the website to search engines
|
||||
A website is a collection of web pages that are formatted with HTML. HTML is the code that defines what the website looks like and how it behaves.
|
||||
The HTML code is formatted into a template or a format. Once this is done, it is displayed on the user's browser.
|
||||
The web pages are stored in a web server. The web server is also called a host. When the website is accessed, it is retrieved from the server and displayed on the user's computer.
|
||||
A website is known as a website when it is hosted. This means that it is displayed on a host. The host is usually a web server.
|
||||
A website can be displayed on different browsers. The browsers are basically the software that renders the website on the user's screen.
|
||||
A website can also be viewed on different devices such as desktops, tablets and smartphones.
|
||||
Hence, to have a website displayed on a browser, the website must be hosted.
|
||||
A domain name is an address of a website. It is the name of the website.
|
||||
The website is known as a website when it is hosted. This means that it is displayed on a host. The host is usually a web server.
|
||||
A website can be displayed on different browsers. The browsers are basically the software that renders the website on the user’s screen.
|
||||
A website can also be viewed on different devices such as desktops, tablets and smartphones. Hence, to have a website displayed on a browser, the website must be hosted.
|
||||
A domain name is an address of a website. It is the name of the website.
|
||||
A website is an address of a website. It is a collection of web pages that are formatted with HTML. HTML is the code that defines what the website looks like and how it behaves.
|
||||
The HTML code is formatted into a template or a format. Once this is done, it is displayed on the user’s browser.
|
||||
A website is known as a website when it is hosted
|
||||
|
||||
main: mem per token = 14434244 bytes
|
||||
main: load time = 1332.48 ms
|
||||
main: sample time = 1081.40 ms
|
||||
main: predict time = 31378.77 ms / 61.41 ms per token
|
||||
main: total time = 34036.74 ms
|
||||
Building a website can be done in 10 simple steps:
|
||||
Step 1: Find the right website platform.
|
||||
Step 2: Choose your domain name and hosting plan.
|
||||
Step 3: Design your website layout.
|
||||
Step 4: Write your website content and add images.
|
||||
Step 5: Install security features to protect your site from hackers or spammers
|
||||
Step 6: Test your website on multiple browsers, mobile devices, operating systems etc…
|
||||
Step 7: Test it again with people who are not related to you personally – friends or family members will work just fine!
|
||||
Step 8: Start marketing and promoting the website via social media channels or paid ads
|
||||
Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc…
|
||||
Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further!
|
||||
How does a Website Work?
|
||||
A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
|
||||
The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
|
||||
How to
|
||||
llama_print_timings: load time = 576.45 ms
|
||||
llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
|
||||
llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
|
||||
llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
|
||||
llama_print_timings: total time = 25431.49 ms
|
||||
```
|
||||
|
||||
And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
|
||||
@@ -543,6 +539,8 @@ As the models are currently fully loaded into memory, you will need adequate dis
|
||||
|
||||
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
|
||||
|
||||
*(outdated)*
|
||||
|
||||
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|
||||
|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
|
||||
| 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
|
||||
|
||||
@@ -744,35 +744,3 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok
|
||||
|
||||
return std::string(result.data(), result.size());
|
||||
}
|
||||
|
||||
std::vector<llama_token> llama_tokenize_bpe(
|
||||
struct llama_context * ctx,
|
||||
const std::string & text,
|
||||
bool add_bos) {
|
||||
int n_tokens = text.length() + add_bos;
|
||||
std::vector<llama_token> result(n_tokens);
|
||||
n_tokens = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) {
|
||||
std::vector<char> result(8, 0);
|
||||
const int n_tokens = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
const int check = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
}
|
||||
|
||||
return std::string(result.data(), result.size());
|
||||
}
|
||||
|
||||
|
||||
@@ -120,15 +120,6 @@ std::vector<llama_token> llama_tokenize(
|
||||
const std::string & text,
|
||||
bool add_bos);
|
||||
|
||||
std::vector<llama_token> llama_tokenize_bpe(
|
||||
struct llama_context * ctx,
|
||||
const std::string & text,
|
||||
bool add_bos);
|
||||
|
||||
std::string llama_token_to_str(
|
||||
const struct llama_context * ctx,
|
||||
llama_token token);
|
||||
|
||||
std::string llama_token_to_str_bpe(
|
||||
const struct llama_context * ctx,
|
||||
llama_token token);
|
||||
|
||||
@@ -95,21 +95,27 @@ print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["n_layer"]
|
||||
|
||||
gguf_writer.add_name(last_dir)
|
||||
gguf_writer.add_name("Falcon")
|
||||
gguf_writer.add_context_length(2048) # not in config.json
|
||||
gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_head_count(hparams["n_head"])
|
||||
if "n_head_kv" in hparams: gguf_writer.add_head_count_kv(hparams["n_head_kv"])
|
||||
if "n_head_kv" in hparams:
|
||||
gguf_writer.add_head_count_kv(hparams["n_head_kv"])
|
||||
else:
|
||||
gguf_writer.add_head_count_kv(1)
|
||||
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
|
||||
gguf_writer.add_file_type(ftype)
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: List[str] = []
|
||||
scores: List[float] = []
|
||||
toktypes: List[int] = []
|
||||
merges: List[str] = []
|
||||
|
||||
|
||||
@@ -153,41 +159,30 @@ if Path(dir_model + "/tokenizer.json").is_file():
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dymmy
|
||||
toktypes.append(gguf.TokenType.NORMAL) # dummy
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
|
||||
print("gguf: get special token ids")
|
||||
print("gguf: get special token ids")
|
||||
# Look for special tokens in config.json
|
||||
|
||||
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_config = json.load(f)
|
||||
if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
|
||||
gguf_writer.add_bos_token_id(hparams["bos_token_id"])
|
||||
|
||||
# find special token ids
|
||||
if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
|
||||
gguf_writer.add_eos_token_id(hparams["eos_token_id"])
|
||||
|
||||
if "bos_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["bos_token"]:
|
||||
gguf_writer.add_bos_token_id(key["id"])
|
||||
if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
|
||||
gguf_writer.add_unk_token_id(hparams["unk_token_id"])
|
||||
|
||||
if "eos_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["eos_token"]:
|
||||
gguf_writer.add_eos_token_id(key["id"])
|
||||
if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
|
||||
gguf_writer.add_sep_token_id(hparams["sep_token_id"])
|
||||
|
||||
if "unk_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["unk_token"]:
|
||||
gguf_writer.add_unk_token_id(key["id"])
|
||||
|
||||
if "sep_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["sep_token"]:
|
||||
gguf_writer.add_sep_token_id(key["id"])
|
||||
|
||||
if "pad_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["pad_token"]:
|
||||
gguf_writer.add_pad_token_id(key["id"])
|
||||
if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
|
||||
gguf_writer.add_pad_token_id(hparams["pad_token_id"])
|
||||
|
||||
|
||||
# TENSORS
|
||||
@@ -195,8 +190,9 @@ if Path(dir_model + "/tokenizer.json").is_file():
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# params for qkv transform
|
||||
n_head = hparams["n_head"]
|
||||
n_head = hparams["n_head"]
|
||||
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
|
||||
|
||||
head_dim = hparams["hidden_size"] // n_head
|
||||
|
||||
# tensor info
|
||||
|
||||
66
convert.py
66
convert.py
@@ -104,8 +104,13 @@ class Params:
|
||||
n_head_kv: int
|
||||
f_norm_eps: float
|
||||
|
||||
f_rope_freq_base: Optional[float] = None
|
||||
|
||||
ftype: Optional[GGMLFileType] = None
|
||||
|
||||
# path to the directory containing the model files
|
||||
path_model: Optional['Path'] = None
|
||||
|
||||
@staticmethod
|
||||
def find_n_mult(n_ff: int, n_embd: int) -> int:
|
||||
# hardcoded magic range
|
||||
@@ -191,15 +196,26 @@ class Params:
|
||||
def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
|
||||
config = json.load(open(config_path))
|
||||
|
||||
n_vocab = config["vocab_size"]
|
||||
n_embd = config["dim"]
|
||||
n_layer = config["n_layers"]
|
||||
n_mult = config["multiple_of"]
|
||||
n_ctx = 2048 if config["norm_eps"] == 1e-06 else 4096 # hack to determine LLaMA v1 vs v2
|
||||
n_ff = -1
|
||||
n_head = config["n_heads"]
|
||||
n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head
|
||||
f_norm_eps = config["norm_eps"]
|
||||
n_vocab = config["vocab_size"] if "vocab_size" in config else -1
|
||||
n_embd = config["dim"]
|
||||
n_layer = config["n_layers"]
|
||||
n_mult = config["multiple_of"]
|
||||
n_ff = -1
|
||||
n_head = config["n_heads"]
|
||||
n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head
|
||||
f_norm_eps = config["norm_eps"]
|
||||
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
|
||||
|
||||
# hack to determine LLaMA v1 vs v2 vs CodeLlama
|
||||
if f_rope_freq_base and f_rope_freq_base == 1000000:
|
||||
# CodeLlama
|
||||
n_ctx = 16384
|
||||
elif config["norm_eps"] == 1e-05:
|
||||
# LLaMA v2
|
||||
n_ctx = 4096
|
||||
else:
|
||||
# LLaMA v1
|
||||
n_ctx = 2048
|
||||
|
||||
if n_vocab == -1:
|
||||
n_vocab = model["tok_embeddings.weight"].shape[0]
|
||||
@@ -208,15 +224,16 @@ class Params:
|
||||
n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
|
||||
|
||||
return Params(
|
||||
n_vocab = n_vocab,
|
||||
n_embd = n_embd,
|
||||
n_mult = n_mult,
|
||||
n_layer = n_layer,
|
||||
n_ctx = n_ctx,
|
||||
n_ff = n_ff,
|
||||
n_head = n_head,
|
||||
n_head_kv = n_head_kv,
|
||||
f_norm_eps = f_norm_eps,
|
||||
n_vocab = n_vocab,
|
||||
n_embd = n_embd,
|
||||
n_mult = n_mult,
|
||||
n_layer = n_layer,
|
||||
n_ctx = n_ctx,
|
||||
n_ff = n_ff,
|
||||
n_head = n_head,
|
||||
n_head_kv = n_head_kv,
|
||||
f_norm_eps = f_norm_eps,
|
||||
f_rope_freq_base = f_rope_freq_base,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
@@ -231,6 +248,8 @@ class Params:
|
||||
else:
|
||||
params = Params.guessed(model_plus.model)
|
||||
|
||||
params.path_model = model_plus.paths[0].parent
|
||||
|
||||
return params
|
||||
|
||||
|
||||
@@ -733,7 +752,13 @@ class OutputFile:
|
||||
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
def add_meta_arch(self, params: Params) -> None:
|
||||
self.gguf.add_name ("LLaMA")
|
||||
name = "LLaMA"
|
||||
if (params.n_ctx == 4096):
|
||||
name = "LLaMA v2"
|
||||
if params.path_model:
|
||||
name = str(params.path_model.parent).split('/')[-1]
|
||||
|
||||
self.gguf.add_name (name)
|
||||
self.gguf.add_context_length (params.n_ctx)
|
||||
self.gguf.add_embedding_length (params.n_embd)
|
||||
self.gguf.add_block_count (params.n_layer)
|
||||
@@ -743,6 +768,9 @@ class OutputFile:
|
||||
self.gguf.add_head_count_kv (params.n_head_kv)
|
||||
self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)
|
||||
|
||||
if params.f_rope_freq_base:
|
||||
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
|
||||
|
||||
if params.ftype:
|
||||
self.gguf.add_file_type(params.ftype)
|
||||
|
||||
|
||||
@@ -557,7 +557,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
|
||||
// }
|
||||
// llama_free(lctx);
|
||||
// llama_free_model(lmodel);
|
||||
// } else
|
||||
// } else
|
||||
{ // assume llama2.c vocabulary
|
||||
printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
|
||||
llama_file file(filename, "rb");
|
||||
|
||||
@@ -43,7 +43,7 @@ static bool is_interacting = false;
|
||||
void sigint_handler(int signo) {
|
||||
if (signo == SIGINT) {
|
||||
if (!is_interacting) {
|
||||
is_interacting=true;
|
||||
is_interacting = true;
|
||||
} else {
|
||||
console::cleanup();
|
||||
printf("\n");
|
||||
@@ -189,10 +189,12 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> embd_inp;
|
||||
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
||||
embd_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
|
||||
} else {
|
||||
embd_inp = session_tokens;
|
||||
}
|
||||
@@ -208,9 +210,9 @@ int main(int argc, char ** argv) {
|
||||
int original_prompt_len = 0;
|
||||
if (ctx_guidance) {
|
||||
params.cfg_negative_prompt.insert(0, 1, ' ');
|
||||
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true);
|
||||
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm);
|
||||
|
||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
|
||||
original_prompt_len = original_inp.size();
|
||||
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
||||
}
|
||||
@@ -257,8 +259,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// prefix & suffix for instruct mode
|
||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
|
||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
|
||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", is_spm);
|
||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
|
||||
|
||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||
if (params.instruct) {
|
||||
@@ -796,7 +798,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
|
||||
if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
|
||||
// We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
|
||||
if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
|
||||
n_remain = params.n_predict;
|
||||
is_interacting = true;
|
||||
}
|
||||
|
||||
@@ -28,7 +28,6 @@ std::vector<float> softmax(const std::vector<float>& logits) {
|
||||
}
|
||||
|
||||
void perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
// Output: `perplexity: 13.5106 [114/114]`
|
||||
@@ -38,7 +37,13 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
|
||||
return;
|
||||
}
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||
const bool add_bos = is_spm;
|
||||
|
||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
|
||||
const int calc_chunk = params.n_ctx;
|
||||
|
||||
@@ -86,7 +91,7 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
const auto token_org = tokens[batch_start];
|
||||
|
||||
// add BOS token for the first batch of each chunk
|
||||
if (j == 0) {
|
||||
if (add_bos && j == 0) {
|
||||
tokens[batch_start] = llama_token_bos(ctx);
|
||||
}
|
||||
|
||||
@@ -136,7 +141,6 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
}
|
||||
|
||||
void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
if (params.ppl_stride > 0) {
|
||||
perplexity_v2(ctx, params);
|
||||
return;
|
||||
@@ -146,7 +150,13 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
// Output: `perplexity: 13.5106 [114/114]`
|
||||
// BOS tokens will be added for each chunk before eval
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||
const bool add_bos = is_spm;
|
||||
|
||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
|
||||
const int n_chunk_max = tokens.size() / params.n_ctx;
|
||||
|
||||
@@ -177,7 +187,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
const auto token_org = tokens[batch_start];
|
||||
|
||||
// add BOS token for the first batch of each chunk
|
||||
if (j == 0) {
|
||||
if (add_bos && j == 0) {
|
||||
tokens[batch_start] = llama_token_bos(ctx);
|
||||
}
|
||||
|
||||
@@ -295,8 +305,10 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
size_t hs_task_count = prompt_lines.size()/6;
|
||||
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
|
||||
|
||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||
|
||||
// This is needed as usual for LLaMA models
|
||||
bool prepend_bos = true;
|
||||
const bool add_bos = is_spm;
|
||||
|
||||
// Number of tasks to use when computing the score
|
||||
if ( params.hellaswag_tasks < hs_task_count ) {
|
||||
@@ -352,14 +364,13 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
std::vector<float> tok_logits(n_vocab);
|
||||
|
||||
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
|
||||
|
||||
// Tokenize the context to count tokens
|
||||
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
|
||||
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
|
||||
size_t context_size = context_embd.size();
|
||||
|
||||
// Do the 1st ending
|
||||
// In this case we include the context when evaluating
|
||||
auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], prepend_bos);
|
||||
auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
|
||||
auto query_size = query_embd.size();
|
||||
//printf("First query: %d\n",(int)query_size);
|
||||
|
||||
|
||||
143
ggml-alloc.c
143
ggml-alloc.c
@@ -68,7 +68,7 @@ struct ggml_allocr {
|
||||
size_t max_size;
|
||||
bool measure;
|
||||
int parse_seq[GGML_MAX_NODES];
|
||||
bool has_parse_seq;
|
||||
int parse_seq_len;
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
struct ggml_tensor * allocated_tensors[1024];
|
||||
@@ -238,15 +238,11 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
|
||||
alloc->n_free_blocks++;
|
||||
}
|
||||
|
||||
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
|
||||
int pos = 0;
|
||||
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
if (list[i] != -1) {
|
||||
alloc->parse_seq[pos] = list[i];
|
||||
pos++;
|
||||
}
|
||||
alloc->parse_seq[i] = list[i];
|
||||
}
|
||||
alloc->has_parse_seq = true;
|
||||
alloc->parse_seq_len = n;
|
||||
}
|
||||
|
||||
void ggml_allocr_reset(struct ggml_allocr * alloc) {
|
||||
@@ -269,7 +265,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
|
||||
/*.max_size = */ 0,
|
||||
/*.measure = */ false,
|
||||
/*.parse_seq = */ {0},
|
||||
/*.has_parse_seq = */ false,
|
||||
/*.parse_seq_len = */ 0,
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
/*.allocated_tensors = */ = {0},
|
||||
#endif
|
||||
@@ -298,7 +294,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
||||
/*.max_size = */ 0,
|
||||
/*.measure = */ true,
|
||||
/*.parse_seq = */ {0},
|
||||
/*.has_parse_seq = */ false,
|
||||
/*.parse_seq_len = */ 0,
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
/*.allocated_tensors = */ = {0},
|
||||
#endif
|
||||
@@ -445,8 +441,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
||||
else {
|
||||
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
||||
node->data = parent->data;
|
||||
return;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -497,69 +493,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
|
||||
allocate_node(alloc, input);
|
||||
}
|
||||
}
|
||||
for (int ind = 0; ind < gf->n_nodes; ind++) {
|
||||
int i;
|
||||
if (alloc->has_parse_seq) {
|
||||
i = alloc->parse_seq[ind];
|
||||
} else {
|
||||
i = ind;
|
||||
}
|
||||
struct ggml_tensor * node = gf->nodes[i];
|
||||
// if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
|
||||
int last_barrier_pos = 0;
|
||||
int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
|
||||
|
||||
// allocate parents (leafs)
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
for (int ind = 0; ind < n_nodes; ind++) {
|
||||
// allocate a node if there is no parse_seq or this is not a barrier
|
||||
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
|
||||
int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
|
||||
struct ggml_tensor * node = gf->nodes[i];
|
||||
|
||||
// allocate parents (leafs)
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
allocate_node(alloc, parent);
|
||||
}
|
||||
allocate_node(alloc, parent);
|
||||
|
||||
// allocate node
|
||||
allocate_node(alloc, node);
|
||||
|
||||
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
AT_PRINTF("%s", parent->name);
|
||||
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
|
||||
AT_PRINTF(", ");
|
||||
}
|
||||
}
|
||||
AT_PRINTF("\n");
|
||||
}
|
||||
|
||||
// allocate node
|
||||
allocate_node(alloc, node);
|
||||
|
||||
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
AT_PRINTF("%s", parent->name);
|
||||
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
|
||||
AT_PRINTF(", ");
|
||||
}
|
||||
}
|
||||
AT_PRINTF("\n");
|
||||
|
||||
// update parents
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
// update immediately if there is no parse_seq
|
||||
// update only at barriers if there is parse_seq
|
||||
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
|
||||
int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
|
||||
int update_end = alloc->parse_seq_len ? ind : ind + 1;
|
||||
for (int i = update_start; i < update_end; i++) {
|
||||
int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
|
||||
struct ggml_tensor * node = gf->nodes[node_i];
|
||||
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
struct hash_node * p_hn = hash_get(ht, parent);
|
||||
p_hn->n_children -= 1;
|
||||
|
||||
//AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
|
||||
|
||||
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
|
||||
if (ggml_is_view(parent)) {
|
||||
struct ggml_tensor * view_src = get_view_source(parent);
|
||||
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
||||
view_src_hn->n_views -= 1;
|
||||
AT_PRINTF("view_src %s\n", view_src->name);
|
||||
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
|
||||
ggml_allocator_free_tensor(alloc, view_src);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (parent->data != node->data) {
|
||||
ggml_allocator_free_tensor(alloc, parent);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
struct hash_node * p_hn = hash_get(ht, parent);
|
||||
p_hn->n_children -= 1;
|
||||
|
||||
//AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
|
||||
|
||||
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
|
||||
if (ggml_is_view(parent)) {
|
||||
struct ggml_tensor * view_src = get_view_source(parent);
|
||||
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
||||
view_src_hn->n_views -= 1;
|
||||
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
|
||||
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
|
||||
ggml_allocator_free_tensor(alloc, view_src);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (parent->data != node->data) {
|
||||
ggml_allocator_free_tensor(alloc, parent);
|
||||
}
|
||||
}
|
||||
AT_PRINTF("\n");
|
||||
if (alloc->parse_seq_len) {
|
||||
last_barrier_pos = ind + 1;
|
||||
}
|
||||
}
|
||||
AT_PRINTF("\n");
|
||||
}
|
||||
// free graph outputs here that wouldn't be freed otherwise because they have no children
|
||||
if (outputs != NULL && outputs[g] != NULL) {
|
||||
|
||||
@@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
|
||||
|
||||
// tell the allocator to parse nodes following the order described in the list
|
||||
// you should call this if your graph are optimized to execute out-of-order
|
||||
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
|
||||
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
|
||||
|
||||
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
|
||||
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
|
||||
|
||||
29
ggml-cuda.cu
29
ggml-cuda.cu
@@ -3907,6 +3907,29 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
|
||||
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
|
||||
// TODO: this implementation is wrong!
|
||||
//static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
|
||||
// const float p_delta, const int p_delta_rows, const float theta_scale) {
|
||||
// const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||
//
|
||||
// if (col >= ncols) {
|
||||
// return;
|
||||
// }
|
||||
//
|
||||
// const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
// const int i = row*ncols + col/2;
|
||||
//
|
||||
// const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
|
||||
// const float sin_theta = sinf(theta);
|
||||
// const float cos_theta = cosf(theta);
|
||||
//
|
||||
// const float x0 = x[i + 0];
|
||||
// const float x1 = x[i + ncols/2];
|
||||
//
|
||||
// dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||
// dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
|
||||
//}
|
||||
|
||||
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
|
||||
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
const int half_n_dims = ncols/4;
|
||||
@@ -5515,7 +5538,8 @@ inline void ggml_cuda_op_rope(
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
|
||||
const bool is_glm = mode & 4;
|
||||
const bool is_neox = mode & 2;
|
||||
const bool is_glm = mode & 4;
|
||||
|
||||
// compute
|
||||
if (is_glm) {
|
||||
@@ -5523,6 +5547,9 @@ inline void ggml_cuda_op_rope(
|
||||
const float id_p = min(p, n_ctx - 2.f);
|
||||
const float block_p = max(p - (n_ctx - 2.f), 0.f);
|
||||
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
|
||||
} else if (is_neox) {
|
||||
GGML_ASSERT(false && "RoPE NeoX not implemented yet");
|
||||
#pragma message("TODO: implement RoPE NeoX for CUDA")
|
||||
} else {
|
||||
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
||||
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
||||
|
||||
153
ggml-metal.m
153
ggml-metal.m
@@ -63,6 +63,7 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(get_rows_f16);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q8_0);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q2_K);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q3_K);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q4_K);
|
||||
@@ -73,6 +74,7 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
|
||||
@@ -81,6 +83,7 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
|
||||
@@ -167,7 +170,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
#define GGML_METAL_ADD_KERNEL(name) \
|
||||
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
|
||||
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
|
||||
fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); \
|
||||
fprintf(stderr, "%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
|
||||
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
|
||||
(int) ctx->pipeline_##name.threadExecutionWidth); \
|
||||
if (error) { \
|
||||
fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
|
||||
return NULL; \
|
||||
@@ -186,6 +191,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(get_rows_f16);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q8_0);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q2_K);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q3_K);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q4_K);
|
||||
@@ -196,6 +202,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
|
||||
@@ -203,6 +210,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
|
||||
@@ -218,12 +226,12 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
#undef GGML_METAL_ADD_KERNEL
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
if (ctx->device.maxTransferRate != 0) {
|
||||
fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
||||
fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
||||
} else {
|
||||
fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
|
||||
fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
|
||||
}
|
||||
|
||||
return ctx;
|
||||
@@ -537,8 +545,8 @@ void ggml_metal_graph_compute(
|
||||
|
||||
id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
|
||||
const int node_start = (cb_idx + 0) * n_nodes_per_cb;
|
||||
const int node_end = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
|
||||
const int node_start = (cb_idx + 0) * n_nodes_per_cb;
|
||||
const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
|
||||
|
||||
for (int ind = node_start; ind < node_end; ++ind) {
|
||||
const int i = has_concur ? ctx->concur_list[ind] : ind;
|
||||
@@ -744,32 +752,32 @@ void ggml_metal_graph_compute(
|
||||
[ctx->device supportsFamily:MTLGPUFamilyApple7] &&
|
||||
ne00%32 == 0 &&
|
||||
ne11 > 1) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
|
||||
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
|
||||
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
|
||||
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
|
||||
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
|
||||
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
|
||||
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
|
||||
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
|
||||
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
|
||||
}
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
|
||||
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
|
||||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
|
||||
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
|
||||
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
|
||||
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
|
||||
case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q8_0_f32]; break;
|
||||
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
|
||||
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
|
||||
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
|
||||
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
|
||||
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
|
||||
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
|
||||
}
|
||||
else {
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
|
||||
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
|
||||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
|
||||
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
} else {
|
||||
int nth0 = 32;
|
||||
int nth1 = 1;
|
||||
|
||||
@@ -799,6 +807,15 @@ void ggml_metal_graph_compute(
|
||||
nth1 = 8;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
{
|
||||
GGML_ASSERT(ne02 == 1);
|
||||
GGML_ASSERT(ne12 == 1);
|
||||
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q8_0_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
{
|
||||
GGML_ASSERT(ne02 == 1);
|
||||
@@ -868,24 +885,24 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
|
||||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
|
||||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
|
||||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q3_K) {
|
||||
#ifdef GGML_QKK_64
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
#else
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
#endif
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q5_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q6_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
} else {
|
||||
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
@@ -895,9 +912,10 @@ void ggml_metal_graph_compute(
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
|
||||
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
|
||||
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
|
||||
case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q8_0]; break;
|
||||
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
|
||||
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
|
||||
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
|
||||
@@ -938,16 +956,17 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
case GGML_OP_NORM:
|
||||
{
|
||||
const float eps = 1e-5f;
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
const int nth = 256;
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_norm];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
|
||||
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
|
||||
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
|
||||
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
|
||||
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
@@ -990,7 +1009,9 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&m0 length:sizeof( float) atIndex:18];
|
||||
|
||||
const int nth = 32;
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_ROPE:
|
||||
@@ -1005,8 +1026,8 @@ void ggml_metal_graph_compute(
|
||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_rope];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
||||
@@ -1057,24 +1078,24 @@ void ggml_metal_graph_compute(
|
||||
default: GGML_ASSERT(false && "not implemented");
|
||||
}
|
||||
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
|
||||
120
ggml-metal.metal
120
ggml-metal.metal
@@ -18,6 +18,12 @@ typedef struct {
|
||||
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
||||
} block_q4_1;
|
||||
|
||||
#define QK8_0 32
|
||||
typedef struct {
|
||||
half d; // delta
|
||||
int8_t qs[QK8_0]; // quants
|
||||
} block_q8_0;
|
||||
|
||||
kernel void kernel_add(
|
||||
device const float * src0,
|
||||
device const float * src1,
|
||||
@@ -87,7 +93,12 @@ kernel void kernel_gelu(
|
||||
device float * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
float x = src0[tpig];
|
||||
dst[tpig] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
||||
|
||||
// BEWARE !!!
|
||||
// Simply using "tanh" instead of "precise::tanh" will sometimes results in NaNs!
|
||||
// This was observed with Falcon 7B and 40B models
|
||||
//
|
||||
dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
||||
}
|
||||
|
||||
kernel void kernel_soft_max(
|
||||
@@ -352,7 +363,7 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
|
||||
const int first_row = (r0 * nsg + sgitg) * nr;
|
||||
const uint offset0 = first_row * nb + im/gqa*(nb*ne0);
|
||||
device const block_q_type * x = (device const block_q_type *) src0 + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
float yl[16]; // src1 vector cache
|
||||
float sumf[nr]={0.f};
|
||||
|
||||
@@ -424,6 +435,68 @@ kernel void kernel_mul_mat_q4_1_f32(
|
||||
mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_q8_0_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
constant int64_t & ne02[[buffer(5)]],
|
||||
constant int64_t & ne10[[buffer(9)]],
|
||||
constant int64_t & ne12[[buffer(11)]],
|
||||
constant int64_t & ne0[[buffer(15)]],
|
||||
constant int64_t & ne1[[buffer(16)]],
|
||||
constant uint & gqa[[buffer(17)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
const int nr = N_DST;
|
||||
const int nsg = N_SIMDGROUP;
|
||||
const int nw = N_SIMDWIDTH;
|
||||
|
||||
const int nb = ne00/QK8_0;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
const int first_row = (r0 * nsg + sgitg) * nr;
|
||||
const uint offset0 = first_row * nb + im/gqa*(nb*ne0);
|
||||
device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float yl[16];
|
||||
float sumf[nr]={0.f};
|
||||
|
||||
const int ix = tiisg/2;
|
||||
const int il = tiisg%2;
|
||||
|
||||
device const float * yb = y + ix * QK8_0 + 16*il;
|
||||
|
||||
// each thread in a SIMD group deals with half a block.
|
||||
for (int ib = ix; ib < nb; ib += nw/2) {
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
yl[i] = yb[i];
|
||||
}
|
||||
|
||||
for (int row = 0; row < nr; row++) {
|
||||
device const int8_t * qs = x[ib+row*nb].qs + 16*il;
|
||||
float sumq = 0.f;
|
||||
for (int iq = 0; iq < 16; ++iq) {
|
||||
sumq += qs[iq] * yl[iq];
|
||||
}
|
||||
sumf[row] += sumq*x[ib+row*nb].d;
|
||||
}
|
||||
|
||||
yb += QK8_0 * 16;
|
||||
}
|
||||
|
||||
for (int row = 0; row < nr; ++row) {
|
||||
const float tot = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && first_row + row < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_f16_f32(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
@@ -475,7 +548,6 @@ kernel void kernel_mul_mat_f16_f32(
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
kernel void kernel_alibi_f32(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
@@ -571,7 +643,25 @@ kernel void kernel_rope(
|
||||
dst_data[1] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
} else {
|
||||
// TODO: implement
|
||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
||||
const float cos_theta = cos(theta);
|
||||
const float sin_theta = sin(theta);
|
||||
|
||||
theta *= theta_scale;
|
||||
|
||||
const int64_t i0 = ib*n_dims + ic/2;
|
||||
|
||||
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[n_dims/2];
|
||||
|
||||
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1598,12 +1688,12 @@ template <typename type4x4>
|
||||
void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 1);
|
||||
const half d = il ? (xb->d / 16.h) : xb->d;
|
||||
const half m = il ? (-8.h * 16.h) : -8.h;
|
||||
const half m = il ? ( -8.h * 16.h) : -8.h;
|
||||
const ushort mask0 = il ? 0x00F0 : 0x000F;
|
||||
const ushort mask1 = il ? 0xF000 : 0x0F00;
|
||||
|
||||
for (int i=0;i<8;i++) {
|
||||
reg[i/2][2*(i%2)] = (((qs[i] & mask0)) + m) * d;
|
||||
reg[i/2][2*(i%2)] = (((qs[i] & mask0) ) + m) * d;
|
||||
reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) + m) * d;
|
||||
}
|
||||
}
|
||||
@@ -1617,11 +1707,21 @@ void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg
|
||||
const ushort mask1 = il ? 0xF000 : 0x0F00;
|
||||
|
||||
for (int i=0;i<8;i++) {
|
||||
reg[i/2][2*(i%2)] = (((qs[i] & mask0)) * d) + m;
|
||||
reg[i/2][2*(i%2)] = (((qs[i] & mask0) ) * d) + m;
|
||||
reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) * d) + m;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
|
||||
device const int8_t * qs = ((device const int8_t *)xb->qs);
|
||||
const half d = xb->d;
|
||||
|
||||
for (int i=0;i<16;i++) {
|
||||
reg[i/4][i%4] = (qs[i + 16*il] * d);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
|
||||
const half d = xb->d;
|
||||
@@ -1924,9 +2024,10 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \
|
||||
constant uint64_t &, constant uint64_t &, uint, uint, uint);
|
||||
|
||||
template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows<half4x4, 1, dequantize_f16>;
|
||||
template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows<half4x4, 1, dequantize_f16>;
|
||||
template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_t kernel_get_rows<block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_t kernel_get_rows<block_q2_K, QK_NL, dequantize_q2_K>;
|
||||
template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows<block_q3_K, QK_NL, dequantize_q3_K>;
|
||||
template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows<block_q4_K, QK_NL, dequantize_q4_K>;
|
||||
@@ -1937,9 +2038,10 @@ typedef void (mat_mm_t)(device const uchar *, device const float *, device float
|
||||
constant int64_t &, constant int64_t &, constant int64_t &, constant int64_t &, \
|
||||
constant int64_t &, constant int64_t &, constant uint &, threadgroup uchar *, uint3, uint, uint);
|
||||
|
||||
template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>;
|
||||
template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>;
|
||||
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q2_K, QK_NL, dequantize_q2_K>;
|
||||
template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q3_K, QK_NL, dequantize_q3_K>;
|
||||
template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_K, QK_NL, dequantize_q4_K>;
|
||||
|
||||
30
ggml.c
30
ggml.c
@@ -3554,9 +3554,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
|
||||
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
|
||||
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
|
||||
|
||||
static const float GELU_COEF_A = 0.044715f;
|
||||
static const float GELU_QUICK_COEF = -1.702f;
|
||||
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||
static const float GELU_COEF_A = 0.044715f;
|
||||
static const float GELU_QUICK_COEF = -1.702f;
|
||||
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||
|
||||
inline static float ggml_gelu_f32(float x) {
|
||||
return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
||||
@@ -5555,10 +5555,6 @@ struct ggml_tensor * ggml_repeat(
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
if (ggml_are_same_shape(a, b) && !is_node) {
|
||||
return a;
|
||||
}
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
|
||||
|
||||
result->op = GGML_OP_REPEAT;
|
||||
@@ -5789,6 +5785,7 @@ struct ggml_tensor * ggml_silu_back(
|
||||
static struct ggml_tensor * ggml_norm_impl(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
float eps,
|
||||
bool inplace) {
|
||||
bool is_node = false;
|
||||
|
||||
@@ -5799,7 +5796,7 @@ static struct ggml_tensor * ggml_norm_impl(
|
||||
|
||||
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||
|
||||
// TODO: maybe store epsilon here?
|
||||
ggml_set_op_params(result, &eps, sizeof(eps));
|
||||
|
||||
result->op = GGML_OP_NORM;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
@@ -5810,14 +5807,16 @@ static struct ggml_tensor * ggml_norm_impl(
|
||||
|
||||
struct ggml_tensor * ggml_norm(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a) {
|
||||
return ggml_norm_impl(ctx, a, false);
|
||||
struct ggml_tensor * a,
|
||||
float eps) {
|
||||
return ggml_norm_impl(ctx, a, eps, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_norm_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a) {
|
||||
return ggml_norm_impl(ctx, a, true);
|
||||
struct ggml_tensor * a,
|
||||
float eps) {
|
||||
return ggml_norm_impl(ctx, a, eps, true);
|
||||
}
|
||||
|
||||
// ggml_rms_norm
|
||||
@@ -10619,7 +10618,8 @@ static void ggml_compute_forward_norm_f32(
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS;
|
||||
|
||||
const float eps = 1e-5f; // TODO: make this a parameter
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
// TODO: optimize
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
@@ -12537,7 +12537,7 @@ static void ggml_compute_forward_rope_f32(
|
||||
dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
|
||||
}
|
||||
} else {
|
||||
// TODO: this is probably wrong, but I can't figure it out ..
|
||||
// TODO: this might be wrong for ne0 != n_dims - need double check
|
||||
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
|
||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
||||
@@ -12666,7 +12666,7 @@ static void ggml_compute_forward_rope_f16(
|
||||
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
||||
}
|
||||
} else {
|
||||
// TODO: this is probably wrong, but I can't figure it out ..
|
||||
// TODO: this might be wrong for ne0 != n_dims - need double check
|
||||
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
|
||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
||||
|
||||
7
ggml.h
7
ggml.h
@@ -909,14 +909,15 @@ extern "C" {
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// normalize along rows
|
||||
// TODO: eps is hardcoded to 1e-5 for now
|
||||
GGML_API struct ggml_tensor * ggml_norm(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
struct ggml_tensor * a,
|
||||
float eps);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_norm_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
struct ggml_tensor * a,
|
||||
float eps);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_rms_norm(
|
||||
struct ggml_context * ctx,
|
||||
|
||||
21
gguf-py/LICENSE
Normal file
21
gguf-py/LICENSE
Normal file
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2023 Georgi Gerganov
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
55
gguf-py/README.md
Normal file
55
gguf-py/README.md
Normal file
@@ -0,0 +1,55 @@
|
||||
## gguf
|
||||
|
||||
This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302)
|
||||
(GGML Universal File) format.
|
||||
|
||||
See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py)
|
||||
as an example for its usage.
|
||||
|
||||
## Installation
|
||||
```sh
|
||||
pip install gguf
|
||||
```
|
||||
|
||||
## Development
|
||||
Maintainers who participate in development of this package are advised to install it in editable mode:
|
||||
|
||||
```sh
|
||||
cd /path/to/llama.cpp/gguf-py
|
||||
|
||||
pip install --editable .
|
||||
```
|
||||
|
||||
**Note**: This may require to upgrade your Pip installation, with a message saying that editable installation currently requires `setup.py`.
|
||||
In this case, upgrade Pip to the latest:
|
||||
|
||||
```sh
|
||||
pip install --upgrade pip
|
||||
```
|
||||
|
||||
## Publishing
|
||||
To publish the package, you need to have `twine` and `build` installed:
|
||||
|
||||
```sh
|
||||
pip install build twine
|
||||
```
|
||||
|
||||
Then, folow these steps to release a new version:
|
||||
|
||||
1. Update the version in `pyproject.toml`.
|
||||
2. Build the package:
|
||||
|
||||
```sh
|
||||
python -m build
|
||||
```
|
||||
|
||||
3. Upload the generated distribution archives:
|
||||
|
||||
```sh
|
||||
python -m twine upload dist/*
|
||||
```
|
||||
|
||||
## TODO
|
||||
- [ ] Add tests
|
||||
- [ ] Include conversion scripts as command line entry points in this package.
|
||||
- Add CI workflow for releasing the package.
|
||||
1
gguf-py/gguf/__init__.py
Normal file
1
gguf-py/gguf/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .gguf import GGUFWriter
|
||||
32
gguf.py → gguf-py/gguf/gguf.py
Executable file → Normal file
32
gguf.py → gguf-py/gguf/gguf.py
Executable file → Normal file
@@ -30,12 +30,12 @@ KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
|
||||
KEY_GENERAL_FILE_TYPE = "general.file_type"
|
||||
|
||||
# LLM
|
||||
KEY_LLM_CONTEXT_LENGTH = "{arch}.context_length"
|
||||
KEY_LLM_EMBEDDING_LENGTH = "{arch}.embedding_length"
|
||||
KEY_LLM_BLOCK_COUNT = "{arch}.block_count"
|
||||
KEY_LLM_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
|
||||
KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
|
||||
KEY_LLM_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
||||
KEY_CONTEXT_LENGTH = "{arch}.context_length"
|
||||
KEY_EMBEDDING_LENGTH = "{arch}.embedding_length"
|
||||
KEY_BLOCK_COUNT = "{arch}.block_count"
|
||||
KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
|
||||
KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
|
||||
KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
||||
|
||||
# attention
|
||||
KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
|
||||
@@ -47,6 +47,7 @@ KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
|
||||
|
||||
# RoPE
|
||||
KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
||||
KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base"
|
||||
KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear"
|
||||
|
||||
# tokenization
|
||||
@@ -583,7 +584,7 @@ class GGUFWriter:
|
||||
self.add_string(KEY_GENERAL_AUTHOR, author)
|
||||
|
||||
def add_tensor_data_layout(self, layout: str):
|
||||
self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
|
||||
self.add_string(KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
|
||||
|
||||
def add_url(self, url: str):
|
||||
self.add_string(KEY_GENERAL_URL, url)
|
||||
@@ -613,27 +614,27 @@ class GGUFWriter:
|
||||
|
||||
def add_context_length(self, length: int):
|
||||
self.add_uint32(
|
||||
KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length)
|
||||
KEY_CONTEXT_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
def add_embedding_length(self, length: int):
|
||||
self.add_uint32(
|
||||
KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length)
|
||||
KEY_EMBEDDING_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
def add_block_count(self, length: int):
|
||||
self.add_uint32(
|
||||
KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length)
|
||||
KEY_BLOCK_COUNT.format(arch=self.arch), length)
|
||||
|
||||
def add_feed_forward_length(self, length: int):
|
||||
self.add_uint32(
|
||||
KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
|
||||
KEY_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
def add_parallel_residual(self, use: bool):
|
||||
self.add_bool(
|
||||
KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
|
||||
KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
|
||||
|
||||
def add_tensor_data_layout(self, layout: str):
|
||||
self.add_string(
|
||||
KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
|
||||
KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
|
||||
|
||||
def add_head_count(self, count: int):
|
||||
self.add_uint32(
|
||||
@@ -663,7 +664,10 @@ class GGUFWriter:
|
||||
self.add_uint32(
|
||||
KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)
|
||||
|
||||
def add_rope_scale_linear(self, value: float):
|
||||
def add_rope_freq_base(self, value: float):
|
||||
self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value)
|
||||
|
||||
def add_rope_scale_linear(self, value: float):
|
||||
self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
|
||||
|
||||
def add_tokenizer_model(self, model: str):
|
||||
28
gguf-py/pyproject.toml
Normal file
28
gguf-py/pyproject.toml
Normal file
@@ -0,0 +1,28 @@
|
||||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.2.0"
|
||||
description = "Write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
{include = "gguf"},
|
||||
]
|
||||
readme = "README.md"
|
||||
homepage = "https://ggml.ai"
|
||||
repository = "https://github.com/ggerganov/llama.cpp"
|
||||
keywords = ["ggml", "gguf", "llama.cpp"]
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
]
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = ">=3.8"
|
||||
numpy = ">=1.17"
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
pytest = "^5.2"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core>=1.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
7
gguf-py/tests/test_gguf.py
Normal file
7
gguf-py/tests/test_gguf.py
Normal file
@@ -0,0 +1,7 @@
|
||||
import gguf
|
||||
|
||||
# TODO: add tests
|
||||
|
||||
|
||||
def test_write_gguf():
|
||||
pass
|
||||
17
llama.h
17
llama.h
@@ -247,6 +247,8 @@ extern "C" {
|
||||
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
|
||||
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
|
||||
LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
|
||||
LLAMA_API int llama_model_n_embd (const struct llama_model * model);
|
||||
@@ -346,7 +348,7 @@ extern "C" {
|
||||
|
||||
LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
|
||||
|
||||
LLAMA_API llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
|
||||
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
|
||||
|
||||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
|
||||
@@ -368,13 +370,6 @@ extern "C" {
|
||||
int n_max_tokens,
|
||||
bool add_bos);
|
||||
|
||||
LLAMA_API int llama_tokenize_bpe(
|
||||
struct llama_context * ctx,
|
||||
const char * text,
|
||||
llama_token * tokens,
|
||||
int n_max_tokens,
|
||||
bool add_bos);
|
||||
|
||||
LLAMA_API int llama_tokenize_with_model(
|
||||
const struct llama_model * model,
|
||||
const char * text,
|
||||
@@ -390,12 +385,6 @@ extern "C" {
|
||||
char * buf,
|
||||
int length);
|
||||
|
||||
LLAMA_API int llama_token_to_str_bpe(
|
||||
const struct llama_context * ctx,
|
||||
llama_token token,
|
||||
char * buf,
|
||||
int length);
|
||||
|
||||
LLAMA_API int llama_token_to_str_with_model(
|
||||
const struct llama_model * model,
|
||||
llama_token token,
|
||||
|
||||
@@ -1,2 +1,3 @@
|
||||
numpy==1.24
|
||||
sentencepiece==0.1.98
|
||||
gguf>=0.1.0
|
||||
|
||||
@@ -1,93 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Measure the performance (time per token) of the various quantization techniques
|
||||
#
|
||||
|
||||
QUANTIZE=0
|
||||
if [ "$1" != "" ]; then
|
||||
echo "Quantizing"
|
||||
QUANTIZE=1
|
||||
fi
|
||||
|
||||
if [ "$QUANTIZE" != "0" ]; then
|
||||
#
|
||||
# quantize
|
||||
#
|
||||
|
||||
# 7B
|
||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-7b-q4_0.txt
|
||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-7b-q4_1.txt
|
||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-7b-q5_0.txt
|
||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-7b-q5_1.txt
|
||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-7b-q8_0.txt
|
||||
|
||||
# 13B
|
||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-13b-q4_0.txt
|
||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-13b-q4_1.txt
|
||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-13b-q5_0.txt
|
||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-13b-q5_1.txt
|
||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-13b-q8_0.txt
|
||||
fi
|
||||
|
||||
#
|
||||
# perf
|
||||
# run each command twice
|
||||
#
|
||||
|
||||
set -x
|
||||
|
||||
# 7B - 4 threads
|
||||
./bin/main -m ../models/7B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/7B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-f16.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q4_0.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q4_1.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q5_0.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q5_1.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-7b-q8_0.txt | grep llama_print_timings
|
||||
|
||||
# 7B - 8 threads
|
||||
./bin/main -m ../models/7B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/7B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-f16.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/7B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q4_0.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/7B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q4_1.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/7B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q5_0.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/7B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q5_1.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/7B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-7b-q8_0.txt | grep llama_print_timings
|
||||
|
||||
# 13B - 4 threads
|
||||
./bin/main -m ../models/13B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/13B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-f16.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q4_0.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q4_1.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q5_0.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q5_1.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 4 2>&1 | tee ../perf-13b-q8_0.txt | grep llama_print_timings
|
||||
|
||||
# 13B - 8 threads
|
||||
./bin/main -m ../models/13B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/13B/ggml-model-f16.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-f16.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/13B/ggml-model-q4_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q4_0.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/13B/ggml-model-q4_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q4_1.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/13B/ggml-model-q5_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q5_0.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/13B/ggml-model-q5_1.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q5_1.txt | grep llama_print_timings
|
||||
./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | grep "I believe"
|
||||
time ./bin/main -m ../models/13B/ggml-model-q8_0.bin -p "I believe the meaning of life is" --no-mmap -c 2048 --ignore-eos -s 1 -n 64 -t 8 2>&1 | tee ../perf-13b-q8_0.txt | grep llama_print_timings
|
||||
@@ -1,39 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
#
|
||||
# quantize
|
||||
#
|
||||
|
||||
# 7B
|
||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-7b-q4_0.txt
|
||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-7b-q4_1.txt
|
||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-7b-q5_0.txt
|
||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-7b-q5_1.txt
|
||||
time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-7b-q8_0.txt
|
||||
|
||||
# 13B
|
||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-13b-q4_0.txt
|
||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-13b-q4_1.txt
|
||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-13b-q5_0.txt
|
||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-13b-q5_1.txt
|
||||
time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-13b-q8_0.txt
|
||||
|
||||
#
|
||||
# perplexity
|
||||
#
|
||||
|
||||
# 7B
|
||||
time ./bin/perplexity -m ../models/7B/ggml-model-f16.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-f16.txt
|
||||
time ./bin/perplexity -m ../models/7B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_0.txt
|
||||
time ./bin/perplexity -m ../models/7B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_1.txt
|
||||
time ./bin/perplexity -m ../models/7B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_0.txt
|
||||
time ./bin/perplexity -m ../models/7B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_1.txt
|
||||
time ./bin/perplexity -m ../models/7B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q8_0.txt
|
||||
|
||||
# 13B
|
||||
time ./bin/perplexity -m ../models/13B/ggml-model-f16.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-f16.txt
|
||||
time ./bin/perplexity -m ../models/13B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_0.txt
|
||||
time ./bin/perplexity -m ../models/13B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_1.txt
|
||||
time ./bin/perplexity -m ../models/13B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_0.txt
|
||||
time ./bin/perplexity -m ../models/13B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_1.txt
|
||||
time ./bin/perplexity -m ../models/13B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q8_0.txt
|
||||
27
scripts/qnt-all.sh
Executable file
27
scripts/qnt-all.sh
Executable file
@@ -0,0 +1,27 @@
|
||||
#!/bin/bash
|
||||
|
||||
qnt=(q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k)
|
||||
args=""
|
||||
|
||||
if [ -z "$1" ]; then
|
||||
echo "usage: $0 <model> [qnt] [args]"
|
||||
echo "default: $0 <model> \"${qnt[@]}\" \"${args}\""
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -z "$2" ]; then
|
||||
qnt=($2)
|
||||
fi
|
||||
|
||||
if [ ! -z "$3" ]; then
|
||||
args="$3"
|
||||
fi
|
||||
|
||||
model="$1"
|
||||
out="../tmp/results-${model}"
|
||||
|
||||
mkdir -p ${out}
|
||||
|
||||
for q in ${qnt[@]}; do
|
||||
time ./bin/quantize ../models/${model}/ggml-model-f16.gguf ../models/${model}/ggml-model-${q}.gguf ${q} 2>&1 ${args} | tee ${out}/qnt-${q}.txt
|
||||
done
|
||||
31
scripts/run-all-perf.sh
Executable file
31
scripts/run-all-perf.sh
Executable file
@@ -0,0 +1,31 @@
|
||||
#!/bin/bash
|
||||
|
||||
qnt=(f16 q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k)
|
||||
args="-ngl 999 -n 64 -p 512"
|
||||
|
||||
if [ -z "$1" ]; then
|
||||
echo "usage: $0 <model> [qnt] [args]"
|
||||
echo "default: $0 <model> \"${qnt[@]}\" \"${args}\""
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -z "$2" ]; then
|
||||
qnt=($2)
|
||||
fi
|
||||
|
||||
if [ ! -z "$3" ]; then
|
||||
args="$3"
|
||||
fi
|
||||
|
||||
model="$1"
|
||||
out="../tmp/results-${model}"
|
||||
|
||||
mkdir -p ${out}
|
||||
|
||||
mstr=""
|
||||
|
||||
for q in ${qnt[@]}; do
|
||||
mstr="${mstr} -m ../models/${model}/ggml-model-${q}.gguf"
|
||||
done
|
||||
|
||||
./bin/llama-bench ${mstr} ${args} 2> /dev/null
|
||||
27
scripts/run-all-ppl.sh
Executable file
27
scripts/run-all-ppl.sh
Executable file
@@ -0,0 +1,27 @@
|
||||
#!/bin/bash
|
||||
|
||||
qnt=(f16 q8_0 q6_k q5_k q5_1 q5_0 q4_k q4_1 q4_0 q3_k q2_k)
|
||||
args="-ngl 999 -t 8"
|
||||
|
||||
if [ -z "$1" ]; then
|
||||
echo "usage: $0 <model> [qnt] [args]"
|
||||
echo "default: $0 <model> \"${qnt[@]}\" \"${args}\""
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -z "$2" ]; then
|
||||
qnt=($2)
|
||||
fi
|
||||
|
||||
if [ ! -z "$3" ]; then
|
||||
args="$3"
|
||||
fi
|
||||
|
||||
model="$1"
|
||||
out="../tmp/results-${model}"
|
||||
|
||||
mkdir -p ${out}
|
||||
|
||||
for q in ${qnt[@]}; do
|
||||
time ./bin/perplexity -m ../models/${model}/ggml-model-f16.gguf -f ./wiki.test.raw ${args} 2>&1 | tee ${out}/ppl-${q}.txt
|
||||
done
|
||||
@@ -28,7 +28,8 @@ llama_build_and_test_executable(test-sampling.cpp)
|
||||
llama_build_executable(test-tokenizer-0.cpp)
|
||||
llama_test_executable (test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
llama_build_executable(test-tokenizer-1.cpp)
|
||||
llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
# test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
|
||||
#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||
llama_build_and_test_executable(test-grammar-parser.cpp)
|
||||
llama_build_and_test_executable(test-llama-grammar.cpp)
|
||||
|
||||
@@ -67,11 +67,13 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_BPE);
|
||||
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
std::string forward = llama_token_to_str_bpe(ctx, i);
|
||||
std::vector<llama_token> tokens = llama_tokenize_bpe(ctx, forward, false);
|
||||
std::string forward = llama_token_to_str(ctx, i);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
|
||||
if (tokens.size() == 1) {
|
||||
if (i != tokens[0]) {
|
||||
std::string backward = llama_token_to_str(ctx, tokens[0]);
|
||||
@@ -79,16 +81,6 @@ int main(int argc, char **argv) {
|
||||
__func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
|
||||
return 2;
|
||||
}
|
||||
} else {
|
||||
llama_token_type type = llama_token_get_type(ctx, i);
|
||||
if (type == LLAMA_TOKEN_TYPE_UNKNOWN || type == LLAMA_TOKEN_TYPE_CONTROL || type == LLAMA_TOKEN_TYPE_BYTE) {
|
||||
fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
|
||||
__func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
|
||||
} else {
|
||||
fprintf(stderr, "%s : error: token %d is string %s but bpe returns tokens %s\n",
|
||||
__func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user