mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-02 15:14:06 +00:00
Compare commits
46 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
272935b281 | ||
|
|
ccf58aa3ec | ||
|
|
91f8ad167d | ||
|
|
6b7e76d28c | ||
|
|
bc0baab2ea | ||
|
|
d795988d9e | ||
|
|
f8c4e745e1 | ||
|
|
47cc7a7bf9 | ||
|
|
bd60d82d0c | ||
|
|
6c0b287748 | ||
|
|
d26e8b669d | ||
|
|
d8b009a945 | ||
|
|
d0d5de42e5 | ||
|
|
b80cf3b2d1 | ||
|
|
970a48060a | ||
|
|
4c28b82529 | ||
|
|
2d15886bb0 | ||
|
|
d199ca79f2 | ||
|
|
104f5e0fc1 | ||
|
|
5e1b7f94a0 | ||
|
|
ac9ee6a4ad | ||
|
|
4f6d1337ca | ||
|
|
2bf8d0f7c4 | ||
|
|
496bc79bc2 | ||
|
|
9b03719ad7 | ||
|
|
3a6efdd03c | ||
|
|
d01b3c4c32 | ||
|
|
cd776c37c9 | ||
|
|
dc0f612548 | ||
|
|
c47cf414ef | ||
|
|
b5f4ae09c3 | ||
|
|
dfbfdd60f9 | ||
|
|
15961ec04d | ||
|
|
a56d09a440 | ||
|
|
d84c48505f | ||
|
|
877b4d0c62 | ||
|
|
12247f4c69 | ||
|
|
4e9a7f7f7f | ||
|
|
3020327f6c | ||
|
|
46acb36767 | ||
|
|
131b058409 | ||
|
|
753e36f650 | ||
|
|
7ce2c77f88 | ||
|
|
aab606a11f | ||
|
|
b0bc9f4a9d | ||
|
|
4755afd1cb |
90
.github/workflows/build.yml
vendored
90
.github/workflows/build.yml
vendored
@@ -48,6 +48,28 @@ jobs:
|
||||
CC=gcc-8 make tests -j $(nproc)
|
||||
make test -j $(nproc)
|
||||
|
||||
ubuntu-focal-make-curl:
|
||||
runs-on: ubuntu-20.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev
|
||||
|
||||
- name: Build
|
||||
id: make_build
|
||||
env:
|
||||
LLAMA_FATAL_WARNINGS: 1
|
||||
LLAMA_CURL: 1
|
||||
run: |
|
||||
CC=gcc-8 make -j $(nproc)
|
||||
|
||||
ubuntu-latest-cmake:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -76,40 +98,40 @@ jobs:
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
ubuntu-latest-cmake-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [Debug, Release]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
||||
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
# ubuntu-latest-cmake-sanitizer:
|
||||
# runs-on: ubuntu-latest
|
||||
#
|
||||
# continue-on-error: true
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
# build_type: [Debug, Release]
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v3
|
||||
#
|
||||
# - name: Dependencies
|
||||
# id: depends
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get install build-essential
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# mkdir build
|
||||
# cd build
|
||||
# cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
||||
# cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
|
||||
#
|
||||
# - name: Test
|
||||
# id: cmake_test
|
||||
# run: |
|
||||
# cd build
|
||||
# ctest -L main --verbose --timeout 900
|
||||
|
||||
ubuntu-latest-cmake-mpi:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
23
.github/workflows/close-issue.yml
vendored
Normal file
23
.github/workflows/close-issue.yml
vendored
Normal file
@@ -0,0 +1,23 @@
|
||||
name: Close inactive issues
|
||||
on:
|
||||
schedule:
|
||||
- cron: "42 0 * * *"
|
||||
|
||||
jobs:
|
||||
close-issues:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
issues: write
|
||||
pull-requests: write
|
||||
steps:
|
||||
- uses: actions/stale@v5
|
||||
with:
|
||||
exempt-issue-labels: "refactor,help wanted,good first issue,research"
|
||||
days-before-issue-stale: 30
|
||||
days-before-issue-close: 14
|
||||
stale-issue-label: "stale"
|
||||
close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
|
||||
days-before-pr-stale: -1
|
||||
days-before-pr-close: -1
|
||||
operations-per-run: 1000
|
||||
repo-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
26
.github/workflows/server.yml
vendored
26
.github/workflows/server.yml
vendored
@@ -24,13 +24,13 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
# TODO: temporary disabled due to linux kernel issues
|
||||
#sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
sanitizer: [UNDEFINED]
|
||||
build_type: [Debug]
|
||||
include:
|
||||
- build_type: Release
|
||||
sanitizer: ""
|
||||
- build_type: Debug
|
||||
sanitizer: THREAD
|
||||
disabled_on_pr: true
|
||||
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
|
||||
|
||||
@@ -57,7 +57,8 @@ jobs:
|
||||
cmake \
|
||||
python3-pip \
|
||||
wget \
|
||||
language-pack-en
|
||||
language-pack-en \
|
||||
libcurl4-openssl-dev
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
@@ -67,6 +68,7 @@ jobs:
|
||||
cmake .. \
|
||||
-DLLAMA_NATIVE=OFF \
|
||||
-DLLAMA_BUILD_SERVER=ON \
|
||||
-DLLAMA_CURL=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
|
||||
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
|
||||
@@ -101,12 +103,21 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: libCURL
|
||||
id: get_libcurl
|
||||
env:
|
||||
CURL_VERSION: 8.6.0_6
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
|
||||
mkdir $env:RUNNER_TEMP/libcurl
|
||||
tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ;
|
||||
cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
|
||||
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
|
||||
|
||||
- name: Python setup
|
||||
@@ -120,6 +131,11 @@ jobs:
|
||||
run: |
|
||||
pip install -r examples/server/tests/requirements.txt
|
||||
|
||||
- name: Copy Libcurl
|
||||
id: prepare_libcurl
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -12,6 +12,8 @@
|
||||
*.dot
|
||||
*.bat
|
||||
*.metallib
|
||||
*.etag
|
||||
*.lastModified
|
||||
.DS_Store
|
||||
.build/
|
||||
.cache/
|
||||
|
||||
@@ -99,6 +99,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
|
||||
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
||||
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
||||
"llama: max. batch size for using peer access")
|
||||
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
|
||||
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
||||
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
|
||||
11
Makefile
11
Makefile
@@ -553,7 +553,7 @@ endif
|
||||
endif # LLAMA_METAL
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
ggml-metal.o: ggml-metal.m ggml-metal.h
|
||||
ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ifdef LLAMA_METAL_EMBED_LIBRARY
|
||||
@@ -595,6 +595,11 @@ include scripts/get-flags.mk
|
||||
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
|
||||
endif
|
||||
|
||||
ifdef LLAMA_CURL
|
||||
override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
|
||||
override LDFLAGS := $(LDFLAGS) -lcurl
|
||||
endif
|
||||
|
||||
#
|
||||
# Print build information
|
||||
#
|
||||
@@ -748,6 +753,10 @@ gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
129
README-sycl.md
129
README-sycl.md
@@ -29,6 +29,7 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|
||||
## News
|
||||
|
||||
- 2024.3
|
||||
- New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
|
||||
- Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
|
||||
- Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.
|
||||
- Support detecting all GPUs with level-zero and same top **Max compute units**.
|
||||
@@ -81,7 +82,7 @@ For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, rec
|
||||
|-|-|-|
|
||||
|Ampere Series| Support| A100|
|
||||
|
||||
### oneMKL
|
||||
### oneMKL for CUDA
|
||||
|
||||
The current oneMKL release does not contain the oneMKL cuBlas backend.
|
||||
As a result for Nvidia GPU's oneMKL must be built from source.
|
||||
@@ -254,16 +255,16 @@ Run without parameter:
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
|
||||
found 6 SYCL devices:
|
||||
| | | |Compute |Max compute|Max work|Max sub| |
|
||||
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|
||||
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
|
||||
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
|
||||
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
|
||||
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
|
||||
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
|
||||
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
|
||||
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
|
||||
```
|
||||
|
||||
|Attribute|Note|
|
||||
@@ -271,12 +272,35 @@ found 4 SYCL devices:
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||
|
||||
4. Set device ID and execute llama.cpp
|
||||
4. Device selection and execution of llama.cpp
|
||||
|
||||
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
|
||||
There are two device selection modes:
|
||||
|
||||
- Single device: Use one device assigned by user.
|
||||
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
|
||||
|
||||
|Device selection|Parameter|
|
||||
|-|-|
|
||||
|Single device|--split-mode none --main-gpu DEVICE_ID |
|
||||
|Multiple devices|--split-mode layer (default)|
|
||||
|
||||
Examples:
|
||||
|
||||
- Use device 0:
|
||||
|
||||
```sh
|
||||
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
|
||||
```
|
||||
or run by script:
|
||||
|
||||
```sh
|
||||
./examples/sycl/run_llama2.sh 0
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```sh
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
|
||||
```
|
||||
or run by script:
|
||||
|
||||
@@ -289,12 +313,18 @@ Note:
|
||||
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
|
||||
|
||||
|
||||
5. Check the device ID in output
|
||||
5. Verify the device ID in output
|
||||
|
||||
Verify to see if the selected GPU is shown in the output, like:
|
||||
|
||||
Like:
|
||||
```
|
||||
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
detect 1 SYCL GPUs: [0] with top Max compute units:512
|
||||
```
|
||||
Or
|
||||
```
|
||||
use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
```
|
||||
|
||||
|
||||
## Windows
|
||||
|
||||
@@ -355,7 +385,7 @@ a. Download & install cmake for Windows: https://cmake.org/download/
|
||||
|
||||
b. Download & install mingw-w64 make for Windows provided by w64devkit
|
||||
|
||||
- Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
|
||||
- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
|
||||
|
||||
- Extract `w64devkit` on your pc.
|
||||
|
||||
@@ -430,15 +460,16 @@ build\bin\main.exe
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
found 6 SYCL devices:
|
||||
| | | |Compute |Max compute|Max work|Max sub| |
|
||||
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|
||||
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
|
||||
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
|
||||
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
|
||||
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
|
||||
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
|
||||
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
|
||||
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
|
||||
|
||||
```
|
||||
|
||||
@@ -447,13 +478,31 @@ found 4 SYCL devices:
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||
|
||||
4. Set device ID and execute llama.cpp
|
||||
|
||||
Set device ID = 0 by **set GGML_SYCL_DEVICE=0**
|
||||
4. Device selection and execution of llama.cpp
|
||||
|
||||
There are two device selection modes:
|
||||
|
||||
- Single device: Use one device assigned by user.
|
||||
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
|
||||
|
||||
|Device selection|Parameter|
|
||||
|-|-|
|
||||
|Single device|--split-mode none --main-gpu DEVICE_ID |
|
||||
|Multiple devices|--split-mode layer (default)|
|
||||
|
||||
Examples:
|
||||
|
||||
- Use device 0:
|
||||
|
||||
```
|
||||
set GGML_SYCL_DEVICE=0
|
||||
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0
|
||||
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```
|
||||
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
|
||||
```
|
||||
or run by script:
|
||||
|
||||
@@ -466,11 +515,17 @@ Note:
|
||||
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
|
||||
|
||||
|
||||
5. Check the device ID in output
|
||||
|
||||
Like:
|
||||
5. Verify the device ID in output
|
||||
|
||||
Verify to see if the selected GPU is shown in the output, like:
|
||||
|
||||
```
|
||||
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
detect 1 SYCL GPUs: [0] with top Max compute units:512
|
||||
```
|
||||
Or
|
||||
```
|
||||
use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
```
|
||||
|
||||
## Environment Variable
|
||||
@@ -489,7 +544,6 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|
||||
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|
||||
|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|
|
||||
|
||||
@@ -507,6 +561,9 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
|
||||
## Q&A
|
||||
|
||||
Note: please add prefix **[SYCL]** in issue title, so that we will check it as soon as possible.
|
||||
|
||||
|
||||
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
|
||||
|
||||
Miss to enable oneAPI running environment.
|
||||
@@ -538,4 +595,4 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
|
||||
## Todo
|
||||
|
||||
- Support multiple cards.
|
||||
- Support row layer split for multiple card runs.
|
||||
|
||||
@@ -112,6 +112,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
|
||||
- [x] [Gemma](https://ai.google.dev/gemma)
|
||||
- [x] [Mamba](https://github.com/state-spaces/mamba)
|
||||
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
|
||||
|
||||
**Multimodal models:**
|
||||
|
||||
@@ -133,6 +134,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
|
||||
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
|
||||
- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
|
||||
- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
|
||||
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
|
||||
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
||||
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
|
||||
|
||||
@@ -68,6 +68,17 @@ if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
|
||||
set(LLAMA_COMMON_EXTRA_LIBS build_info)
|
||||
|
||||
# Use curl to download model url
|
||||
if (LLAMA_CURL)
|
||||
find_package(CURL REQUIRED)
|
||||
add_definitions(-DLLAMA_USE_CURL)
|
||||
include_directories(${CURL_INCLUDE_DIRS})
|
||||
find_library(CURL_LIBRARY curl REQUIRED)
|
||||
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
|
||||
endif ()
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC .)
|
||||
target_compile_features(${TARGET} PUBLIC cxx_std_11)
|
||||
target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)
|
||||
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
|
||||
|
||||
2204
common/common.cpp
2204
common/common.cpp
File diff suppressed because it is too large
Load Diff
@@ -37,10 +37,13 @@ extern char const *LLAMA_COMMIT;
|
||||
extern char const *LLAMA_COMPILER;
|
||||
extern char const *LLAMA_BUILD_TARGET;
|
||||
|
||||
struct llama_control_vector_load_info;
|
||||
|
||||
int32_t get_num_physical_cores();
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
//
|
||||
int32_t get_num_physical_cores();
|
||||
|
||||
struct gpt_params {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
||||
@@ -86,6 +89,7 @@ struct gpt_params {
|
||||
struct llama_sampling_params sparams;
|
||||
|
||||
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
|
||||
std::string model_url = ""; // model url to download
|
||||
std::string model_draft = ""; // draft model for speculative decoding
|
||||
std::string model_alias = "unknown"; // model alias
|
||||
std::string prompt = "";
|
||||
@@ -103,6 +107,11 @@ struct gpt_params {
|
||||
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
|
||||
std::string lora_base = ""; // base model path for the lora adapter
|
||||
|
||||
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
|
||||
|
||||
int32_t control_vector_layer_start = -1; // layer range for control vector
|
||||
int32_t control_vector_layer_end = -1; // layer range for control vector
|
||||
|
||||
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
|
||||
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
|
||||
// (which is more convenient to use for plotting)
|
||||
@@ -183,6 +192,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
|
||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
|
||||
|
||||
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
|
||||
struct llama_model_params params);
|
||||
|
||||
// Batch utils
|
||||
|
||||
void llama_batch_clear(struct llama_batch & batch);
|
||||
@@ -269,3 +281,24 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40
|
||||
void llama_embd_normalize(const float * inp, float * out, int n);
|
||||
|
||||
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
|
||||
|
||||
//
|
||||
// Control vector utils
|
||||
//
|
||||
|
||||
struct llama_control_vector_data {
|
||||
int n_embd;
|
||||
|
||||
// stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
|
||||
std::vector<float> data;
|
||||
};
|
||||
|
||||
struct llama_control_vector_load_info {
|
||||
float strength;
|
||||
|
||||
std::string fname;
|
||||
};
|
||||
|
||||
// Load control vectors, scale each by strength, and add them together.
|
||||
// On error, returns {-1, empty}
|
||||
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
|
||||
|
||||
@@ -32,13 +32,13 @@ typedef struct llama_sampling_params {
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.10f; // 1.0 = disabled
|
||||
float penalty_repeat = 1.00f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
float penalty_present = 0.00f; // 0.0 = disabled
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||
bool penalize_nl = false; // consider newlines as a repeatable token
|
||||
|
||||
std::vector<llama_sampler_type> samplers_sequence = {
|
||||
llama_sampler_type::TOP_K,
|
||||
|
||||
@@ -1634,7 +1634,7 @@ in chat mode so that the conversation can end normally.")
|
||||
self.post_write_tensors(tensor_map, name, data_torch)
|
||||
|
||||
|
||||
@Model.register("BertModel")
|
||||
@Model.register("BertModel", "CamembertModel")
|
||||
class BertModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.BERT
|
||||
|
||||
@@ -1965,6 +1965,23 @@ class MambaModel(Model):
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
@Model.register("CohereForCausalLM")
|
||||
class CommandR2Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.COMMAND_R
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# max_position_embeddings = 8192 in config.json but model was actually
|
||||
# trained on 128k context length
|
||||
self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
||||
@@ -1167,9 +1167,9 @@ class OutputFile:
|
||||
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
|
||||
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
|
||||
|
||||
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
|
||||
if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
|
||||
return GGMLFileType.AllF32
|
||||
if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
|
||||
if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
|
||||
return GGMLFileType.MostlyF16
|
||||
if output_type_str == "q8_0":
|
||||
return GGMLFileType.MostlyQ8_0
|
||||
|
||||
@@ -21,6 +21,7 @@ else()
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(finetune)
|
||||
add_subdirectory(gritlm)
|
||||
add_subdirectory(gguf-split)
|
||||
add_subdirectory(infill)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(llava)
|
||||
|
||||
5
examples/gguf-split/CMakeLists.txt
Normal file
5
examples/gguf-split/CMakeLists.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
set(TARGET gguf-split)
|
||||
add_executable(${TARGET} gguf-split.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
9
examples/gguf-split/README.md
Normal file
9
examples/gguf-split/README.md
Normal file
@@ -0,0 +1,9 @@
|
||||
## GGUF split Example
|
||||
|
||||
CLI to split / merge GGUF files.
|
||||
|
||||
**Command line options:**
|
||||
|
||||
- `--split`: split GGUF to multiple GGUF, default operation.
|
||||
- `--split-max-tensors`: maximum tensors in each split: default(128)
|
||||
- `--merge`: merge multiple GGUF to a single GGUF.
|
||||
489
examples/gguf-split/gguf-split.cpp
Normal file
489
examples/gguf-split/gguf-split.cpp
Normal file
@@ -0,0 +1,489 @@
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <ios>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
#include <string.h>
|
||||
|
||||
enum split_operation : uint8_t {
|
||||
SPLIT_OP_SPLIT,
|
||||
SPLIT_OP_MERGE,
|
||||
};
|
||||
|
||||
static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split";
|
||||
static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count";
|
||||
|
||||
static const int SPLIT_FILENAME_MAX = 256;
|
||||
|
||||
static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf";
|
||||
|
||||
struct split_params {
|
||||
split_operation operation = SPLIT_OP_SPLIT;
|
||||
int n_split_tensors = 128;
|
||||
std::string input;
|
||||
std::string output;
|
||||
};
|
||||
|
||||
static void split_print_usage(const char * executable) {
|
||||
const split_params default_params;
|
||||
printf("\n");
|
||||
printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable);
|
||||
printf("\n");
|
||||
printf("Apply a GGUF operation on IN to OUT.");
|
||||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" --version show version and build info\n");
|
||||
printf(" --split split GGUF to multiple GGUF (default)\n");
|
||||
printf(" --split-max-tensors max tensors in each split: default(%d)\n", default_params.n_split_tensors);
|
||||
printf(" --merge merge multiple GGUF to a single GGUF\n");
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static bool split_params_parse_ex(int argc, const char ** argv, split_params & params) {
|
||||
std::string arg;
|
||||
const std::string arg_prefix = "--";
|
||||
bool invalid_param = false;
|
||||
|
||||
int arg_idx = 1;
|
||||
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
|
||||
arg = argv[arg_idx];
|
||||
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
||||
std::replace(arg.begin(), arg.end(), '_', '-');
|
||||
}
|
||||
|
||||
bool arg_found = false;
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
split_print_usage(argv[0]);
|
||||
exit(0);
|
||||
}
|
||||
if (arg == "--version") {
|
||||
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
||||
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if (arg == "--merge") {
|
||||
arg_found = true;
|
||||
params.operation = SPLIT_OP_MERGE;
|
||||
}
|
||||
if (arg == "--split") {
|
||||
arg_found = true;
|
||||
params.operation = SPLIT_OP_SPLIT;
|
||||
}
|
||||
if (arg == "--split-max-tensors") {
|
||||
if (++arg_idx >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
arg_found = true;
|
||||
params.n_split_tensors = atoi(argv[arg_idx]);
|
||||
}
|
||||
|
||||
if (!arg_found) {
|
||||
throw std::invalid_argument("error: unknown argument: " + arg);
|
||||
}
|
||||
}
|
||||
|
||||
if (invalid_param) {
|
||||
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
|
||||
}
|
||||
|
||||
if (argc - arg_idx < 2) {
|
||||
printf("%s: bad arguments\n", argv[0]);
|
||||
split_print_usage(argv[0]);
|
||||
return false;
|
||||
}
|
||||
|
||||
params.input = argv[arg_idx++];
|
||||
params.output = argv[arg_idx++];
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool split_params_parse(int argc, const char ** argv, split_params & params) {
|
||||
bool result = true;
|
||||
try {
|
||||
if (!split_params_parse_ex(argc, argv, params)) {
|
||||
split_print_usage(argv[0]);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
catch (const std::invalid_argument & ex) {
|
||||
fprintf(stderr, "%s\n", ex.what());
|
||||
split_print_usage(argv[0]);
|
||||
exit(1);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static void zeros(std::ofstream & file, size_t n) {
|
||||
char zero = 0;
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
file.write(&zero, 1);
|
||||
}
|
||||
}
|
||||
|
||||
static std::string split_file_name(const std::string & path, int i_split, int n_split) {
|
||||
char f_split[SPLIT_FILENAME_MAX] = {0};
|
||||
snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split);
|
||||
return std::string(f_split);
|
||||
}
|
||||
|
||||
struct split_strategy {
|
||||
const split_params params;
|
||||
std::ifstream & f_input;
|
||||
struct gguf_context * ctx_gguf;
|
||||
struct ggml_context * ctx_meta = NULL;
|
||||
const int n_tensors;
|
||||
|
||||
const int n_split;
|
||||
int i_split = 0;
|
||||
|
||||
int i_tensor = 0;
|
||||
|
||||
std::vector<uint8_t> read_data;
|
||||
|
||||
struct gguf_context * ctx_out;
|
||||
std::ofstream fout;
|
||||
|
||||
split_strategy(const split_params & params,
|
||||
std::ifstream & f_input,
|
||||
struct gguf_context * ctx_gguf,
|
||||
struct ggml_context * ctx_meta) :
|
||||
params(params),
|
||||
f_input(f_input),
|
||||
ctx_gguf(ctx_gguf),
|
||||
ctx_meta(ctx_meta),
|
||||
n_tensors(gguf_get_n_tensors(ctx_gguf)),
|
||||
n_split(std::ceil(1. * n_tensors / params.n_split_tensors)) {
|
||||
}
|
||||
|
||||
bool should_split() const {
|
||||
return i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
|
||||
}
|
||||
|
||||
void split_start() {
|
||||
ctx_out = gguf_init_empty();
|
||||
|
||||
// Save all metadata in first split only
|
||||
if (i_split == 0) {
|
||||
gguf_set_kv(ctx_out, ctx_gguf);
|
||||
}
|
||||
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
|
||||
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);
|
||||
|
||||
// populate the original tensors, so we get an initial metadata
|
||||
for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
|
||||
struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
|
||||
gguf_add_tensor(ctx_out, meta);
|
||||
}
|
||||
|
||||
auto split_name = split_file_name(params.output, i_split, n_split);
|
||||
|
||||
fprintf(stderr, "%s: %s ...", __func__, split_name.c_str());
|
||||
fout = std::ofstream(split_name, std::ios::binary);
|
||||
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
|
||||
|
||||
auto meta_size = gguf_get_meta_size(ctx_out);
|
||||
|
||||
// placeholder for the meta data
|
||||
::zeros(fout, meta_size);
|
||||
|
||||
i_split++;
|
||||
}
|
||||
|
||||
void next_tensor() {
|
||||
const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
|
||||
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
|
||||
auto n_bytes = ggml_nbytes(t);
|
||||
|
||||
if (read_data.size() < n_bytes) {
|
||||
read_data.resize(n_bytes);
|
||||
}
|
||||
|
||||
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
|
||||
f_input.seekg(offset);
|
||||
f_input.read((char *)read_data.data(), n_bytes);
|
||||
|
||||
t->data = read_data.data();
|
||||
|
||||
// write tensor data + padding
|
||||
fout.write((const char *)t->data, n_bytes);
|
||||
zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
|
||||
|
||||
i_tensor++;
|
||||
}
|
||||
|
||||
void split_end() {
|
||||
// go back to beginning of file and write the updated metadata
|
||||
fout.seekp(0);
|
||||
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
|
||||
gguf_get_meta_data(ctx_out, data.data());
|
||||
fout.write((const char *)data.data(), data.size());
|
||||
|
||||
fout.close();
|
||||
gguf_free(ctx_out);
|
||||
|
||||
fprintf(stderr, "\033[3Ddone\n");
|
||||
}
|
||||
};
|
||||
|
||||
static void gguf_split(const split_params & split_params) {
|
||||
struct ggml_context * ctx_meta = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx_meta,
|
||||
};
|
||||
|
||||
std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
|
||||
if (!f_input.is_open()) {
|
||||
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
|
||||
exit(1);
|
||||
}
|
||||
|
||||
auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
|
||||
if (!ctx_gguf) {
|
||||
fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
|
||||
exit(1);
|
||||
}
|
||||
|
||||
split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
|
||||
fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n",
|
||||
__func__, split_params.input.c_str(),
|
||||
split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(),
|
||||
split_params.n_split_tensors);
|
||||
|
||||
strategy.split_start();
|
||||
|
||||
while (strategy.i_tensor < strategy.n_tensors) {
|
||||
strategy.next_tensor();
|
||||
if (strategy.should_split()) {
|
||||
strategy.split_end();
|
||||
strategy.split_start();
|
||||
}
|
||||
}
|
||||
strategy.split_end();
|
||||
|
||||
gguf_free(ctx_gguf);
|
||||
f_input.close();
|
||||
|
||||
fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n",
|
||||
__func__, strategy.n_split, strategy.n_tensors);
|
||||
}
|
||||
|
||||
static void gguf_merge(const split_params & split_params) {
|
||||
fprintf(stderr, "%s: %s -> %s\n",
|
||||
__func__, split_params.input.c_str(),
|
||||
split_params.output.c_str());
|
||||
int n_split = 1;
|
||||
int total_tensors = 0;
|
||||
|
||||
auto * ctx_out = gguf_init_empty();
|
||||
std::ofstream fout(split_params.output.c_str(), std::ios::binary);
|
||||
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
|
||||
|
||||
std::vector<uint8_t> read_data;
|
||||
std::vector<ggml_context *> ctx_metas;
|
||||
std::vector<gguf_context *> ctx_ggufs;
|
||||
|
||||
std::string split_prefix;
|
||||
|
||||
// First pass to find KV and tensors metadata
|
||||
for (int i_split = 0; i_split < n_split; i_split++) {
|
||||
struct ggml_context * ctx_meta = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx_meta,
|
||||
};
|
||||
|
||||
auto split_name = split_params.input;
|
||||
if (i_split > 0) {
|
||||
split_name = split_file_name(split_prefix, i_split, n_split);
|
||||
}
|
||||
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str());
|
||||
|
||||
auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params);
|
||||
if (!ctx_gguf) {
|
||||
fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
|
||||
exit(1);
|
||||
}
|
||||
ctx_ggufs.push_back(ctx_gguf);
|
||||
ctx_metas.push_back(ctx_meta);
|
||||
|
||||
if (i_split == 0) {
|
||||
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT);
|
||||
if (key_n_split < 0) {
|
||||
fprintf(stderr,
|
||||
"\n%s: input file does not contain %s metadata\n",
|
||||
__func__,
|
||||
LLM_KV_GENERAL_SPLIT_N_SPLIT);
|
||||
gguf_free(ctx_gguf);
|
||||
gguf_free(ctx_out);
|
||||
fout.close();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
n_split = gguf_get_val_u8(ctx_gguf, key_n_split);
|
||||
if (n_split < 1) {
|
||||
fprintf(stderr,
|
||||
"\n%s: input file does not contain a valid split count %d\n",
|
||||
__func__,
|
||||
n_split);
|
||||
gguf_free(ctx_gguf);
|
||||
gguf_free(ctx_out);
|
||||
fout.close();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Do not trigger merge if we try to merge again the output
|
||||
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);
|
||||
|
||||
// Set metadata from the first split
|
||||
gguf_set_kv(ctx_out, ctx_gguf);
|
||||
}
|
||||
|
||||
// Verify the file naming
|
||||
{
|
||||
int i_split_file = 0;
|
||||
int n_split_file = 0;
|
||||
const char * i_split_format = "-00000-of-00000.gguf";
|
||||
|
||||
if (split_name.size() < strlen(i_split_format)) {
|
||||
fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str());
|
||||
for (auto * _ctx_gguf : ctx_ggufs) {
|
||||
gguf_free(_ctx_gguf);
|
||||
}
|
||||
gguf_free(ctx_out);
|
||||
fout.close();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format));
|
||||
|
||||
const char * split_name_c_str = split_name.c_str();
|
||||
int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file);
|
||||
|
||||
if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) {
|
||||
fprintf(stderr, "\n%s: unexpected input file name: %s"
|
||||
" i_split=%d i_split_file=%d"
|
||||
" n_split=%d n_split_file=%d\n", __func__,
|
||||
split_params.input.c_str(),
|
||||
i_split, i_split_file,
|
||||
n_split, n_split_file);
|
||||
for (auto * _ctx_gguf : ctx_ggufs) {
|
||||
gguf_free(_ctx_gguf);
|
||||
}
|
||||
gguf_free(ctx_out);
|
||||
fout.close();
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
auto n_tensors = gguf_get_n_tensors(ctx_gguf);
|
||||
for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
|
||||
const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
|
||||
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
|
||||
gguf_add_tensor(ctx_out, t);
|
||||
}
|
||||
total_tensors += n_tensors;
|
||||
|
||||
fprintf(stderr, "\033[3Ddone\n");
|
||||
}
|
||||
|
||||
// placeholder for the meta data
|
||||
{
|
||||
auto meta_size = gguf_get_meta_size(ctx_out);
|
||||
::zeros(fout, meta_size);
|
||||
}
|
||||
|
||||
// Write tensors data
|
||||
for (int i_split = 0; i_split < n_split; i_split++) {
|
||||
auto split_name = split_file_name(split_prefix, i_split, n_split);
|
||||
std::ifstream f_input(split_name.c_str(), std::ios::binary);
|
||||
if (!f_input.is_open()) {
|
||||
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str());
|
||||
for (auto * _ctx_gguf : ctx_ggufs) {
|
||||
gguf_free(_ctx_gguf);
|
||||
}
|
||||
gguf_free(ctx_out);
|
||||
fout.close();
|
||||
exit(1);
|
||||
}
|
||||
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str());
|
||||
|
||||
auto * ctx_gguf = ctx_ggufs[i_split];
|
||||
auto * ctx_meta = ctx_metas[i_split];
|
||||
|
||||
auto n_tensors = gguf_get_n_tensors(ctx_gguf);
|
||||
for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
|
||||
const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
|
||||
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
|
||||
|
||||
auto n_bytes = ggml_nbytes(t);
|
||||
|
||||
if (read_data.size() < n_bytes) {
|
||||
read_data.resize(n_bytes);
|
||||
}
|
||||
|
||||
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
|
||||
f_input.seekg(offset);
|
||||
f_input.read((char *)read_data.data(), n_bytes);
|
||||
|
||||
// write tensor data + padding
|
||||
fout.write((const char *)read_data.data(), n_bytes);
|
||||
zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
|
||||
}
|
||||
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_free(ctx_meta);
|
||||
f_input.close();
|
||||
fprintf(stderr, "\033[3Ddone\n");
|
||||
}
|
||||
|
||||
{
|
||||
// go back to beginning of file and write the updated metadata
|
||||
fout.seekp(0);
|
||||
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
|
||||
gguf_get_meta_data(ctx_out, data.data());
|
||||
fout.write((const char *)data.data(), data.size());
|
||||
|
||||
fout.close();
|
||||
gguf_free(ctx_out);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n",
|
||||
__func__, split_params.output.c_str(), n_split, total_tensors);
|
||||
}
|
||||
|
||||
int main(int argc, const char ** argv) {
|
||||
if (argc < 3) {
|
||||
split_print_usage(argv[0]);
|
||||
}
|
||||
|
||||
split_params params;
|
||||
split_params_parse(argc, argv, params);
|
||||
|
||||
switch (params.operation) {
|
||||
case SPLIT_OP_SPLIT: gguf_split(params);
|
||||
break;
|
||||
case SPLIT_OP_MERGE: gguf_merge(params);
|
||||
break;
|
||||
default:split_print_usage(argv[0]);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
62
examples/gritlm/README.md
Normal file
62
examples/gritlm/README.md
Normal file
@@ -0,0 +1,62 @@
|
||||
## Generative Representational Instruction Tuning (GRIT) Example
|
||||
[gritlm] a model which can generate embeddings as well as "normal" text
|
||||
generation depending on the instructions in the prompt.
|
||||
|
||||
* Paper: https://arxiv.org/pdf/2402.09906.pdf
|
||||
|
||||
### Retrieval-Augmented Generation (RAG) use case
|
||||
One use case for `gritlm` is to use it with RAG. If we recall how RAG works is
|
||||
that we take documents that we want to use as context, to ground the large
|
||||
language model (LLM), and we create token embeddings for them. We then store
|
||||
these token embeddings in a vector database.
|
||||
|
||||
When we perform a query, prompt the LLM, we will first create token embeddings
|
||||
for the query and then search the vector database to retrieve the most
|
||||
similar vectors, and return those documents so they can be passed to the LLM as
|
||||
context. Then the query and the context will be passed to the LLM which will
|
||||
have to _again_ create token embeddings for the query. But because gritlm is used
|
||||
the first query can be cached and the second query tokenization generation does
|
||||
not have to be performed at all.
|
||||
|
||||
### Running the example
|
||||
Download a Grit model:
|
||||
```console
|
||||
$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf
|
||||
```
|
||||
|
||||
Run the example using the downloaded model:
|
||||
```console
|
||||
$ ./gritlm -m gritlm-7b_q4_1.gguf
|
||||
|
||||
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
|
||||
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
|
||||
Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112
|
||||
Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547
|
||||
|
||||
Oh, brave adventurer, who dared to climb
|
||||
The lofty peak of Mt. Fuji in the night,
|
||||
When shadows lurk and ghosts do roam,
|
||||
And darkness reigns, a fearsome sight.
|
||||
|
||||
Thou didst set out, with heart aglow,
|
||||
To conquer this mountain, so high,
|
||||
And reach the summit, where the stars do glow,
|
||||
And the moon shines bright, up in the sky.
|
||||
|
||||
Through the mist and fog, thou didst press on,
|
||||
With steadfast courage, and a steadfast will,
|
||||
Through the darkness, thou didst not be gone,
|
||||
But didst climb on, with a steadfast skill.
|
||||
|
||||
At last, thou didst reach the summit's crest,
|
||||
And gazed upon the world below,
|
||||
And saw the beauty of the night's best,
|
||||
And felt the peace, that only nature knows.
|
||||
|
||||
Oh, brave adventurer, who dared to climb
|
||||
The lofty peak of Mt. Fuji in the night,
|
||||
Thou art a hero, in the eyes of all,
|
||||
For thou didst conquer this mountain, so bright.
|
||||
```
|
||||
|
||||
[gritlm]: https://github.com/ContextualAI/gritlm
|
||||
@@ -56,13 +56,31 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
const struct ggml_tensor * src0 = t->src[0];
|
||||
const struct ggml_tensor * src1 = t->src[1];
|
||||
|
||||
std::string wname;
|
||||
{
|
||||
// remove any prefix and suffixes from the name
|
||||
// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
|
||||
const char * p = strchr(src0->name, '#');
|
||||
if (p != NULL) {
|
||||
p = p + 1;
|
||||
const char * q = strchr(p, '#');
|
||||
if (q != NULL) {
|
||||
wname = std::string(p, q - p);
|
||||
} else {
|
||||
wname = p;
|
||||
}
|
||||
} else {
|
||||
wname = src0->name;
|
||||
}
|
||||
}
|
||||
|
||||
// when ask is true, the scheduler wants to know if we are interested in data from this tensor
|
||||
// if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
|
||||
if (ask) {
|
||||
if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
|
||||
if (t->op != GGML_OP_MUL_MAT) return false;
|
||||
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
|
||||
if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
|
||||
if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -94,12 +112,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
// this is necessary to guarantee equal number of "ncall" for each tensor
|
||||
for (int ex = 0; ex < n_as; ++ex) {
|
||||
src0 = t->src[2 + ex];
|
||||
auto& e = m_stats[src0->name];
|
||||
auto& e = m_stats[wname];
|
||||
if (e.values.empty()) {
|
||||
e.values.resize(src1->ne[0], 0);
|
||||
}
|
||||
else if (e.values.size() != (size_t)src1->ne[0]) {
|
||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
|
||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
|
||||
exit(1); //GGML_ASSERT(false);
|
||||
}
|
||||
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
|
||||
@@ -107,7 +125,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
|
||||
++e.ncall;
|
||||
if (m_params.verbosity > 1) {
|
||||
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
||||
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
||||
}
|
||||
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
||||
const int excur = m_ids[row*n_as + idx];
|
||||
@@ -129,17 +147,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto& e = m_stats[src0->name];
|
||||
auto& e = m_stats[wname];
|
||||
if (e.values.empty()) {
|
||||
e.values.resize(src1->ne[0], 0);
|
||||
}
|
||||
else if (e.values.size() != (size_t)src1->ne[0]) {
|
||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
|
||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
|
||||
exit(1); //GGML_ASSERT(false);
|
||||
}
|
||||
++e.ncall;
|
||||
if (m_params.verbosity > 1) {
|
||||
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
||||
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
||||
}
|
||||
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
||||
const float * x = data + row * src1->ne[0];
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <cstdlib>
|
||||
#include <iterator>
|
||||
#include <map>
|
||||
#include <numeric>
|
||||
@@ -113,10 +114,10 @@ static std::string get_cpu_info() {
|
||||
static std::string get_gpu_info() {
|
||||
std::string id;
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
int count = ggml_cuda_get_device_count();
|
||||
int count = ggml_backend_cuda_get_device_count();
|
||||
for (int i = 0; i < count; i++) {
|
||||
char buf[128];
|
||||
ggml_cuda_get_device_description(i, buf, sizeof(buf));
|
||||
ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
|
||||
id += buf;
|
||||
if (i < count - 1) {
|
||||
id += "/";
|
||||
@@ -1123,15 +1124,19 @@ struct sql_printer : public printer {
|
||||
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
|
||||
llama_set_n_threads(ctx, n_threads, n_threads);
|
||||
|
||||
//std::vector<llama_token> tokens(n_prompt, llama_token_bos(llama_get_model(ctx)));
|
||||
//llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt, n_past, 0));
|
||||
//GGML_UNUSED(n_batch);
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const int32_t n_vocab = llama_n_vocab(model);
|
||||
|
||||
std::vector<llama_token> tokens(n_batch);
|
||||
|
||||
std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
|
||||
int n_processed = 0;
|
||||
|
||||
while (n_processed < n_prompt) {
|
||||
int n_tokens = std::min(n_prompt - n_processed, n_batch);
|
||||
tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
|
||||
for (int i = 1; i < n_tokens; i++) {
|
||||
tokens[i] = std::rand() % n_vocab;
|
||||
}
|
||||
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
|
||||
n_processed += n_tokens;
|
||||
}
|
||||
@@ -1142,11 +1147,15 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
|
||||
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
|
||||
llama_set_n_threads(ctx, n_threads, n_threads);
|
||||
|
||||
llama_token token = llama_token_bos(llama_get_model(ctx));
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const int32_t n_vocab = llama_n_vocab(model);
|
||||
|
||||
llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
|
||||
|
||||
for (int i = 0; i < n_gen; i++) {
|
||||
llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
|
||||
llama_synchronize(ctx);
|
||||
token = std::rand() % n_vocab;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
# MobileVLM
|
||||
|
||||
Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
|
||||
Currently this implementation supports [MobileVLM-1.7B](https://huggingface.co/mtgv/MobileVLM-1.7B) / [MobileVLM_V2-1.7B](https://huggingface.co/mtgv/MobileVLM_V2-1.7B) variants.
|
||||
|
||||
for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)
|
||||
|
||||
The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
|
||||
|
||||
Notice: The overall process of model inference for both **MobilVLM** and **MobilVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using MobiVLM as an example, the different conversion step will be shown.
|
||||
|
||||
## Usage
|
||||
Build with cmake or run `make llava-cli` to build it.
|
||||
|
||||
@@ -34,7 +36,7 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
||||
python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
|
||||
```
|
||||
|
||||
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
|
||||
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** the arg is `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert-image-encoder-to-gguf \
|
||||
@@ -44,6 +46,14 @@ python ./examples/llava/convert-image-encoder-to-gguf \
|
||||
--projector-type ldp
|
||||
```
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert-image-encoder-to-gguf \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B_V2 \
|
||||
--projector-type ldpv2
|
||||
```
|
||||
|
||||
4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||
|
||||
```sh
|
||||
|
||||
@@ -119,6 +119,7 @@ static std::string format(const char * fmt, ...) {
|
||||
#define TN_LLAVA_PROJ "mm.%d.%s"
|
||||
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
|
||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
||||
#define TN_IMAGE_NEWLINE "model.image_newline"
|
||||
|
||||
|
||||
@@ -126,12 +127,14 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_MLP,
|
||||
PROJECTOR_TYPE_MLP_NORM,
|
||||
PROJECTOR_TYPE_LDP,
|
||||
PROJECTOR_TYPE_LDPV2,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_MLP, "mlp" },
|
||||
{ PROJECTOR_TYPE_LDP, "ldp" },
|
||||
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
|
||||
};
|
||||
|
||||
|
||||
@@ -475,6 +478,14 @@ struct clip_vision_model {
|
||||
struct ggml_tensor * mm_model_block_2_block_2_0_w;
|
||||
struct ggml_tensor * mm_model_block_2_block_2_1_w;
|
||||
struct ggml_tensor * mm_model_block_2_block_2_1_b;
|
||||
|
||||
// MobileVLM_V2 projection
|
||||
struct ggml_tensor * mm_model_mlp_0_w;
|
||||
struct ggml_tensor * mm_model_mlp_0_b;
|
||||
struct ggml_tensor * mm_model_mlp_2_w;
|
||||
struct ggml_tensor * mm_model_mlp_2_b;
|
||||
struct ggml_tensor * mm_model_peg_0_w;
|
||||
struct ggml_tensor * mm_model_peg_0_b;
|
||||
};
|
||||
|
||||
struct clip_ctx {
|
||||
@@ -497,7 +508,6 @@ struct clip_ctx {
|
||||
|
||||
// memory buffers to evaluate the model
|
||||
ggml_backend_buffer_t params_buffer = NULL;
|
||||
ggml_backend_buffer_t compute_buffer = NULL;
|
||||
|
||||
ggml_backend_t backend = NULL;
|
||||
ggml_gallocr_t compute_alloc = NULL;
|
||||
@@ -808,6 +818,29 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
}
|
||||
embeddings = block_1;
|
||||
}
|
||||
else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
|
||||
{
|
||||
int n_patch = 24;
|
||||
struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
|
||||
mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
|
||||
mlp_0 = ggml_gelu(ctx0, mlp_0);
|
||||
struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
|
||||
mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
|
||||
// mlp_2 ne = [2048, 576, 1, 1]
|
||||
// // AVG Pool Layer 2*2, strides = 2
|
||||
mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
|
||||
// mlp_2 ne = [576, 2048, 1, 1]
|
||||
mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
|
||||
// mlp_2 ne [24, 24, 2048, 1]
|
||||
mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
|
||||
// weight ne = [3, 3, 2048, 1]
|
||||
struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
|
||||
peg_0 = ggml_add(ctx0, peg_0, mlp_2);
|
||||
peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
|
||||
peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
|
||||
peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
|
||||
embeddings = peg_0;
|
||||
}
|
||||
else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
@@ -1178,7 +1211,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
|
||||
vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
|
||||
vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
|
||||
} else {
|
||||
}
|
||||
else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
|
||||
{
|
||||
// MobilVLM_V2 projection
|
||||
vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
|
||||
vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
|
||||
vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
|
||||
vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
|
||||
vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
|
||||
vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
|
||||
}
|
||||
else {
|
||||
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
|
||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||
}
|
||||
@@ -1235,16 +1279,16 @@ struct clip_image_f32 * clip_image_f32_init() {
|
||||
|
||||
void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
|
||||
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
|
||||
void clip_image_u8_batch_free(struct clip_image_u8_batch & batch) {
|
||||
if (batch.size > 0) {
|
||||
delete[] batch.data;
|
||||
batch.size = 0;
|
||||
void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
|
||||
if (batch->size > 0) {
|
||||
delete[] batch->data;
|
||||
batch->size = 0;
|
||||
}
|
||||
}
|
||||
void clip_image_f32_batch_free(struct clip_image_f32_batch & batch) {
|
||||
if (batch.size > 0) {
|
||||
delete[] batch.data;
|
||||
batch.size = 0;
|
||||
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
|
||||
if (batch->size > 0) {
|
||||
delete[] batch->data;
|
||||
batch->size = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1497,7 +1541,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
|
||||
|
||||
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
|
||||
// res_imgs memory is being allocated here, previous allocations will be freed if found
|
||||
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) {
|
||||
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
|
||||
bool pad_to_square = true;
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
@@ -1509,11 +1553,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||
pad_to_square = false;
|
||||
}
|
||||
// free the previous res_imgs if any set
|
||||
if (res_imgs.size > 0) {
|
||||
if (res_imgs->size > 0) {
|
||||
clip_image_f32_batch_free(res_imgs);
|
||||
}
|
||||
res_imgs.data = nullptr;
|
||||
res_imgs.size = 0;
|
||||
res_imgs->data = nullptr;
|
||||
res_imgs->size = 0;
|
||||
|
||||
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
|
||||
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
|
||||
@@ -1568,11 +1612,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||
bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
|
||||
patches.insert(patches.begin(), image_original_resize);
|
||||
// clip_image_f32_batch_init(patches.size());
|
||||
res_imgs.size = patches.size();
|
||||
res_imgs.data = new clip_image_f32[res_imgs.size];
|
||||
res_imgs->size = patches.size();
|
||||
res_imgs->data = new clip_image_f32[res_imgs->size];
|
||||
int num=0;
|
||||
for (auto& patch : patches) {
|
||||
normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std);
|
||||
normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
|
||||
num++;
|
||||
}
|
||||
|
||||
@@ -1660,9 +1704,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||
// }
|
||||
// res_imgs.push_back(res);
|
||||
|
||||
res_imgs.size = 1;
|
||||
res_imgs.data = new clip_image_f32[res_imgs.size];
|
||||
res_imgs.data[0] = *res;
|
||||
res_imgs->size = 1;
|
||||
res_imgs->data = new clip_image_f32[res_imgs->size];
|
||||
res_imgs->data[0] = *res;
|
||||
clip_image_f32_free(res);
|
||||
|
||||
return true;
|
||||
@@ -1676,6 +1720,9 @@ void clip_free(clip_ctx * ctx) {
|
||||
ggml_free(ctx->ctx_data);
|
||||
gguf_free(ctx->ctx_gguf);
|
||||
|
||||
ggml_backend_buffer_free(ctx->params_buffer);
|
||||
ggml_backend_free(ctx->backend);
|
||||
ggml_gallocr_free(ctx->compute_alloc);
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
@@ -1964,6 +2011,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
|
||||
}
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
|
||||
return ctx->vision_model.mm_model_peg_0_b->ne[0];
|
||||
}
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
|
||||
return ctx->vision_model.mm_2_b->ne[0];
|
||||
}
|
||||
|
||||
@@ -60,8 +60,8 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
||||
|
||||
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
|
||||
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
|
||||
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch & batch);
|
||||
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch);
|
||||
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
|
||||
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
|
||||
|
||||
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
||||
|
||||
@@ -69,7 +69,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
|
||||
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
|
||||
|
||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
|
||||
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs );
|
||||
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
||||
|
||||
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import argparse
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
@@ -38,9 +39,11 @@ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: b
|
||||
def get_tensor_name(name: str) -> str:
|
||||
if "projection" in name:
|
||||
return name
|
||||
|
||||
if "mm_projector" in name:
|
||||
return name.replace("model.mm_projector", "mm")
|
||||
name = name.replace("model.mm_projector", "mm")
|
||||
name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
|
||||
name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
|
||||
return name
|
||||
|
||||
return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
|
||||
|
||||
@@ -83,7 +86,7 @@ ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
|
||||
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
|
||||
help="The clip model is from openclip (for ViT-SO400M type))")
|
||||
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
|
||||
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
|
||||
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
|
||||
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
|
||||
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
|
||||
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
|
||||
|
||||
@@ -223,7 +223,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
clip_image_f32_batch img_res_v;
|
||||
img_res_v.size = 0;
|
||||
img_res_v.data = nullptr;
|
||||
if (!clip_image_preprocess(ctx_clip, img, img_res_v)) {
|
||||
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
|
||||
fprintf(stderr, "%s: unable to preprocess image\n", __func__);
|
||||
delete[] img_res_v.data;
|
||||
return false;
|
||||
|
||||
@@ -29,9 +29,9 @@ struct llava_image_embed {
|
||||
};
|
||||
|
||||
/** sanity check for clip <-> llava embed size match */
|
||||
LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
|
||||
LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
|
||||
|
||||
LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
|
||||
LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
|
||||
|
||||
/** build an image embed from image file bytes */
|
||||
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
|
||||
|
||||
@@ -67,6 +67,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
|
||||
In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
|
||||
|
||||
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
||||
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
|
||||
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
|
||||
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
|
||||
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
|
||||
|
||||
@@ -20,6 +20,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
|
||||
- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
|
||||
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
|
||||
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
|
||||
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
|
||||
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
|
||||
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -2195,6 +2195,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
|
||||
}
|
||||
printf(" -m FNAME, --model FNAME\n");
|
||||
printf(" model path (default: %s)\n", params.model.c_str());
|
||||
printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
|
||||
printf(" model download url (default: %s)\n", params.model_url.c_str());
|
||||
printf(" -a ALIAS, --alias ALIAS\n");
|
||||
printf(" set an alias for the model, will be added as `model` field in completion response\n");
|
||||
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
@@ -2317,6 +2319,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
|
||||
break;
|
||||
}
|
||||
params.model = argv[i];
|
||||
} else if (arg == "-mu" || arg == "--model-url") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.model_url = argv[i];
|
||||
} else if (arg == "-a" || arg == "--alias") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
||||
@@ -57,7 +57,7 @@ Feature or Scenario must be annotated with `@llama.cpp` to be included in the de
|
||||
To run a scenario annotated with `@bug`, start:
|
||||
|
||||
```shell
|
||||
DEBUG=ON ./tests.sh --no-skipped --tags bug
|
||||
DEBUG=ON ./tests.sh --no-skipped --tags bug --stop
|
||||
```
|
||||
|
||||
After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated.
|
||||
|
||||
@@ -4,7 +4,8 @@ Feature: llama.cpp server
|
||||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
And a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models
|
||||
And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
|
||||
And a model file ggml-model-f16.gguf
|
||||
And a model alias bert-bge-small
|
||||
And 42 as server seed
|
||||
And 2 slots
|
||||
|
||||
@@ -1,17 +1,18 @@
|
||||
import errno
|
||||
import os
|
||||
import socket
|
||||
import subprocess
|
||||
import time
|
||||
from contextlib import closing
|
||||
import signal
|
||||
import socket
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from contextlib import closing
|
||||
from subprocess import TimeoutExpired
|
||||
|
||||
|
||||
def before_scenario(context, scenario):
|
||||
context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
|
||||
if context.debug:
|
||||
print("DEBUG=ON\n")
|
||||
print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m\n")
|
||||
print("DEBUG=ON")
|
||||
print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m")
|
||||
port = 8080
|
||||
if 'PORT' in os.environ:
|
||||
port = int(os.environ['PORT'])
|
||||
@@ -20,58 +21,45 @@ def before_scenario(context, scenario):
|
||||
|
||||
|
||||
def after_scenario(context, scenario):
|
||||
if context.server_process is None:
|
||||
return
|
||||
if scenario.status == "failed":
|
||||
if 'GITHUB_ACTIONS' in os.environ:
|
||||
print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
|
||||
if os.path.isfile('llama.log'):
|
||||
with closing(open('llama.log', 'r')) as f:
|
||||
for line in f:
|
||||
print(line)
|
||||
if not is_server_listening(context.server_fqdn, context.server_port):
|
||||
print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")
|
||||
try:
|
||||
if 'server_process' not in context or context.server_process is None:
|
||||
return
|
||||
if scenario.status == "failed":
|
||||
if 'GITHUB_ACTIONS' in os.environ:
|
||||
print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n")
|
||||
if os.path.isfile('llama.log'):
|
||||
with closing(open('llama.log', 'r')) as f:
|
||||
for line in f:
|
||||
print(line)
|
||||
if not is_server_listening(context.server_fqdn, context.server_port):
|
||||
print("\x1b[33;101mERROR: Server stopped listening\x1b[0m")
|
||||
|
||||
if not pid_exists(context.server_process.pid):
|
||||
assert False, f"Server not running pid={context.server_process.pid} ..."
|
||||
if context.server_process.poll() is not None:
|
||||
assert False, f"Server not running pid={context.server_process.pid} ..."
|
||||
|
||||
server_graceful_shutdown(context)
|
||||
server_graceful_shutdown(context) # SIGINT
|
||||
|
||||
# Wait few for socket to free up
|
||||
time.sleep(0.05)
|
||||
try:
|
||||
context.server_process.wait(0.5)
|
||||
except TimeoutExpired:
|
||||
print(f"server still alive after 500ms, force-killing pid={context.server_process.pid} ...")
|
||||
context.server_process.kill() # SIGKILL
|
||||
context.server_process.wait()
|
||||
|
||||
attempts = 0
|
||||
while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
|
||||
server_kill(context)
|
||||
time.sleep(0.1)
|
||||
attempts += 1
|
||||
if attempts > 5:
|
||||
server_kill_hard(context)
|
||||
while is_server_listening(context.server_fqdn, context.server_port):
|
||||
time.sleep(0.1)
|
||||
except Exception:
|
||||
print("ignoring error in after_scenario:")
|
||||
traceback.print_exc(file=sys.stdout)
|
||||
|
||||
|
||||
def server_graceful_shutdown(context):
|
||||
print(f"shutting down server pid={context.server_process.pid} ...\n")
|
||||
print(f"shutting down server pid={context.server_process.pid} ...")
|
||||
if os.name == 'nt':
|
||||
os.kill(context.server_process.pid, signal.CTRL_C_EVENT)
|
||||
interrupt = signal.CTRL_C_EVENT
|
||||
else:
|
||||
os.kill(context.server_process.pid, signal.SIGINT)
|
||||
|
||||
|
||||
def server_kill(context):
|
||||
print(f"killing server pid={context.server_process.pid} ...\n")
|
||||
context.server_process.kill()
|
||||
|
||||
|
||||
def server_kill_hard(context):
|
||||
pid = context.server_process.pid
|
||||
path = context.server_path
|
||||
|
||||
print(f"Server dangling exits, hard killing force {pid}={path}...\n")
|
||||
if os.name == 'nt':
|
||||
process = subprocess.check_output(['taskkill', '/F', '/pid', str(pid)]).decode()
|
||||
print(process)
|
||||
else:
|
||||
os.kill(-pid, signal.SIGKILL)
|
||||
interrupt = signal.SIGINT
|
||||
context.server_process.send_signal(interrupt)
|
||||
|
||||
|
||||
def is_server_listening(server_fqdn, server_port):
|
||||
@@ -79,22 +67,5 @@ def is_server_listening(server_fqdn, server_port):
|
||||
result = sock.connect_ex((server_fqdn, server_port))
|
||||
_is_server_listening = result == 0
|
||||
if _is_server_listening:
|
||||
print(f"server is listening on {server_fqdn}:{server_port}...\n")
|
||||
print(f"server is listening on {server_fqdn}:{server_port}...")
|
||||
return _is_server_listening
|
||||
|
||||
|
||||
def pid_exists(pid):
|
||||
"""Check whether pid exists in the current process table."""
|
||||
if pid < 0:
|
||||
return False
|
||||
if os.name == 'nt':
|
||||
output = subprocess.check_output(['TASKLIST', '/FI', f'pid eq {pid}']).decode()
|
||||
print(output)
|
||||
return "No tasks are running" not in output
|
||||
else:
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
except OSError as e:
|
||||
return e.errno == errno.EPERM
|
||||
else:
|
||||
return True
|
||||
|
||||
@@ -4,7 +4,8 @@ Feature: llama.cpp server
|
||||
|
||||
Background: Server startup
|
||||
Given a server listening on localhost:8080
|
||||
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
|
||||
And a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf
|
||||
And a model file stories260K.gguf
|
||||
And a model alias tinyllama-2
|
||||
And 42 as server seed
|
||||
# KV Cache corresponds to the total amount of tokens
|
||||
@@ -34,9 +35,9 @@ Feature: llama.cpp server
|
||||
And metric llamacpp:tokens_predicted is <n_predicted>
|
||||
|
||||
Examples: Prompts
|
||||
| prompt | n_predict | re_content | n_prompt | n_predicted | truncated |
|
||||
| I believe the meaning of life is | 8 | (read\|going)+ | 18 | 8 | not |
|
||||
| Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids)+ | 46 | 64 | not |
|
||||
| prompt | n_predict | re_content | n_prompt | n_predicted | truncated |
|
||||
| I believe the meaning of life is | 8 | (read\|going)+ | 18 | 8 | not |
|
||||
| Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 46 | 64 | not |
|
||||
|
||||
Scenario: Completion prompt truncated
|
||||
Given a prompt:
|
||||
@@ -47,7 +48,7 @@ Feature: llama.cpp server
|
||||
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
||||
"""
|
||||
And a completion request with no api error
|
||||
Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry
|
||||
Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
|
||||
And the completion is truncated
|
||||
And 109 prompt tokens are processed
|
||||
|
||||
@@ -64,9 +65,9 @@ Feature: llama.cpp server
|
||||
And the completion is <truncated> truncated
|
||||
|
||||
Examples: Prompts
|
||||
| model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated |
|
||||
| llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | not |
|
||||
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird)+ | -1 | 64 | enabled | |
|
||||
| model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated |
|
||||
| llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | not |
|
||||
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled | |
|
||||
|
||||
|
||||
Scenario: Tokenize / Detokenize
|
||||
|
||||
@@ -5,6 +5,8 @@ import os
|
||||
import re
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from contextlib import closing
|
||||
from re import RegexFlag
|
||||
@@ -22,22 +24,27 @@ from prometheus_client import parser
|
||||
def step_server_config(context, server_fqdn, server_port):
|
||||
context.server_fqdn = server_fqdn
|
||||
context.server_port = int(server_port)
|
||||
context.n_gpu_layer = None
|
||||
if 'PORT' in os.environ:
|
||||
context.server_port = int(os.environ['PORT'])
|
||||
print(f"$PORT set, overriding server port with to {context.server_port}")
|
||||
if 'FQDN' in os.environ:
|
||||
context.server_fqdn = os.environ['FQDN']
|
||||
print(f"$FQDN set, overriding server fqdn with to {context.server_fqdn}")
|
||||
if 'N_GPU_LAYERS' in os.environ:
|
||||
context.n_gpu_layer = int(os.environ['N_GPU_LAYERS'])
|
||||
print(f"$N_GPU_LAYERS set, overriding n_gpu_layer with to {context.n_gpu_layer}")
|
||||
|
||||
context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
|
||||
|
||||
context.model_alias = None
|
||||
context.model_file = None
|
||||
context.model_url = None
|
||||
context.n_batch = None
|
||||
context.n_ubatch = None
|
||||
context.n_ctx = None
|
||||
context.n_ga = None
|
||||
context.n_ga_w = None
|
||||
context.n_gpu_layer = None
|
||||
context.n_predict = None
|
||||
context.n_prompts = 0
|
||||
context.n_server_predict = None
|
||||
@@ -62,7 +69,17 @@ def step_server_config(context, server_fqdn, server_port):
|
||||
def step_download_hf_model(context, hf_file, hf_repo):
|
||||
context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file)
|
||||
if context.debug:
|
||||
print(f"model file: {context.model_file}\n")
|
||||
print(f"model file: {context.model_file}")
|
||||
|
||||
|
||||
@step('a model file {model_file}')
|
||||
def step_model_file(context, model_file):
|
||||
context.model_file = model_file
|
||||
|
||||
|
||||
@step('a model url {model_url}')
|
||||
def step_model_url(context, model_url):
|
||||
context.model_url = model_url
|
||||
|
||||
|
||||
@step('a model alias {model_alias}')
|
||||
@@ -123,9 +140,12 @@ def step_start_server(context):
|
||||
if 'GITHUB_ACTIONS' in os.environ:
|
||||
max_attempts *= 2
|
||||
|
||||
addrs = socket.getaddrinfo(context.server_fqdn, context.server_port, type=socket.SOCK_STREAM)
|
||||
family, typ, proto, _, sockaddr = addrs[0]
|
||||
|
||||
while True:
|
||||
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
|
||||
result = sock.connect_ex((context.server_fqdn, context.server_port))
|
||||
with closing(socket.socket(family, typ, proto)) as sock:
|
||||
result = sock.connect_ex(sockaddr)
|
||||
if result == 0:
|
||||
print("\x1b[33;46mserver started!\x1b[0m")
|
||||
return
|
||||
@@ -141,7 +161,8 @@ def step_start_server(context):
|
||||
async def step_wait_for_the_server_to_be_started(context, expecting_status):
|
||||
match expecting_status:
|
||||
case 'healthy':
|
||||
await wait_for_health_status(context, context.base_url, 200, 'ok')
|
||||
await wait_for_health_status(context, context.base_url, 200, 'ok',
|
||||
timeout=30)
|
||||
|
||||
case 'ready' | 'idle':
|
||||
await wait_for_health_status(context, context.base_url, 200, 'ok',
|
||||
@@ -194,7 +215,7 @@ async def step_request_completion(context, api_error):
|
||||
user_api_key=context.user_api_key)
|
||||
context.tasks_result.append(completion)
|
||||
if context.debug:
|
||||
print(f"Completion response: {completion}\n")
|
||||
print(f"Completion response: {completion}")
|
||||
if expect_api_error:
|
||||
assert completion == 401, f"completion must be an 401 status code: {completion}"
|
||||
|
||||
@@ -339,7 +360,7 @@ def step_prompt_passkey(context, passkey, i_pos):
|
||||
prompt += context.prompt_junk_suffix
|
||||
if context.debug:
|
||||
passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
|
||||
print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
|
||||
print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```")
|
||||
context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)
|
||||
context.n_prompts = len(context.prompts)
|
||||
|
||||
@@ -348,7 +369,7 @@ def step_prompt_passkey(context, passkey, i_pos):
|
||||
@async_run_until_complete
|
||||
async def step_oai_chat_completions(context, api_error):
|
||||
if context.debug:
|
||||
print(f"Submitting OAI compatible completions request...\n")
|
||||
print(f"Submitting OAI compatible completions request...")
|
||||
expect_api_error = api_error == 'raised'
|
||||
completion = await oai_chat_completions(context.prompts.pop(),
|
||||
context.system_prompt,
|
||||
@@ -493,12 +514,12 @@ async def step_all_embeddings_are_the_same(context):
|
||||
embedding1 = np.array(embeddings[i])
|
||||
embedding2 = np.array(embeddings[j])
|
||||
if context.debug:
|
||||
print(f"embedding1: {embedding1[-8:]}\n")
|
||||
print(f"embedding2: {embedding2[-8:]}\n")
|
||||
print(f"embedding1: {embedding1[-8:]}")
|
||||
print(f"embedding2: {embedding2[-8:]}")
|
||||
similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
|
||||
msg = f"Similarity between {i} and {j}: {similarity:.10f}"
|
||||
if context.debug:
|
||||
print(f"{msg}\n")
|
||||
print(f"{msg}")
|
||||
assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg
|
||||
|
||||
|
||||
@@ -615,7 +636,7 @@ async def step_prometheus_metrics_exported(context):
|
||||
metrics_raw = await metrics_response.text()
|
||||
metric_exported = False
|
||||
if context.debug:
|
||||
print(f"/metrics answer:\n{metrics_raw}\n")
|
||||
print(f"/metrics answer:\n{metrics_raw}")
|
||||
context.metrics = {}
|
||||
for metric in parser.text_string_to_metric_families(metrics_raw):
|
||||
match metric.name:
|
||||
@@ -917,7 +938,7 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
|
||||
last_match = end
|
||||
highlighted += content[last_match:]
|
||||
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
|
||||
print(f"Checking completion response: {highlighted}\n")
|
||||
print(f"Checking completion response: {highlighted}")
|
||||
assert last_match > 0, f'/{re_content}/ must match ```{highlighted}```'
|
||||
if expected_predicted_n and expected_predicted_n > 0:
|
||||
assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
|
||||
@@ -927,7 +948,7 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
|
||||
async def gather_tasks_results(context):
|
||||
n_tasks = len(context.concurrent_tasks)
|
||||
if context.debug:
|
||||
print(f"Waiting for all {n_tasks} tasks results...\n")
|
||||
print(f"Waiting for all {n_tasks} tasks results...")
|
||||
for task_no in range(n_tasks):
|
||||
context.tasks_result.append(await context.concurrent_tasks.pop())
|
||||
n_completions = len(context.tasks_result)
|
||||
@@ -944,7 +965,7 @@ async def wait_for_health_status(context,
|
||||
slots_processing=None,
|
||||
expected_slots=None):
|
||||
if context.debug:
|
||||
print(f"Starting checking for health for expected_health_status={expected_health_status}\n")
|
||||
print(f"Starting checking for health for expected_health_status={expected_health_status}")
|
||||
interval = 0.5
|
||||
counter = 0
|
||||
if 'GITHUB_ACTIONS' in os.environ:
|
||||
@@ -1033,13 +1054,14 @@ def start_server_background(context):
|
||||
if 'LLAMA_SERVER_BIN_PATH' in os.environ:
|
||||
context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
|
||||
server_listen_addr = context.server_fqdn
|
||||
if os.name == 'nt':
|
||||
server_listen_addr = '0.0.0.0'
|
||||
server_args = [
|
||||
'--host', server_listen_addr,
|
||||
'--port', context.server_port,
|
||||
'--model', context.model_file
|
||||
]
|
||||
if context.model_file:
|
||||
server_args.extend(['--model', context.model_file])
|
||||
if context.model_url:
|
||||
server_args.extend(['--model-url', context.model_url])
|
||||
if context.n_batch:
|
||||
server_args.extend(['--batch-size', context.n_batch])
|
||||
if context.n_ubatch:
|
||||
@@ -1070,7 +1092,7 @@ def start_server_background(context):
|
||||
server_args.append('--verbose')
|
||||
if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
|
||||
server_args.extend(['--log-format', "text"])
|
||||
print(f"starting server with: {context.server_path} {server_args}\n")
|
||||
print(f"starting server with: {context.server_path} {server_args}")
|
||||
flags = 0
|
||||
if 'nt' == os.name:
|
||||
flags |= subprocess.DETACHED_PROCESS
|
||||
@@ -1079,8 +1101,23 @@ def start_server_background(context):
|
||||
|
||||
pkwargs = {
|
||||
'creationflags': flags,
|
||||
'stdout': subprocess.PIPE,
|
||||
'stderr': subprocess.PIPE
|
||||
}
|
||||
context.server_process = subprocess.Popen(
|
||||
[str(arg) for arg in [context.server_path, *server_args]],
|
||||
**pkwargs)
|
||||
|
||||
def log_stdout(process):
|
||||
for line in iter(process.stdout.readline, b''):
|
||||
print(line.decode('utf-8'), end='')
|
||||
thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,))
|
||||
thread_stdout.start()
|
||||
|
||||
def log_stderr(process):
|
||||
for line in iter(process.stderr.readline, b''):
|
||||
print(line.decode('utf-8'), end='', file=sys.stderr)
|
||||
thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,))
|
||||
thread_stderr.start()
|
||||
|
||||
print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
|
||||
|
||||
@@ -371,6 +371,7 @@ static json oaicompat_completion_params_parse(
|
||||
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
|
||||
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
|
||||
llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z);
|
||||
llama_params["n_keep"] = json_value(body, "n_keep", 0);
|
||||
|
||||
if (body.count("grammar") != 0) {
|
||||
llama_params["grammar"] = json_value(body, "grammar", json::object());
|
||||
|
||||
@@ -13,8 +13,11 @@ source /opt/intel/oneapi/setvars.sh
|
||||
#for FP32
|
||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build example/main only
|
||||
#build example/main
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
#build example/llama-bench
|
||||
#cmake --build . --config Release --target llama-bench
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -v
|
||||
|
||||
@@ -9,18 +9,28 @@ source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
if [ $# -gt 0 ]; then
|
||||
GGML_SYCL_DEVICE=$1
|
||||
GGML_SYCL_SINGLE_GPU=1
|
||||
else
|
||||
GGML_SYCL_DEVICE=0
|
||||
fi
|
||||
echo "use $GGML_SYCL_DEVICE as main GPU"
|
||||
|
||||
#export GGML_SYCL_DEBUG=1
|
||||
|
||||
|
||||
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
|
||||
|
||||
#use all GPUs with same max compute units
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
|
||||
echo "use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use signle GPU only
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
else
|
||||
#use multiple GPUs with same max compute units
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
fi
|
||||
|
||||
#use main GPU only
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
|
||||
#use multiple GPUs with same max compute units
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
|
||||
|
||||
@@ -6,8 +6,6 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
||||
|
||||
set GGML_SYCL_DEVICE=0
|
||||
rem set GGML_SYCL_DEBUG=1
|
||||
.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
|
||||
|
||||
|
||||
|
||||
6
flake.lock
generated
6
flake.lock
generated
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1709703039,
|
||||
"narHash": "sha256-6hqgQ8OK6gsMu1VtcGKBxKQInRLHtzulDo9Z5jxHEFY=",
|
||||
"lastModified": 1710451336,
|
||||
"narHash": "sha256-pP86Pcfu3BrAvRO7R64x7hs+GaQrjFes+mEPowCfkxY=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "9df3e30ce24fd28c7b3e2de0d986769db5d6225d",
|
||||
"rev": "d691274a972b3165335d261cc4671335f5c67de9",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
10
ggml-alloc.c
10
ggml-alloc.c
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
|
||||
if (ggml_is_view(node)) {
|
||||
// TODO: better way to add external dependencies
|
||||
// GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
|
||||
// control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
|
||||
// itself is never used and should not be considered a dependency
|
||||
if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
|
||||
struct ggml_tensor * view_src = node->view_src;
|
||||
ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
|
||||
}
|
||||
@@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
||||
|
||||
ggml_gallocr_hash_get(galloc, src)->n_children += 1;
|
||||
|
||||
// allocate explicit inputs and leafs
|
||||
if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
|
||||
// allocate explicit inputs
|
||||
if (src->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -103,6 +103,11 @@ extern "C" {
|
||||
// check if the backend supports an operation
|
||||
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
|
||||
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
|
||||
// these should be expensive operations with large batch sizes that may benefit from running on this backend
|
||||
// even if the weight has to be copied from the CPU temporarily
|
||||
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
|
||||
// (optional) event synchronization
|
||||
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
|
||||
void (*GGML_CALL event_free) (ggml_backend_event_t event);
|
||||
|
||||
278
ggml-backend.c
278
ggml-backend.c
@@ -278,7 +278,7 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
|
||||
return err;
|
||||
}
|
||||
|
||||
bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
return backend->iface.graph_compute(backend, cgraph);
|
||||
}
|
||||
|
||||
@@ -286,6 +286,13 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
|
||||
return backend->iface.supports_op(backend, op);
|
||||
}
|
||||
|
||||
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
if (backend->iface.offload_op != NULL) {
|
||||
return backend->iface.offload_op(backend, op);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// backend copy
|
||||
|
||||
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
||||
@@ -761,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
|
||||
|
||||
if (cpu_plan->cplan.work_size > 0) {
|
||||
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
|
||||
if (cpu_plan->cplan.work_data == NULL) {
|
||||
free(cpu_plan);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
|
||||
@@ -834,6 +845,7 @@ static struct ggml_backend_i cpu_backend_i = {
|
||||
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
||||
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
@@ -999,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) {
|
||||
#endif
|
||||
|
||||
#ifndef GGML_SCHED_MAX_SPLITS
|
||||
#define GGML_SCHED_MAX_SPLITS 256
|
||||
#define GGML_SCHED_MAX_SPLITS 2048
|
||||
#endif
|
||||
|
||||
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
|
||||
#define GGML_SCHED_MAX_SPLIT_INPUTS 16
|
||||
#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
|
||||
#endif
|
||||
|
||||
#ifndef GGML_SCHED_MAX_COPIES
|
||||
@@ -1043,8 +1055,9 @@ struct ggml_backend_sched {
|
||||
struct ggml_cgraph * graph;
|
||||
|
||||
// graph splits
|
||||
struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
|
||||
struct ggml_backend_sched_split * splits;
|
||||
int n_splits;
|
||||
int splits_capacity;
|
||||
|
||||
// pipeline parallelism support
|
||||
int n_copies;
|
||||
@@ -1114,40 +1127,48 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
||||
// TODO: use supports_op to check if the backend supports the op
|
||||
|
||||
// assign pre-allocated nodes to their backend
|
||||
// dst
|
||||
int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
|
||||
if (cur_backend != -1) {
|
||||
int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
|
||||
if (cur_backend_id != -1) {
|
||||
SET_CAUSE(tensor, "1.dst");
|
||||
return cur_backend;
|
||||
return cur_backend_id;
|
||||
}
|
||||
|
||||
// view_src
|
||||
if (tensor->view_src != NULL) {
|
||||
cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
|
||||
if (cur_backend != -1) {
|
||||
cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
|
||||
if (cur_backend_id != -1) {
|
||||
SET_CAUSE(tensor, "1.vsrc");
|
||||
return cur_backend;
|
||||
return cur_backend_id;
|
||||
}
|
||||
}
|
||||
|
||||
// input
|
||||
// graph input
|
||||
if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
|
||||
cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
|
||||
SET_CAUSE(tensor, "1.inp");
|
||||
return cur_backend;
|
||||
return cur_backend_id;
|
||||
}
|
||||
|
||||
// assign nodes that use weights to the backend of the weights
|
||||
// operations with weights are preferably run on the same backend as the weights
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
const struct ggml_tensor * src = tensor->src[i];
|
||||
if (src == NULL) {
|
||||
continue;
|
||||
}
|
||||
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||
int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
|
||||
// operations with weights are always run on the same backend as the weights
|
||||
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
|
||||
// check if a backend with higher prio wants to offload the op
|
||||
if (src_backend_id == sched->n_backends - 1) {
|
||||
for (int b = 0; b < src_backend_id; b++) {
|
||||
if (ggml_backend_offload_op(sched->backends[b], tensor)) {
|
||||
SET_CAUSE(tensor, "1.off");
|
||||
return b;
|
||||
}
|
||||
}
|
||||
}
|
||||
SET_CAUSE(tensor, "1.wgt%d", i);
|
||||
return src_backend;
|
||||
return src_backend_id;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1227,28 +1248,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
// pass 1: assign backends to ops with pre-allocated inputs
|
||||
for (int i = 0; i < graph->n_leafs; i++) {
|
||||
struct ggml_tensor * leaf = graph->leafs[i];
|
||||
if (tensor_backend_id(leaf) != -1) {
|
||||
int * leaf_backend_id = &tensor_backend_id(leaf);
|
||||
if (*leaf_backend_id != -1) {
|
||||
// do not overwrite user assignments
|
||||
continue;
|
||||
}
|
||||
tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
|
||||
*leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
|
||||
}
|
||||
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
if (tensor_backend_id(node) != -1) {
|
||||
int * node_backend_id = &tensor_backend_id(node);
|
||||
if (*node_backend_id != -1) {
|
||||
// do not overwrite user assignments
|
||||
continue;
|
||||
}
|
||||
tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
|
||||
*node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
|
||||
// src
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * src = node->src[j];
|
||||
if (src == NULL) {
|
||||
continue;
|
||||
}
|
||||
if (tensor_backend_id(src) == -1) {
|
||||
tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
|
||||
int * src_backend_id = &tensor_backend_id(src);
|
||||
if (*src_backend_id == -1) {
|
||||
*src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1270,21 +1294,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
if (ggml_is_view_op(node->op)) {
|
||||
continue;
|
||||
}
|
||||
int tensor_backend_id = tensor_backend_id(node);
|
||||
if (tensor_backend_id != -1) {
|
||||
if (tensor_backend_id == sched->n_backends - 1) {
|
||||
int * node_backend_id = &tensor_backend_id(node);
|
||||
if (*node_backend_id != -1) {
|
||||
if (*node_backend_id == sched->n_backends - 1) {
|
||||
// skip cpu (lowest prio backend)
|
||||
cur_backend_id = -1;
|
||||
} else {
|
||||
cur_backend_id = tensor_backend_id;
|
||||
cur_backend_id = *node_backend_id;
|
||||
}
|
||||
} else {
|
||||
tensor_backend_id(node) = cur_backend_id;
|
||||
*node_backend_id = cur_backend_id;
|
||||
SET_CAUSE(node, "2.2");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// pass 2.1 expand gpu up
|
||||
{
|
||||
int cur_backend_id = -1;
|
||||
@@ -1293,22 +1316,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
if (ggml_is_view_op(node->op)) {
|
||||
continue;
|
||||
}
|
||||
int tensor_backend_id = tensor_backend_id(node);
|
||||
if (tensor_backend_id != -1) {
|
||||
if (tensor_backend_id == sched->n_backends - 1) {
|
||||
int * node_backend_id = &tensor_backend_id(node);
|
||||
if (*node_backend_id != -1) {
|
||||
if (*node_backend_id == sched->n_backends - 1) {
|
||||
// skip cpu (lowest prio backend)
|
||||
cur_backend_id = -1;
|
||||
} else {
|
||||
cur_backend_id = tensor_backend_id;
|
||||
cur_backend_id = *node_backend_id;
|
||||
}
|
||||
} else {
|
||||
tensor_backend_id(node) = cur_backend_id;
|
||||
*node_backend_id = cur_backend_id;
|
||||
SET_CAUSE(node, "2.1");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// pass 2.4 expand rest down
|
||||
{
|
||||
int cur_backend_id = -1;
|
||||
@@ -1317,16 +1338,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
if (ggml_is_view_op(node->op)) {
|
||||
continue;
|
||||
}
|
||||
int tensor_backend_id = tensor_backend_id(node);
|
||||
if (tensor_backend_id != -1) {
|
||||
cur_backend_id = tensor_backend_id;
|
||||
int * node_backend_id = &tensor_backend_id(node);
|
||||
if (*node_backend_id != -1) {
|
||||
cur_backend_id = *node_backend_id;
|
||||
} else {
|
||||
tensor_backend_id(node) = cur_backend_id;
|
||||
*node_backend_id = cur_backend_id;
|
||||
SET_CAUSE(node, "2.4");
|
||||
}
|
||||
}
|
||||
}
|
||||
// pass 2.3 expand rest up
|
||||
// pass 2.3 expand rest up
|
||||
{
|
||||
int cur_backend_id = -1;
|
||||
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
||||
@@ -1334,11 +1355,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
if (ggml_is_view_op(node->op)) {
|
||||
continue;
|
||||
}
|
||||
int tensor_backend_id = tensor_backend_id(node);
|
||||
if (tensor_backend_id != -1) {
|
||||
cur_backend_id = tensor_backend_id;
|
||||
int * node_backend_id = &tensor_backend_id(node);
|
||||
if (*node_backend_id != -1) {
|
||||
cur_backend_id = *node_backend_id;
|
||||
} else {
|
||||
tensor_backend_id(node) = cur_backend_id;
|
||||
*node_backend_id = cur_backend_id;
|
||||
SET_CAUSE(node, "2.3");
|
||||
}
|
||||
}
|
||||
@@ -1351,9 +1372,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
// pass 3: assign backends to remaining src from dst and view_src
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
int cur_backend_id = tensor_backend_id(node);
|
||||
if (node->view_src != NULL && cur_backend_id == -1) {
|
||||
cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
|
||||
int * cur_backend_id = &tensor_backend_id(node);
|
||||
if (node->view_src != NULL && *cur_backend_id == -1) {
|
||||
*cur_backend_id = tensor_backend_id(node->view_src);
|
||||
SET_CAUSE(node, "3.vsrc");
|
||||
}
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
@@ -1361,14 +1382,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
if (src == NULL) {
|
||||
continue;
|
||||
}
|
||||
int src_backend_id = tensor_backend_id(src);
|
||||
if (src_backend_id == -1) {
|
||||
int * src_backend_id = &tensor_backend_id(src);
|
||||
if (*src_backend_id == -1) {
|
||||
if (src->view_src != NULL) {
|
||||
// views are always on the same backend as the source
|
||||
tensor_backend_id(src) = tensor_backend_id(src->view_src);
|
||||
*src_backend_id = tensor_backend_id(src->view_src);
|
||||
SET_CAUSE(src, "3.vsrc");
|
||||
} else {
|
||||
tensor_backend_id(src) = cur_backend_id;
|
||||
*src_backend_id = *cur_backend_id;
|
||||
SET_CAUSE(src, "3.cur");
|
||||
}
|
||||
}
|
||||
@@ -1380,19 +1401,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
|
||||
// pass 4: split graph, find tensors that need to be copied
|
||||
{
|
||||
int cur_split = 0;
|
||||
int i_split = 0;
|
||||
struct ggml_backend_sched_split * split = &sched->splits[0];
|
||||
// find the backend of the first split, skipping view ops
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
if (!ggml_is_view_op(node->op)) {
|
||||
sched->splits[0].backend_id = tensor_backend_id(node);
|
||||
split->backend_id = tensor_backend_id(node);
|
||||
break;
|
||||
}
|
||||
}
|
||||
sched->splits[0].i_start = 0;
|
||||
sched->splits[0].n_inputs = 0;
|
||||
memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
|
||||
int cur_backend_id = sched->splits[0].backend_id;
|
||||
split->i_start = 0;
|
||||
split->n_inputs = 0;
|
||||
memset(split->inputs, 0, sizeof(split->inputs)); //HACK
|
||||
int cur_backend_id = split->backend_id;
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
|
||||
@@ -1400,18 +1422,54 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
continue;
|
||||
}
|
||||
|
||||
int tensor_backend_id = tensor_backend_id(node);
|
||||
const int node_backend_id = tensor_backend_id(node);
|
||||
|
||||
GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
|
||||
GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
|
||||
|
||||
if (tensor_backend_id != cur_backend_id) {
|
||||
sched->splits[cur_split].i_end = i;
|
||||
cur_split++;
|
||||
GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
|
||||
sched->splits[cur_split].backend_id = tensor_backend_id;
|
||||
sched->splits[cur_split].i_start = i;
|
||||
sched->splits[cur_split].n_inputs = 0;
|
||||
cur_backend_id = tensor_backend_id;
|
||||
// check if we should start a new split based on the sources of the current node
|
||||
bool need_new_split = false;
|
||||
if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * src = node->src[j];
|
||||
if (src == NULL) {
|
||||
continue;
|
||||
}
|
||||
// check if a weight is on a different backend
|
||||
// by starting a new split, the memory of the previously offloaded weights can be reused
|
||||
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||
int src_backend_id = tensor_backend_id(src);
|
||||
if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
|
||||
need_new_split = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// check if the split has too many inputs
|
||||
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
|
||||
const size_t id = hash_id(src);
|
||||
int src_backend_id = sched->tensor_backend_id[id];
|
||||
if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
|
||||
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
||||
need_new_split = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (node_backend_id != cur_backend_id || need_new_split) {
|
||||
split->i_end = i;
|
||||
i_split++;
|
||||
if (i_split >= sched->splits_capacity) {
|
||||
sched->splits_capacity *= 2;
|
||||
sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
|
||||
GGML_ASSERT(sched->splits != NULL);
|
||||
}
|
||||
GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
|
||||
split = &sched->splits[i_split];
|
||||
split->backend_id = node_backend_id;
|
||||
split->i_start = i;
|
||||
split->n_inputs = 0;
|
||||
cur_backend_id = node_backend_id;
|
||||
}
|
||||
|
||||
// find inputs that are not on the same backend
|
||||
@@ -1421,10 +1479,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
continue;
|
||||
}
|
||||
|
||||
int src_backend_id = tensor_backend_id(src);
|
||||
const int src_backend_id = tensor_backend_id(src);
|
||||
assert(src_backend_id != -1); // all inputs should be assigned by now
|
||||
|
||||
if (src->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
|
||||
size_t id = hash_id(src);
|
||||
if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
|
||||
ggml_backend_t backend = sched->backends[src_backend_id];
|
||||
@@ -1441,7 +1499,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
||||
}
|
||||
sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
|
||||
tensor_backend_id(tensor_copy) = src_backend_id;
|
||||
SET_CAUSE(tensor_copy, "4.cpy");
|
||||
}
|
||||
int n_graph_inputs = sched->n_graph_inputs++;
|
||||
@@ -1450,9 +1507,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
}
|
||||
}
|
||||
|
||||
if (src_backend_id != tensor_backend_id) {
|
||||
if (src_backend_id != node_backend_id) {
|
||||
// create a copy of the input in the split's backend
|
||||
size_t id = hash_id(src);
|
||||
const size_t id = hash_id(src);
|
||||
if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
|
||||
ggml_backend_t backend = sched->backends[cur_backend_id];
|
||||
for (int c = 0; c < sched->n_copies; c++) {
|
||||
@@ -1463,76 +1520,42 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
||||
}
|
||||
sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
|
||||
tensor_backend_id(tensor_copy) = cur_backend_id;
|
||||
SET_CAUSE(tensor_copy, "4.cpy");
|
||||
}
|
||||
int n_inputs = sched->splits[cur_split].n_inputs++;
|
||||
int n_inputs = split->n_inputs++;
|
||||
GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
|
||||
sched->splits[cur_split].inputs[n_inputs] = src;
|
||||
split->inputs[n_inputs] = src;
|
||||
}
|
||||
node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
|
||||
}
|
||||
}
|
||||
}
|
||||
sched->splits[cur_split].i_end = graph->n_nodes;
|
||||
sched->n_splits = cur_split + 1;
|
||||
split->i_end = graph->n_nodes;
|
||||
sched->n_splits = i_split + 1;
|
||||
}
|
||||
#ifdef DEBUG_PASS4
|
||||
fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
||||
#endif
|
||||
|
||||
#ifndef NDEBUG
|
||||
// sanity check: all sources should have the same backend as the node
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
|
||||
if (tensor_backend == NULL) {
|
||||
fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
|
||||
}
|
||||
if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
|
||||
fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
|
||||
node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
|
||||
node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
|
||||
ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
|
||||
}
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * src = node->src[j];
|
||||
if (src == NULL) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
|
||||
if (src_backend != tensor_backend /* && src_backend != NULL */) {
|
||||
fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
|
||||
node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
|
||||
j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
|
||||
}
|
||||
if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
|
||||
fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
|
||||
src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
|
||||
src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
|
||||
ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
|
||||
}
|
||||
}
|
||||
}
|
||||
fflush(stderr);
|
||||
#endif
|
||||
|
||||
// create copies of the graph for each split
|
||||
// TODO: avoid this copy
|
||||
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
|
||||
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
|
||||
for (int i = 0; i < sched->n_splits; i++) {
|
||||
struct ggml_backend_sched_split * split = &sched->splits[i];
|
||||
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
|
||||
|
||||
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
|
||||
for (int j = 0; j < split->n_inputs; j++) {
|
||||
assert(graph_copy->size > (graph_copy->n_nodes + 1));
|
||||
|
||||
struct ggml_tensor * input = split->inputs[j];
|
||||
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
|
||||
const size_t input_id = hash_id(input);
|
||||
struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
|
||||
|
||||
// add a dependency to the input source so that it is not freed before the copy is done
|
||||
struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
|
||||
input_dep->src[0] = input;
|
||||
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
|
||||
sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
|
||||
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
|
||||
|
||||
// add a dependency to the input copy so that it is allocated at the start of the split
|
||||
@@ -1541,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
}
|
||||
|
||||
for (int j = split->i_start; j < split->i_end; j++) {
|
||||
assert(graph_copy->size > graph_copy->n_nodes);
|
||||
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
|
||||
graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
|
||||
}
|
||||
@@ -1625,13 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
}
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
} else {
|
||||
// wait for the split backend to finish using the input before overwriting it
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
||||
} else {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
ggml_backend_synchronize(input_backend);
|
||||
}
|
||||
|
||||
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
|
||||
}
|
||||
}
|
||||
@@ -1701,17 +1724,21 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
|
||||
|
||||
// initialize hash table
|
||||
sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
|
||||
sched->hash_set = ggml_hash_set_new(graph_size);
|
||||
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
|
||||
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
|
||||
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
|
||||
sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
|
||||
|
||||
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
||||
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
|
||||
sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
|
||||
|
||||
sched->n_backends = n_backends;
|
||||
|
||||
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
||||
|
||||
GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
|
||||
const int initial_splits_capacity = 16;
|
||||
sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
|
||||
sched->splits_capacity = initial_splits_capacity;
|
||||
|
||||
for (int b = 0; b < n_backends; b++) {
|
||||
sched->backends[b] = backends[b];
|
||||
@@ -1742,6 +1769,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
}
|
||||
ggml_gallocr_free(sched->galloc);
|
||||
ggml_free(sched->ctx);
|
||||
free(sched->splits);
|
||||
free(sched->hash_set.keys);
|
||||
free(sched->tensor_backend_id);
|
||||
free(sched->tensor_copies);
|
||||
@@ -1762,6 +1790,8 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
||||
}
|
||||
|
||||
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
||||
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
|
||||
|
||||
ggml_backend_sched_split_graph(sched, measure_graph);
|
||||
|
||||
// TODO: extract this to a separate function
|
||||
@@ -1776,7 +1806,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||
}
|
||||
|
||||
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
|
||||
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
|
||||
|
||||
ggml_backend_sched_split_graph(sched, graph);
|
||||
|
||||
|
||||
@@ -70,11 +70,11 @@ extern "C" {
|
||||
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
|
||||
GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
|
||||
GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
|
||||
// tensor copy between different backends
|
||||
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
3762
ggml-cuda.cu
3762
ggml-cuda.cu
File diff suppressed because it is too large
Load Diff
21
ggml-cuda.h
21
ggml-cuda.h
@@ -17,29 +17,17 @@ extern "C" {
|
||||
|
||||
#define GGML_CUDA_MAX_DEVICES 16
|
||||
|
||||
// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
|
||||
GGML_API GGML_CALL void ggml_init_cublas(void);
|
||||
|
||||
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
|
||||
GGML_API GGML_CALL bool ggml_cublas_loaded(void);
|
||||
|
||||
GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
|
||||
GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);
|
||||
|
||||
GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API GGML_CALL int ggml_cuda_get_device_count(void);
|
||||
GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||
|
||||
// backend API
|
||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
|
||||
|
||||
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
|
||||
|
||||
// device buffer
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
||||
|
||||
// split tensor buffer that splits matrices by rows across multiple devices
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
|
||||
|
||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
||||
|
||||
@@ -47,6 +35,9 @@ GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
|
||||
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
||||
|
||||
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
|
||||
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1951,6 +1951,7 @@ static struct ggml_backend_i kompute_backend_i = {
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_kompute_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_kompute_supports_op,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
|
||||
@@ -2837,6 +2837,7 @@ static struct ggml_backend_i ggml_backend_metal_i = {
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_metal_supports_op,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
|
||||
344
ggml-sycl.cpp
344
ggml-sycl.cpp
@@ -16,6 +16,7 @@
|
||||
#include <cinttypes>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <float.h>
|
||||
#include <limits>
|
||||
#include <stdint.h>
|
||||
@@ -24,10 +25,9 @@
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <regex>
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <sycl/half_type.hpp>
|
||||
@@ -82,6 +82,30 @@ Following definition copied from DPCT head files, which are used by ggml-sycl.cp
|
||||
#define __dpct_noinline__ __attribute__((noinline))
|
||||
#endif
|
||||
|
||||
|
||||
std::string get_device_type_name(const sycl::device &Device) {
|
||||
auto DeviceType = Device.get_info<sycl::info::device::device_type>();
|
||||
switch (DeviceType) {
|
||||
case sycl::info::device_type::cpu:
|
||||
return "cpu";
|
||||
case sycl::info::device_type::gpu:
|
||||
return "gpu";
|
||||
case sycl::info::device_type::host:
|
||||
return "host";
|
||||
case sycl::info::device_type::accelerator:
|
||||
return "acc";
|
||||
default:
|
||||
return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
std::string get_device_backend_and_type(const sycl::device &device) {
|
||||
std::stringstream device_type;
|
||||
sycl::backend backend = device.get_backend();
|
||||
device_type << backend << ":" << get_device_type_name(device);
|
||||
return device_type.str();
|
||||
}
|
||||
|
||||
namespace dpct
|
||||
{
|
||||
typedef sycl::queue *queue_ptr;
|
||||
@@ -942,17 +966,65 @@ namespace dpct
|
||||
|
||||
private:
|
||||
mutable std::recursive_mutex m_mutex;
|
||||
static bool compare_dev(sycl::device &device1, sycl::device &device2)
|
||||
{
|
||||
dpct::device_info prop1;
|
||||
dpct::get_device_info(prop1, device1);
|
||||
dpct::device_info prop2;
|
||||
dpct::get_device_info(prop2, device2);
|
||||
return prop1.get_max_compute_units() > prop2.get_max_compute_units();
|
||||
}
|
||||
static int convert_backend_index(std::string & backend) {
|
||||
if (backend == "ext_oneapi_level_zero:gpu") return 0;
|
||||
if (backend == "opencl:gpu") return 1;
|
||||
if (backend == "opencl:cpu") return 2;
|
||||
if (backend == "opencl:acc") return 3;
|
||||
printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
static bool compare_backend(std::string &backend1, std::string &backend2) {
|
||||
return convert_backend_index(backend1) < convert_backend_index(backend2);
|
||||
}
|
||||
dev_mgr()
|
||||
{
|
||||
sycl::device default_device =
|
||||
sycl::device(sycl::default_selector_v);
|
||||
_devs.push_back(std::make_shared<device_ext>(default_device));
|
||||
|
||||
std::vector<sycl::device> sycl_all_devs =
|
||||
sycl::device::get_devices(sycl::info::device_type::all);
|
||||
std::vector<sycl::device> sycl_all_devs;
|
||||
// Collect other devices except for the default device.
|
||||
if (default_device.is_cpu())
|
||||
_cpu_device = 0;
|
||||
|
||||
auto Platforms = sycl::platform::get_platforms();
|
||||
// Keep track of the number of devices per backend
|
||||
std::map<sycl::backend, size_t> DeviceNums;
|
||||
std::map<std::string, std::vector<sycl::device>> backend_devices;
|
||||
|
||||
while (!Platforms.empty()) {
|
||||
auto Platform = Platforms.back();
|
||||
Platforms.pop_back();
|
||||
auto devices = Platform.get_devices();
|
||||
std::string backend_type = get_device_backend_and_type(devices[0]);
|
||||
for (const auto &device : devices) {
|
||||
backend_devices[backend_type].push_back(device);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> keys;
|
||||
for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
|
||||
keys.push_back(it->first);
|
||||
}
|
||||
std::sort(keys.begin(), keys.end(), compare_backend);
|
||||
|
||||
for (auto &key : keys) {
|
||||
std::vector<sycl::device> devs = backend_devices[key];
|
||||
std::sort(devs.begin(), devs.end(), compare_dev);
|
||||
for (const auto &dev : devs) {
|
||||
sycl_all_devs.push_back(dev);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto &dev : sycl_all_devs)
|
||||
{
|
||||
if (dev == default_device)
|
||||
@@ -3202,6 +3274,11 @@ static int g_work_group_size = 0;
|
||||
#define GGML_SYCL_MMV_Y 1
|
||||
#endif
|
||||
|
||||
enum ggml_sycl_backend_gpu_mode {
|
||||
SYCL_UNSET_GPU_MODE = -1,
|
||||
SYCL_SINGLE_GPU_MODE = 0,
|
||||
SYCL_MUL_GPU_MODE
|
||||
};
|
||||
|
||||
static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||
|
||||
@@ -3401,12 +3478,31 @@ class sycl_gpu_mgr {
|
||||
int work_group_size = 0;
|
||||
std::string gpus_list = "";
|
||||
|
||||
/*
|
||||
Use all GPUs with same top max compute units
|
||||
*/
|
||||
sycl_gpu_mgr() {
|
||||
detect_sycl_gpu_list_with_max_cu();
|
||||
get_allow_gpus();
|
||||
create_context_with_gpus();
|
||||
}
|
||||
|
||||
/*
|
||||
Only use the assigned GPU
|
||||
*/
|
||||
sycl_gpu_mgr(int main_gpu_id) {
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(prop, device);
|
||||
gpus.push_back(main_gpu_id);
|
||||
devices.push_back(device);
|
||||
work_group_size = prop.get_max_work_group_size();
|
||||
max_compute_units = prop.get_max_compute_units();
|
||||
|
||||
get_allow_gpus();
|
||||
create_context_with_gpus();
|
||||
}
|
||||
|
||||
void create_context_with_gpus() {
|
||||
sycl::context ctx = sycl::context(devices);
|
||||
assert(gpus.size() > 0);
|
||||
@@ -3422,7 +3518,7 @@ class sycl_gpu_mgr {
|
||||
gpus_list += std::to_string(gpus[i]);
|
||||
gpus_list += ",";
|
||||
}
|
||||
if (gpus_list.length() > 2) {
|
||||
if (gpus_list.length() > 1) {
|
||||
gpus_list.pop_back();
|
||||
}
|
||||
}
|
||||
@@ -3451,7 +3547,7 @@ class sycl_gpu_mgr {
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(prop, device);
|
||||
if (max_compute_units == prop.get_max_compute_units() &&
|
||||
prop.get_major_version() == 1) {
|
||||
is_ext_oneapi_device(device)) {
|
||||
gpus.push_back(id);
|
||||
devices.push_back(device);
|
||||
work_group_size = prop.get_max_work_group_size();
|
||||
@@ -3471,8 +3567,8 @@ class sycl_gpu_mgr {
|
||||
if (gpus[i] == id)
|
||||
return i;
|
||||
}
|
||||
assert(false);
|
||||
return -1;
|
||||
printf("miss to get device index by id=%d\n", id);
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
int get_next_index(int id) {
|
||||
@@ -3481,8 +3577,16 @@ class sycl_gpu_mgr {
|
||||
if (gpus[i] == id)
|
||||
return i;
|
||||
}
|
||||
assert(false);
|
||||
return -1;
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
bool is_ext_oneapi_device(const sycl::device &dev) {
|
||||
sycl::backend dev_backend = dev.get_backend();
|
||||
if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
|
||||
dev_backend == sycl::backend::ext_oneapi_cuda ||
|
||||
dev_backend == sycl::backend::ext_oneapi_hip)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -3491,11 +3595,14 @@ static int g_device_count = -1;
|
||||
static int g_all_sycl_device_count = -1;
|
||||
static int g_main_device = -1;
|
||||
static int g_main_device_id = -1;
|
||||
static bool g_ggml_backend_sycl_buffer_type_initialized = false;
|
||||
|
||||
static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
|
||||
|
||||
static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};
|
||||
|
||||
static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE;
|
||||
|
||||
struct sycl_device_capabilities {
|
||||
int cc; // compute capability
|
||||
bool vmm; // virtual memory support
|
||||
@@ -12999,17 +13106,20 @@ bool ggml_sycl_loaded(void) {
|
||||
return g_sycl_loaded;
|
||||
}
|
||||
|
||||
void print_device_detail(int id) {
|
||||
void print_device_detail(int id, sycl::device &device, std::string device_type) {
|
||||
|
||||
dpct::device_info prop;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
dpct::get_device_info(prop, dpct::dev_mgr::instance().get_device(id))));
|
||||
sycl::device cur_device = dpct::dev_mgr::instance().get_device(id);
|
||||
dpct::get_device_info(prop, device)));
|
||||
|
||||
std::string version;
|
||||
version += std::to_string(prop.get_major_version());
|
||||
version += ".";
|
||||
version += std::to_string(prop.get_minor_version());
|
||||
|
||||
fprintf(stderr, "|%2d|%45s|%18s|%17d|%14d|%13d|%15lu|\n", id,
|
||||
device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
|
||||
|
||||
fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
|
||||
prop.get_name(), version.c_str(), prop.get_max_compute_units(),
|
||||
prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
|
||||
prop.get_global_mem_size());
|
||||
@@ -13017,19 +13127,35 @@ void print_device_detail(int id) {
|
||||
|
||||
void ggml_backend_sycl_print_sycl_devices() {
|
||||
int device_count = dpct::dev_mgr::instance().device_count();
|
||||
std::map<std::string, size_t> DeviceNums;
|
||||
fprintf(stderr, "found %d SYCL devices:\n", device_count);
|
||||
fprintf(stderr, "|ID| Name |compute capability|Max compute units|Max work group|Max sub group|Global mem size|\n");
|
||||
fprintf(stderr, "|--|---------------------------------------------|------------------|-----------------|--------------|-------------|---------------|\n");
|
||||
fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n");
|
||||
fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n");
|
||||
fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
|
||||
for (int id = 0; id < device_count; ++id) {
|
||||
print_device_detail(id);
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
||||
sycl::backend backend = device.get_backend();
|
||||
std::string backend_type = get_device_backend_and_type(device);
|
||||
int type_id=DeviceNums[backend_type]++;
|
||||
std::stringstream device_type;
|
||||
device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
|
||||
print_device_detail(id, device, device_type.str());
|
||||
}
|
||||
}
|
||||
|
||||
void print_gpu_device_list() {
|
||||
fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
|
||||
g_sycl_gpu_mgr->get_gpu_count(),
|
||||
g_sycl_gpu_mgr->gpus_list.c_str(),
|
||||
g_sycl_gpu_mgr->max_compute_units);
|
||||
GGML_ASSERT(g_sycl_gpu_mgr);
|
||||
|
||||
char* hint=NULL;
|
||||
if (g_ggml_sycl_backend_gpu_mode == SYCL_SINGLE_GPU_MODE) {
|
||||
hint = "use %d SYCL GPUs: [%s] with Max compute units:%d\n";
|
||||
} else {
|
||||
hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n";
|
||||
}
|
||||
fprintf(stderr, hint,
|
||||
g_sycl_gpu_mgr->get_gpu_count(),
|
||||
g_sycl_gpu_mgr->gpus_list.c_str(),
|
||||
g_sycl_gpu_mgr->max_compute_units);
|
||||
}
|
||||
|
||||
int get_sycl_env(const char *env_name, int default_val) {
|
||||
@@ -13065,23 +13191,6 @@ void ggml_init_sycl() try {
|
||||
#else
|
||||
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
|
||||
#endif
|
||||
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
|
||||
dpct::dev_mgr::instance().device_count()) != 0) {
|
||||
initialized = true;
|
||||
g_sycl_loaded = false;
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
|
||||
ggml_backend_sycl_print_sycl_devices();
|
||||
|
||||
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
|
||||
|
||||
g_device_count = g_sycl_gpu_mgr->get_gpu_count();
|
||||
g_work_group_size = g_sycl_gpu_mgr->work_group_size;
|
||||
|
||||
print_gpu_device_list();
|
||||
|
||||
int64_t total_vram = 0;
|
||||
|
||||
/* NOT REMOVE, keep it for next optimize for XMX.
|
||||
#if defined(SYCL_USE_XMX)
|
||||
@@ -13090,49 +13199,15 @@ void ggml_init_sycl() try {
|
||||
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
|
||||
#endif
|
||||
*/
|
||||
for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
|
||||
g_device_caps[id].vmm = 0;
|
||||
g_device_caps[id].device_id = -1;
|
||||
g_device_caps[id].cc = 0;
|
||||
g_tensor_split[id] = 0;
|
||||
g_default_tensor_split[id] = 0;
|
||||
|
||||
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
|
||||
dpct::dev_mgr::instance().device_count()) != 0) {
|
||||
initialized = true;
|
||||
g_sycl_loaded = false;
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < g_device_count; ++i) {
|
||||
int device_id = g_sycl_gpu_mgr->gpus[i];
|
||||
g_device_caps[i].vmm = 0;
|
||||
|
||||
dpct::device_info prop;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||
prop, dpct::dev_mgr::instance().get_device(device_id))));
|
||||
|
||||
g_default_tensor_split[i] = total_vram;
|
||||
total_vram += prop.get_global_mem_size();
|
||||
|
||||
g_device_caps[i].cc =
|
||||
100 * prop.get_major_version() + 10 * prop.get_minor_version();
|
||||
}
|
||||
|
||||
for (int i = 0; i < g_device_count; ++i) {
|
||||
g_default_tensor_split[i] /= total_vram;
|
||||
}
|
||||
|
||||
for (int i = 0; i < g_device_count; ++i) {
|
||||
SYCL_CHECK(ggml_sycl_set_device(i));
|
||||
|
||||
// create sycl streams
|
||||
for (int is = 0; is < MAX_STREAMS; ++is) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
g_syclStreams[i][is] =
|
||||
dpct::get_current_device().create_queue(
|
||||
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
|
||||
}
|
||||
|
||||
const dpct::queue_ptr stream = g_syclStreams[i][0];
|
||||
// create sycl handle
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
|
||||
}
|
||||
|
||||
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
|
||||
ggml_backend_sycl_print_sycl_devices();
|
||||
initialized = true;
|
||||
g_sycl_loaded = true;
|
||||
}
|
||||
@@ -13143,6 +13218,63 @@ catch (sycl::exception const &exc) {
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
void ggml_init_by_gpus(int device_count) try {
|
||||
g_device_count = device_count;
|
||||
g_work_group_size = g_sycl_gpu_mgr->work_group_size;
|
||||
|
||||
int64_t total_vram = 0;
|
||||
|
||||
print_gpu_device_list();
|
||||
|
||||
for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
|
||||
g_device_caps[id].vmm = 0;
|
||||
g_device_caps[id].device_id = -1;
|
||||
g_device_caps[id].cc = 0;
|
||||
g_tensor_split[id] = 0;
|
||||
g_default_tensor_split[id] = 0;
|
||||
}
|
||||
|
||||
for (int i = 0; i < g_device_count; ++i) {
|
||||
int device_id = g_sycl_gpu_mgr->gpus[i];
|
||||
g_device_caps[i].vmm = 0;
|
||||
|
||||
dpct::device_info prop;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||
prop, dpct::dev_mgr::instance().get_device(device_id))));
|
||||
|
||||
g_default_tensor_split[i] = total_vram;
|
||||
total_vram += prop.get_global_mem_size();
|
||||
|
||||
g_device_caps[i].cc =
|
||||
100 * prop.get_major_version() + 10 * prop.get_minor_version();
|
||||
}
|
||||
|
||||
for (int i = 0; i < g_device_count; ++i) {
|
||||
g_default_tensor_split[i] /= total_vram;
|
||||
}
|
||||
|
||||
for (int i = 0; i < g_device_count; ++i) {
|
||||
SYCL_CHECK(ggml_sycl_set_device(i));
|
||||
|
||||
// create sycl streams
|
||||
for (int is = 0; is < MAX_STREAMS; ++is) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
g_syclStreams[i][is] =
|
||||
dpct::get_current_device().create_queue(
|
||||
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
|
||||
}
|
||||
|
||||
const dpct::queue_ptr stream = g_syclStreams[i][0];
|
||||
// create sycl handle
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
|
||||
}
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
void *ggml_sycl_host_malloc(size_t size) try {
|
||||
if (getenv("GGML_SYCL_NO_PINNED") != nullptr) {
|
||||
return nullptr;
|
||||
@@ -16542,22 +16674,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
|
||||
/* .is_host = */ nullptr,
|
||||
};
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
||||
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
|
||||
if (device_index>=g_device_count or device_index<0) {
|
||||
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
|
||||
device_index, g_device_count-1);
|
||||
GGML_ASSERT(device_index<g_device_count);
|
||||
}
|
||||
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
|
||||
|
||||
static bool ggml_backend_sycl_buffer_type_initialized = false;
|
||||
|
||||
if (!ggml_backend_sycl_buffer_type_initialized) {
|
||||
if (!g_ggml_backend_sycl_buffer_type_initialized) {
|
||||
for (int i = 0; i < g_device_count; i++) {
|
||||
ggml_backend_sycl_buffer_types[i] = {
|
||||
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
||||
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
|
||||
};
|
||||
}
|
||||
ggml_backend_sycl_buffer_type_initialized = true;
|
||||
g_ggml_backend_sycl_buffer_type_initialized = true;
|
||||
}
|
||||
|
||||
return &ggml_backend_sycl_buffer_types[device];
|
||||
return &ggml_backend_sycl_buffer_types[device_index];
|
||||
}
|
||||
|
||||
// sycl split buffer type
|
||||
@@ -17256,6 +17390,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_sycl_supports_op,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
@@ -17310,11 +17445,42 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
|
||||
return g_sycl_gpu_mgr->get_index(device_id);
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
|
||||
return g_sycl_gpu_mgr->gpus[device_index];
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
|
||||
GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
|
||||
fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
|
||||
if (g_sycl_gpu_mgr) {
|
||||
delete g_sycl_gpu_mgr;
|
||||
}
|
||||
g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
|
||||
g_ggml_sycl_backend_gpu_mode = SYCL_SINGLE_GPU_MODE;
|
||||
ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
|
||||
g_ggml_backend_sycl_buffer_type_initialized = false;
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
|
||||
if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(stderr, "ggml_backend_sycl_set_mul_device_mode: true\n");
|
||||
|
||||
if (g_sycl_gpu_mgr) {
|
||||
delete g_sycl_gpu_mgr;
|
||||
}
|
||||
g_sycl_gpu_mgr = new sycl_gpu_mgr();
|
||||
g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE;
|
||||
ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
|
||||
g_ggml_backend_sycl_buffer_type_initialized = false;
|
||||
}
|
||||
|
||||
extern "C" int ggml_backend_sycl_reg_devices();
|
||||
|
||||
int ggml_backend_sycl_reg_devices() {
|
||||
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
|
||||
g_device_count = g_sycl_gpu_mgr->get_gpu_count();
|
||||
ggml_backend_sycl_set_mul_device_mode();
|
||||
assert(g_device_count>0);
|
||||
for (int i = 0; i < g_device_count; i++) {
|
||||
int id = g_sycl_gpu_mgr->gpus[i];
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_SYCL_MAX_DEVICES 16
|
||||
#define GGML_SYCL_MAX_DEVICES 48
|
||||
#define GGML_SYCL_NAME "SYCL"
|
||||
|
||||
GGML_API void ggml_init_sycl(void);
|
||||
@@ -29,6 +29,11 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ
|
||||
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
|
||||
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
|
||||
|
||||
// TODO: these are temporary
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
|
||||
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
|
||||
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
|
||||
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -710,6 +710,12 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
|
||||
}
|
||||
}
|
||||
|
||||
// All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations.
|
||||
// Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional.
|
||||
if (compute_index >= 0) {
|
||||
return compute_index;
|
||||
}
|
||||
|
||||
std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;
|
||||
|
||||
for(auto &q_family : queue_family_props) {
|
||||
@@ -5693,6 +5699,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_vk_supports_op,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
|
||||
131
ggml.c
131
ggml.c
@@ -282,8 +282,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
||||
#else
|
||||
#include <cblas.h>
|
||||
#endif
|
||||
#elif defined(GGML_USE_CUBLAS)
|
||||
#include "ggml-cuda.h"
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
#include "ggml-opencl.h"
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
@@ -470,6 +468,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.type_size = sizeof(int32_t),
|
||||
.is_quantized = false,
|
||||
},
|
||||
[GGML_TYPE_I64] = {
|
||||
.type_name = "i64",
|
||||
.blck_size = 1,
|
||||
.type_size = sizeof(int64_t),
|
||||
.is_quantized = false,
|
||||
},
|
||||
[GGML_TYPE_F64] = {
|
||||
.type_name = "f64",
|
||||
.blck_size = 1,
|
||||
.type_size = sizeof(double),
|
||||
.is_quantized = false,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_F32] = {
|
||||
.type_name = "f32",
|
||||
.blck_size = 1,
|
||||
@@ -918,6 +929,101 @@ inline static float vaddvq_f32(float32x4_t v) {
|
||||
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
|
||||
#endif
|
||||
|
||||
#elif defined(__AVX512F__)
|
||||
|
||||
#define GGML_SIMD
|
||||
|
||||
// F32 AVX512
|
||||
|
||||
#define GGML_F32_STEP 64
|
||||
#define GGML_F32_EPR 16
|
||||
|
||||
#define GGML_F32x16 __m512
|
||||
#define GGML_F32x16_ZERO _mm512_setzero_ps()
|
||||
#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
|
||||
#define GGML_F32x16_LOAD _mm512_loadu_ps
|
||||
#define GGML_F32x16_STORE _mm512_storeu_ps
|
||||
// _mm512_fmadd_ps is defined in AVX512F so no guard is required
|
||||
#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
|
||||
#define GGML_F32x16_ADD _mm512_add_ps
|
||||
#define GGML_F32x16_MUL _mm512_mul_ps
|
||||
#define GGML_F32x16_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
res = _mm512_reduce_add_ps(x[0]); \
|
||||
} while (0)
|
||||
|
||||
// TODO: is this optimal ?
|
||||
|
||||
#define GGML_F32_VEC GGML_F32x16
|
||||
#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
|
||||
#define GGML_F32_VEC_SET1 GGML_F32x16_SET1
|
||||
#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
|
||||
#define GGML_F32_VEC_STORE GGML_F32x16_STORE
|
||||
#define GGML_F32_VEC_FMA GGML_F32x16_FMA
|
||||
#define GGML_F32_VEC_ADD GGML_F32x16_ADD
|
||||
#define GGML_F32_VEC_MUL GGML_F32x16_MUL
|
||||
#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
|
||||
|
||||
// F16 AVX512
|
||||
|
||||
// F16 AVX
|
||||
|
||||
#define GGML_F16_STEP 64
|
||||
#define GGML_F16_EPR 16
|
||||
|
||||
// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
|
||||
|
||||
#define GGML_F32Cx16 __m512
|
||||
#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
|
||||
#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
|
||||
|
||||
// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
|
||||
// so F16C guard isn't required
|
||||
#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
|
||||
#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
|
||||
|
||||
#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
|
||||
#define GGML_F32Cx16_ADD _mm512_add_ps
|
||||
#define GGML_F32Cx16_MUL _mm512_mul_ps
|
||||
#define GGML_F32Cx16_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
res = _mm512_reduce_add_ps(x[0]); \
|
||||
} while (0)
|
||||
|
||||
#define GGML_F16_VEC GGML_F32Cx16
|
||||
#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
|
||||
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
|
||||
#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
|
||||
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
|
||||
|
||||
#elif defined(__AVX__)
|
||||
|
||||
#define GGML_SIMD
|
||||
@@ -2532,9 +2638,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
||||
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
ggml_init_cublas();
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
ggml_cl_init();
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
ggml_vk_init_cpu_assist();
|
||||
@@ -10997,7 +11101,6 @@ static void ggml_compute_forward_out_prod_f32(
|
||||
// nb01 >= nb00 - src0 is not transposed
|
||||
// compute by src0 rows
|
||||
|
||||
// TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
|
||||
// TODO: #if defined(GGML_USE_CLBLAST)
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
@@ -11197,7 +11300,6 @@ static void ggml_compute_forward_out_prod_q_f32(
|
||||
// nb01 >= nb00 - src0 is not transposed
|
||||
// compute by src0 rows
|
||||
|
||||
// TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
|
||||
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_INIT) {
|
||||
@@ -12418,6 +12520,8 @@ static void ggml_compute_forward_alibi(
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
case GGML_TYPE_I32:
|
||||
case GGML_TYPE_I64:
|
||||
case GGML_TYPE_F64:
|
||||
case GGML_TYPE_COUNT:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
@@ -12504,6 +12608,8 @@ static void ggml_compute_forward_clamp(
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
case GGML_TYPE_I32:
|
||||
case GGML_TYPE_I64:
|
||||
case GGML_TYPE_F64:
|
||||
case GGML_TYPE_COUNT:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
@@ -15939,14 +16045,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
|
||||
if (skip_cpu) {
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
|
||||
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
#if defined(GGML_USE_VULKAN)
|
||||
const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
if (skip_cpu) {
|
||||
@@ -15958,7 +16057,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
}
|
||||
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
|
||||
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#endif // GGML_USE_VULKAN
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
|
||||
|
||||
2
ggml.h
2
ggml.h
@@ -366,6 +366,8 @@ extern "C" {
|
||||
GGML_TYPE_I8 = 24,
|
||||
GGML_TYPE_I16 = 25,
|
||||
GGML_TYPE_I32 = 26,
|
||||
GGML_TYPE_I64 = 27,
|
||||
GGML_TYPE_F64 = 28,
|
||||
GGML_TYPE_COUNT,
|
||||
};
|
||||
|
||||
|
||||
@@ -42,6 +42,7 @@ class Keys:
|
||||
EXPERT_COUNT = "{arch}.expert_count"
|
||||
EXPERT_USED_COUNT = "{arch}.expert_used_count"
|
||||
POOLING_TYPE = "{arch}.pooling_type"
|
||||
LOGIT_SCALE = "{arch}.logit_scale"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "{arch}.attention.head_count"
|
||||
@@ -121,6 +122,7 @@ class MODEL_ARCH(IntEnum):
|
||||
GEMMA = auto()
|
||||
STARCODER2 = auto()
|
||||
MAMBA = auto()
|
||||
COMMAND_R = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
@@ -187,6 +189,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.GEMMA: "gemma",
|
||||
MODEL_ARCH.STARCODER2: "starcoder2",
|
||||
MODEL_ARCH.MAMBA: "mamba",
|
||||
MODEL_ARCH.COMMAND_R: "command-r",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
@@ -579,6 +582,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.SSM_D,
|
||||
MODEL_TENSOR.SSM_OUT,
|
||||
],
|
||||
MODEL_ARCH.COMMAND_R: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
@@ -665,6 +680,8 @@ class GGMLQuantizationType(IntEnum):
|
||||
I8 = 24
|
||||
I16 = 25
|
||||
I32 = 26
|
||||
I64 = 27
|
||||
F64 = 28
|
||||
|
||||
|
||||
class GGUFEndian(IntEnum):
|
||||
@@ -734,6 +751,8 @@ GGML_QUANT_SIZES = {
|
||||
GGMLQuantizationType.I8: (1, 1),
|
||||
GGMLQuantizationType.I16: (1, 2),
|
||||
GGMLQuantizationType.I32: (1, 4),
|
||||
GGMLQuantizationType.I64: (1, 8),
|
||||
GGMLQuantizationType.F64: (1, 8),
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -242,12 +242,15 @@ class GGUFReader:
|
||||
n_bytes = n_elems * type_size // block_size
|
||||
data_offs = int(start_offs + offset_tensor[0])
|
||||
item_type: npt.DTypeLike
|
||||
if ggml_type == GGMLQuantizationType.F32:
|
||||
item_count = n_elems
|
||||
item_type = np.float32
|
||||
elif ggml_type == GGMLQuantizationType.F16:
|
||||
if ggml_type == GGMLQuantizationType.F16:
|
||||
item_count = n_elems
|
||||
item_type = np.float16
|
||||
elif ggml_type == GGMLQuantizationType.F32:
|
||||
item_count = n_elems
|
||||
item_type = np.float32
|
||||
elif ggml_type == GGMLQuantizationType.F64:
|
||||
item_count = n_elems
|
||||
item_type = np.float64
|
||||
elif ggml_type == GGMLQuantizationType.I8:
|
||||
item_count = n_elems
|
||||
item_type = np.int8
|
||||
@@ -257,6 +260,9 @@ class GGUFReader:
|
||||
elif ggml_type == GGMLQuantizationType.I32:
|
||||
item_count = n_elems
|
||||
item_type = np.int32
|
||||
elif ggml_type == GGMLQuantizationType.I64:
|
||||
item_count = n_elems
|
||||
item_type = np.int64
|
||||
else:
|
||||
item_count = n_bytes
|
||||
item_type = np.uint8
|
||||
|
||||
@@ -204,18 +204,22 @@ class GGUFWriter:
|
||||
for i in range(n_dims):
|
||||
self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
|
||||
if raw_dtype is None:
|
||||
if tensor_dtype == np.float32:
|
||||
dtype = GGMLQuantizationType.F32
|
||||
elif tensor_dtype == np.float16:
|
||||
if tensor_dtype == np.float16:
|
||||
dtype = GGMLQuantizationType.F16
|
||||
elif tensor_dtype == np.float32:
|
||||
dtype = GGMLQuantizationType.F32
|
||||
elif tensor_dtype == np.float64:
|
||||
dtype = GGMLQuantizationType.F64
|
||||
elif tensor_dtype == np.int8:
|
||||
dtype = GGMLQuantizationType.I8
|
||||
elif tensor_dtype == np.int16:
|
||||
dtype = GGMLQuantizationType.I16
|
||||
elif tensor_dtype == np.int32:
|
||||
dtype = GGMLQuantizationType.I32
|
||||
elif tensor_dtype == np.int64:
|
||||
dtype = GGMLQuantizationType.I64
|
||||
else:
|
||||
raise ValueError("Only F32, F16, I8, I16, I32 tensors are supported for now")
|
||||
raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
|
||||
else:
|
||||
dtype = raw_dtype
|
||||
self.ti_data += self._pack("I", dtype)
|
||||
@@ -357,6 +361,9 @@ class GGUFWriter:
|
||||
def add_clamp_kqv(self, value: float) -> None:
|
||||
self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)
|
||||
|
||||
def add_logit_scale(self, value: float) -> None:
|
||||
self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
|
||||
|
||||
def add_expert_count(self, count: int) -> None:
|
||||
self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
|
||||
|
||||
|
||||
443
llama.cpp
443
llama.cpp
@@ -214,6 +214,7 @@ enum llm_arch {
|
||||
LLM_ARCH_GEMMA,
|
||||
LLM_ARCH_STARCODER2,
|
||||
LLM_ARCH_MAMBA,
|
||||
LLM_ARCH_COMMAND_R,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -243,6 +244,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_GEMMA, "gemma" },
|
||||
{ LLM_ARCH_STARCODER2, "starcoder2" },
|
||||
{ LLM_ARCH_MAMBA, "mamba" },
|
||||
{ LLM_ARCH_COMMAND_R, "command-r" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
@@ -268,6 +270,7 @@ enum llm_kv {
|
||||
LLM_KV_EXPERT_COUNT,
|
||||
LLM_KV_EXPERT_USED_COUNT,
|
||||
LLM_KV_POOLING_TYPE,
|
||||
LLM_KV_LOGIT_SCALE,
|
||||
|
||||
LLM_KV_ATTENTION_HEAD_COUNT,
|
||||
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
||||
@@ -332,6 +335,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
|
||||
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
||||
{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
|
||||
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
||||
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
||||
@@ -536,6 +540,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output"},
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
@@ -838,6 +843,21 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_COMMAND_R,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
@@ -1597,6 +1617,7 @@ enum e_model {
|
||||
MODEL_20B,
|
||||
MODEL_30B,
|
||||
MODEL_34B,
|
||||
MODEL_35B,
|
||||
MODEL_40B,
|
||||
MODEL_65B,
|
||||
MODEL_70B,
|
||||
@@ -1643,6 +1664,7 @@ struct llama_hparams {
|
||||
|
||||
float f_clamp_kqv = 0.0f;
|
||||
float f_max_alibi_bias = 0.0f;
|
||||
float f_logit_scale = 0.0f;
|
||||
|
||||
bool causal_attn = true;
|
||||
bool need_kq_pos = false;
|
||||
@@ -1873,6 +1895,31 @@ struct llama_kv_cache {
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_control_vector {
|
||||
std::vector<struct ggml_tensor *> tensors; // per layer
|
||||
std::vector<struct ggml_context *> ctxs;
|
||||
std::vector<ggml_backend_buffer_t> bufs;
|
||||
|
||||
int32_t layer_start = -1;
|
||||
int32_t layer_end = -1;
|
||||
|
||||
ggml_tensor * tensor_for(int il) const {
|
||||
if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
|
||||
return nullptr;
|
||||
}
|
||||
return tensors[il];
|
||||
}
|
||||
|
||||
~llama_control_vector() {
|
||||
for (struct ggml_context * ctx : ctxs) {
|
||||
ggml_free(ctx);
|
||||
}
|
||||
for (ggml_backend_buffer_t buf : bufs) {
|
||||
ggml_backend_buffer_free(buf);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_vocab {
|
||||
using id = int32_t;
|
||||
using token = std::string;
|
||||
@@ -1994,6 +2041,11 @@ struct llama_model {
|
||||
ggml_free(ctx);
|
||||
}
|
||||
for (ggml_backend_buffer_t buf : bufs) {
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
|
||||
ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
|
||||
}
|
||||
#endif
|
||||
ggml_backend_buffer_free(buf);
|
||||
}
|
||||
}
|
||||
@@ -2087,6 +2139,9 @@ struct llama_context {
|
||||
struct ggml_tensor * inp_s_mask; // F32 [1, kv_size]
|
||||
struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]
|
||||
|
||||
// control vectors
|
||||
struct llama_control_vector cvec;
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_context * ctx_mpi = NULL;
|
||||
#endif
|
||||
@@ -3231,6 +3286,7 @@ static const char * llama_model_type_name(e_model type) {
|
||||
case MODEL_20B: return "20B";
|
||||
case MODEL_30B: return "30B";
|
||||
case MODEL_34B: return "34B";
|
||||
case MODEL_35B: return "35B";
|
||||
case MODEL_40B: return "40B";
|
||||
case MODEL_65B: return "65B";
|
||||
case MODEL_70B: return "70B";
|
||||
@@ -3623,6 +3679,15 @@ static void llm_load_hparams(
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_COMMAND_R:
|
||||
{
|
||||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 40: model.type = e_model::MODEL_35B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: (void)0;
|
||||
}
|
||||
|
||||
@@ -3944,6 +4009,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
|
||||
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
|
||||
LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
|
||||
LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
|
||||
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
||||
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
|
||||
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
|
||||
@@ -4235,9 +4301,9 @@ static bool llm_load_tensors(
|
||||
{
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
||||
if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
||||
} else {
|
||||
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
||||
if (!model.output) {
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
||||
ml.n_created--; // artificial tensor
|
||||
ml.size_data += ggml_nbytes(model.output);
|
||||
@@ -4442,10 +4508,12 @@ static bool llm_load_tensors(
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
|
||||
|
||||
// same as tok_embd, duplicated to allow offloading
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
ml.n_created--; // artificial tensor
|
||||
ml.size_data += ggml_nbytes(model.output);
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
||||
if (!model.output) {
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
||||
ml.n_created--; // artificial tensor
|
||||
ml.size_data += ggml_nbytes(model.output);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
@@ -4918,6 +4986,37 @@ static bool llm_load_tensors(
|
||||
layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_COMMAND_R:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
|
||||
// output
|
||||
{
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
// init output from the input tok embed
|
||||
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
ml.n_created--; // artificial tensor
|
||||
ml.size_data += ggml_nbytes(model.output);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
|
||||
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
||||
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
||||
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
|
||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
@@ -4942,6 +5041,13 @@ static bool llm_load_tensors(
|
||||
size_t first, last;
|
||||
ml.get_mapping_range(&first, &last, ctx);
|
||||
buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
if (n_layer >= n_gpu_layers) {
|
||||
ggml_backend_cuda_register_host_buffer(
|
||||
ggml_backend_buffer_get_base(buf),
|
||||
ggml_backend_buffer_get_size(buf));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#ifdef GGML_USE_METAL
|
||||
else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
|
||||
@@ -5064,6 +5170,16 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
|
||||
ggml_backend_sycl_set_single_device_mode(params.main_gpu);
|
||||
//SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index.
|
||||
params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
|
||||
} else {
|
||||
ggml_backend_sycl_set_mul_device_mode();
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!llm_load_tensors(
|
||||
ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
|
||||
params.progress_callback, params.progress_callback_user_data
|
||||
@@ -5858,6 +5974,12 @@ struct llm_build_context {
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
||||
if (layer_dir != nullptr) {
|
||||
cur = ggml_add(ctx0, cur, layer_dir);
|
||||
}
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
@@ -5893,7 +6015,7 @@ struct llm_build_context {
|
||||
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
@@ -5943,7 +6065,6 @@ struct llm_build_context {
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, NULL,
|
||||
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
@@ -8125,7 +8246,6 @@ struct llm_build_context {
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
@@ -8305,6 +8425,121 @@ struct llm_build_context {
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_command_r() {
|
||||
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
const float f_logit_scale = hparams.f_logit_scale;
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
struct ggml_tensor * ffn_inp = cur;
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * attn_out = cur;
|
||||
|
||||
// feed-forward network
|
||||
{
|
||||
cur = llm_build_ffn(ctx0, ffn_inp,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
||||
// add together residual + FFN + self-attention
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
cur = ggml_add(ctx0, cur, attn_out);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
|
||||
if (f_logit_scale) {
|
||||
cur = ggml_scale(ctx0, cur, f_logit_scale);
|
||||
}
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
||||
@@ -8380,12 +8615,15 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
}
|
||||
|
||||
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
|
||||
// to fix this, we assign the norm layer manually to the backend of its layer
|
||||
if (il != -1 && strcmp(name, "norm") == 0) {
|
||||
for (auto * backend : lctx.backends) {
|
||||
if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
|
||||
ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
|
||||
break;
|
||||
// FIXME: fix in ggml_backend_sched
|
||||
const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
|
||||
if (batch.n_tokens < 32 || full_offload) {
|
||||
if (il != -1 && strcmp(name, "norm") == 0) {
|
||||
for (auto * backend : lctx.backends) {
|
||||
if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
|
||||
ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -8487,6 +8725,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_mamba();
|
||||
} break;
|
||||
case LLM_ARCH_COMMAND_R:
|
||||
{
|
||||
result = llm.build_command_r();
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
@@ -11977,7 +12219,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
return new_type;
|
||||
}
|
||||
|
||||
static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
||||
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
||||
std::mutex mutex;
|
||||
int counter = 0;
|
||||
size_t new_size = 0;
|
||||
@@ -12882,27 +13124,25 @@ struct llama_context * llama_new_context_with_model(
|
||||
ctx->backends.push_back(ctx->backend_metal);
|
||||
}
|
||||
#elif defined(GGML_USE_CUBLAS)
|
||||
if (model->n_gpu_layers > 0) {
|
||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
||||
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
} else {
|
||||
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
|
||||
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
|
||||
ggml_backend_t backend = ggml_backend_cuda_init(device);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
} else {
|
||||
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
|
||||
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
|
||||
ggml_backend_t backend = ggml_backend_cuda_init(device);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
}
|
||||
}
|
||||
}
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
@@ -12921,23 +13161,22 @@ struct llama_context * llama_new_context_with_model(
|
||||
if (model->n_gpu_layers > 0) {
|
||||
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
|
||||
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
|
||||
ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
|
||||
int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
} else {
|
||||
// LLAMA_SPLIT_LAYER requires a backend for each GPU
|
||||
int id_list[GGML_SYCL_MAX_DEVICES];
|
||||
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
||||
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
||||
int device_id = id_list[i];
|
||||
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
|
||||
int id_list[GGML_SYCL_MAX_DEVICES];
|
||||
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
@@ -13061,14 +13300,17 @@ struct llama_context * llama_new_context_with_model(
|
||||
ggml_backend_t backend = ctx->backends[i];
|
||||
ggml_backend_buffer_type_t buft = backend_buft[i];
|
||||
size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
|
||||
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
||||
ggml_backend_buft_name(buft),
|
||||
size / 1024.0 / 1024.0);
|
||||
if (size > 1) {
|
||||
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
||||
ggml_backend_buft_name(buft),
|
||||
size / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
|
||||
// note: the number of splits during measure is higher than during inference due to the kv shift
|
||||
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
|
||||
LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits);
|
||||
LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
|
||||
LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13138,6 +13380,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_ORION:
|
||||
case LLM_ARCH_INTERNLM2:
|
||||
case LLM_ARCH_MINICPM:
|
||||
case LLM_ARCH_COMMAND_R:
|
||||
return LLAMA_ROPE_TYPE_NORM;
|
||||
|
||||
// the pairs of head values are offset by n_rot/2
|
||||
@@ -13174,6 +13417,10 @@ int32_t llama_n_embd(const struct llama_model * model) {
|
||||
return model->hparams.n_embd;
|
||||
}
|
||||
|
||||
int32_t llama_n_layer(const struct llama_model * model) {
|
||||
return model->hparams.n_layer;
|
||||
}
|
||||
|
||||
float llama_rope_freq_scale_train(const struct llama_model * model) {
|
||||
return model->hparams.rope_freq_scale_train;
|
||||
}
|
||||
@@ -13273,6 +13520,96 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
|
||||
}
|
||||
}
|
||||
|
||||
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
|
||||
GGML_ASSERT(cvec.tensors.empty());
|
||||
GGML_ASSERT(cvec.ctxs.empty());
|
||||
GGML_ASSERT(cvec.bufs.empty());
|
||||
|
||||
// count layer buffer types
|
||||
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
|
||||
for (int64_t i = 0; i < model.hparams.n_layer; i++) {
|
||||
buft_layer_count[model.buft_layer[i].buft]++;
|
||||
}
|
||||
|
||||
// allocate contexts
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
||||
for (auto & it : buft_layer_count) {
|
||||
int n_layers = it.second;
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ n_layers * ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_context * ctx = ggml_init(params);
|
||||
if (!ctx) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
ctx_map[it.first] = ctx;
|
||||
}
|
||||
|
||||
// make tensors
|
||||
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
|
||||
for (size_t il = 1; il < model.hparams.n_layer; il++) {
|
||||
struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
|
||||
ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
|
||||
cvec.tensors.push_back(tensor);
|
||||
}
|
||||
|
||||
// allocate tensors / buffers and zero
|
||||
for (auto it : ctx_map) {
|
||||
ggml_backend_buffer_type_t buft = it.first;
|
||||
ggml_context * ctx = it.second;
|
||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
||||
if (!buf) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
|
||||
return false;
|
||||
}
|
||||
ggml_backend_buffer_clear(buf, 0);
|
||||
cvec.ctxs.push_back(ctx);
|
||||
cvec.bufs.push_back(buf);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
|
||||
const llama_model & model = lctx->model;
|
||||
llama_control_vector & cvec = lctx->cvec;
|
||||
|
||||
if (data == nullptr) {
|
||||
// disable the current control vector (but leave allocated for later)
|
||||
cvec.layer_start = -1;
|
||||
cvec.layer_end = -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (n_embd != (int) model.hparams.n_embd) {
|
||||
LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (cvec.tensors.empty()) {
|
||||
if (!llama_control_vector_init(cvec, model)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
cvec.layer_start = il_start;
|
||||
cvec.layer_end = il_end;
|
||||
|
||||
for (size_t il = 1; il < model.hparams.n_layer; il++) {
|
||||
assert(cvec.tensors[il] != nullptr);
|
||||
|
||||
const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
|
||||
if (off + n_embd <= len) {
|
||||
ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
|
||||
struct llama_kv_cache_view result = {
|
||||
/*.n_cells = */ 0,
|
||||
@@ -14242,6 +14579,26 @@ static int32_t llama_chat_apply_template_internal(
|
||||
if (add_ass) {
|
||||
ss << "<start_of_turn>model\n";
|
||||
}
|
||||
} else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) {
|
||||
// OrionStarAI/Orion-14B-Chat
|
||||
std::string system_prompt = "";
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
if (role == "system") {
|
||||
// there is no system message support, we will merge it with user prompt
|
||||
system_prompt = message->content;
|
||||
continue;
|
||||
} else if (role == "user") {
|
||||
ss << "Human: ";
|
||||
if (!system_prompt.empty()) {
|
||||
ss << system_prompt << "\n\n";
|
||||
system_prompt = "";
|
||||
}
|
||||
ss << message->content << "\n\nAssistant: </s>";
|
||||
} else {
|
||||
ss << message->content << "</s>";
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// template not supported
|
||||
return -1;
|
||||
|
||||
23
llama.h
23
llama.h
@@ -388,6 +388,7 @@ extern "C" {
|
||||
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
|
||||
|
||||
// Get the model's RoPE frequency scaling factor
|
||||
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
|
||||
@@ -435,10 +436,24 @@ extern "C" {
|
||||
// Returns 0 on success
|
||||
LLAMA_API int32_t llama_model_apply_lora_from_file(
|
||||
const struct llama_model * model,
|
||||
const char * path_lora,
|
||||
float scale,
|
||||
const char * path_base_model,
|
||||
int32_t n_threads);
|
||||
const char * path_lora,
|
||||
float scale,
|
||||
const char * path_base_model,
|
||||
int32_t n_threads);
|
||||
|
||||
// Apply a loaded control vector to a llama_context, or if data is NULL, clear
|
||||
// the currently loaded vector.
|
||||
// n_embd should be the size of a single layer's control, and data should point
|
||||
// to an n_embd x n_layers buffer starting from layer 1.
|
||||
// il_start and il_end are the layer range the vector should apply to (both inclusive)
|
||||
// See llama_control_vector_load in common to load a control vector.
|
||||
LLAMA_API int32_t llama_control_vector_apply(
|
||||
struct llama_context * lctx,
|
||||
const float * data,
|
||||
size_t len,
|
||||
int32_t n_embd,
|
||||
int32_t il_start,
|
||||
int32_t il_end);
|
||||
|
||||
//
|
||||
// KV cache
|
||||
|
||||
@@ -31,6 +31,8 @@ int main(void) {
|
||||
"{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
|
||||
// google/gemma-7b-it
|
||||
"{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
|
||||
// OrionStarAI/Orion-14B-Chat
|
||||
"{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
|
||||
};
|
||||
std::vector<std::string> expected_output = {
|
||||
// teknium/OpenHermes-2.5-Mistral-7B
|
||||
@@ -45,6 +47,8 @@ int main(void) {
|
||||
"system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n I am an assistant </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
|
||||
// google/gemma-7b-it
|
||||
"<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
|
||||
// OrionStarAI/Orion-14B-Chat
|
||||
"Human: You are a helpful assistant\n\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s> I am an assistant </s>Human: Another question\n\nAssistant: </s>",
|
||||
};
|
||||
std::vector<char> formatted_chat(1024);
|
||||
int32_t res;
|
||||
|
||||
Reference in New Issue
Block a user